While tidying up recently, I found that the Scala statistics jobs I wrote earlier have been retired, but a lot of the syntax in them took quite a bit of documentation digging at the time.
For example, active-user statistics:
package com.adups.stats.offline
import com.adups.base.Utility.spark
import com.adups.bean.impl.StatDeviceActiveUsers
import com.adups.config.HdfsPath
import com.adups.stats.online.OtaAppLogStats.{fileSystem, logger, withConnection}
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.functions.{countDistinct, lit}
/**
* Created by Administrator on 2017/7/19 0019.
*/
object OtaAppLogOffStats {
def activeUsers(pt: String): Unit = {
import spark.implicits._
// val path = s"/user/kafka/OtaAppLog-test-env/OtaAppLogPt/pt=$pt"
val path = HdfsPath.otaAppLogPath + s"/pt=$pt"
if (fileSystem.exists(new Path(path))) {
logger.error("开始统计StatDevice表中激活用户,数据源来自ota_applog表:"+this.getClass)
val df = spark.read.parquet(path).select($"productId", $"deviceId")
val f = df.groupBy("productId")
  .agg(countDistinct($"deviceId") as "count")
  .withColumn("pt", lit(pt))
  .as[StatDeviceActiveUsers]
f.foreachPartition(iter => {
withConnection { connection =>
val statement = connection.createStatement
iter.foreach { t => statement.executeUpdate(t.mergerStatements) }
}
})
logger.error("完成统计StatDevice表中激活用户,数据源来自ota_applog表:"+this.getClass)
}
}
def runAll(pt:String): Unit ={
activeUsers(pt)
}
}
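The job above leans on two pieces that are not shown here: the StatDeviceActiveUsers bean, whose mergerStatements builds the upsert SQL executed per partition, and the withConnection helper borrowed from OtaAppLogStats. Here is a minimal sketch of what they might look like; the table name, column names, JDBC URL and credentials are assumptions, not the project's real definitions.

import java.sql.{Connection, DriverManager}

object StatsSketch {
  // Illustrative only: the real bean lives in com.adups.bean.impl.
  case class StatDeviceActiveUsers(productId: Long, count: Long, pt: String) {
    // Upsert one row per (product_id, pt); table and column names are assumed.
    def mergerStatements: String =
      s"INSERT INTO stat_device (product_id, active_users, pt) VALUES ($productId, $count, '$pt') " +
        s"ON DUPLICATE KEY UPDATE active_users = $count"
  }

  // Loan-pattern helper: open a JDBC connection, pass it to f, always close it.
  def withConnection[A](f: Connection => A): A = {
    val connection = DriverManager.getConnection("jdbc:mysql://host:3306/stats", "user", "password")
    try f(connection) finally connection.close()
  }
}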
And, for example, using Structured Streaming to consume from Kafka and sync the data to HDFS:
package com.adups.syncer.base
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime}
import org.apache.spark.sql.types.StructType
/**
* Created by Administrator on 2017/6/7.
*/
abstract class KKHDFS[T](val dataFrame: Dataset[T]) {
def allFieldNames:List[String]
def schema:StructType
def jsonName:String
def timeFieldName:String
def extractJson() = {
dataFrame.select(from_json(dataFrame("value").cast("string"),schema) as jsonName)
// dataFrame.printSchema()
}
def allColumnsWithPrefix={
allFieldNames.map(s => dataFrame(jsonName + "." + s))
}
def selectAllColumns={
dataFrame.select(allColumnsWithPrefix:_*)
}
def allColumns={
allFieldNames.map(s => dataFrame(s))
}
def addPt={
val ptColumn = date_format(dataFrame(timeFieldName).cast("timestamp"),"yyyy-MM-dd") as "pt"
val g = allColumns:+ ptColumn
dataFrame.select(g:_*)
}
def outputParquet(outputDir:String,checkpointLocation:String,triggerTime:String)={
dataFrame.writeStream
.format("parquet")
.partitionBy("pt")
.option("path", outputDir)
.option("checkpointLocation", checkpointLocation)
.trigger(ProcessingTime(triggerTime))
.outputMode(OutputMode.Append)
.start()
}
def outputConsole()={
dataFrame.writeStream.format("console").outputMode(OutputMode.Append).start()
}
}
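KKHDFS is abstract: each Kafka topic gets a subclass that supplies the JSON schema, the field list, and the event-time column that addPt partitions on. A rough sketch of such a subclass for the ota_applog topic; the field list and schema here are illustrative, not the project's actual bean definition.

import com.adups.syncer.base.KKHDFS
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types._

class OtaAppLogHDFS[T](dataFrame: Dataset[T]) extends KKHDFS[T](dataFrame) {
  // Must match the JSON keys written by the Kafka producer (assumed fields).
  def allFieldNames: List[String] =
    List("product_id", "device_id", "ip", "version", "create_time")

  def schema: StructType = StructType(Seq(
    StructField("product_id", LongType),
    StructField("device_id", StringType),
    StructField("ip", StringType),
    StructField("version", StringType),
    StructField("create_time", StringType)
  ))

  def jsonName: String = "ota_applog"        // alias for the parsed JSON struct column
  def timeFieldName: String = "create_time"  // addPt derives the pt partition from this field
}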
package com.adups.syncer.base
import java.util.TimeZone
import com.adups.config.KafkaConfig
import grizzled.slf4j.Logger
import org.apache.spark.sql.{Dataset, SparkSession}
import com.adups.base.Utility._
/**
* Created by Administrator on 2017/7/11 0011.
*/
abstract class SYNCProcess(val config:KafkaConfig) {
@transient lazy val logger = Logger[this.type]
implicit def KKHDFS[T](dataFrame: Dataset[T]):KKHDFS[T]
val KafkaConfig(bootstrap,topic)=config
def createKafkaStream()(implicit spark:SparkSession) ={
spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", bootstrap)
.option("subscribe", topic)
.option("startingOffsets", "earliest")
.option("maxPartitions", 10)
.option("kafkaConsumer.pollTimeoutMs", 512)
.option("failOnDataLoss", false).load()
}
def run(customTriggerTime: String, outputPath: String, checkpointLocation: String) {
logger.error("Begin Running Spark Stream")
while(true) {
try {
val query = createKafkaStream()
.extractJson
.selectAllColumns
.addPt
.outputParquet(outputPath, checkpointLocation, customTriggerTime)
query.awaitTermination()
} catch {
case ex:Throwable =>
ex.printStackTrace()
logger.error("spark stream:" + ex.getMessage())
}
}
}
def run(){
logger.info("Begin Running Spark Stream")
while(true) {
try {
val query = createKafkaStream()
.extractJson
.selectAllColumns
.addPt
.outputConsole
query.awaitTermination()
} catch {
case ex:Throwable => logger.error("spark stream:" + ex.getMessage())
}
}
}
def main(args: Array[String]): Unit = {
def errorRemind() {
System.err.println("You arguments were " + args.mkString("[", ", ", "]"))
System.err.println(
"""
|Usage: com.adups.CPStatistical <intervalNum> <intervalUnit> <outputPath> <checkpointLocation>.
| <intervalNum> must be an integer
| <intervalUnit> minutes, seconds, etc.
| <outputPath>
| <checkpointLocation>
|
""".
stripMargin
)
System.exit(1)
}
if (args.length == 1 && args(0) == "console") run()
else if (args.length != 4) errorRemind()
else{
val Array(intervalNum, intervalUnit,outputPath, checkpointLocation) = args
val timeUnitSet = Set("minutes", "seconds", "hours", "days")
try {
intervalNum.toInt
val lowerIntervalUnit = intervalUnit.toLowerCase
if (!timeUnitSet(lowerIntervalUnit))
throw new Throwable("Error Unit")
println(s"$intervalNum $intervalUnit")
run(s"$intervalNum $intervalUnit",outputPath, checkpointLocation)
} catch {
case ex: Throwable => {
logger.error("Parameters Error:" + ex)
errorRemind()
}
}
}
}
}
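SYNCProcess leaves two things open: the KafkaConfig passed to the constructor (destructured into bootstrap and topic above) and the implicit Dataset-to-KKHDFS conversion that lets the extractJson / selectAllColumns / addPt chain in run() compile. A concrete syncer might look roughly like this; the object name, broker address, and OtaAppLogHDFS (the sketch subclass above) are assumptions.

import com.adups.config.KafkaConfig
import com.adups.syncer.base.{KKHDFS, SYNCProcess}
import org.apache.spark.sql.Dataset

object OtaAppLogSyncer extends SYNCProcess(KafkaConfig("broker1:9092", "ota_applog")) {
  // Wrap every intermediate Dataset so the fluent chain in run() keeps resolving to KKHDFS methods.
  implicit def KKHDFS[T](dataFrame: Dataset[T]): KKHDFS[T] = new OtaAppLogHDFS(dataFrame)
}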
package com.adups.produce
import java.sql.{ResultSet, Array => _}
import com.adups.config.QueueConfig
import com.adups.produce.base.Producer
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._
object OtaAppLogProducer extends Producer(QueueConfig.otaAppLogConf,"ota_applog"){
def packResult(rs:ResultSet):(String,Any)={
val id = rs.getLong("id")
val json = ("product_id" -> rs.getLong("product_id"))~
("device_id" -> rs.getString("device_id")) ~
("mid" -> rs.getString("mid")) ~
("ip" -> rs.getString("ip"))~
("version" -> rs.getString("version"))~
("continent_en" -> rs.getString("continent_en"))~
("continent_zh" -> rs.getString("continent_zh"))~
("country_en" -> rs.getString("country_en"))~
("country_zh" -> rs.getString("country_zh"))~
("province_en" -> rs.getString("province_en"))~
("province_zh" -> rs.getString("province_zh"))~
("city_zh" -> rs.getString("city_zh"))~
("city_en" -> rs.getString("city_en"))~
("networkType" -> rs.getString("networkType"))~
("create_time" -> rs.getString("create_time"))
(compact(render(json)), id)
}
def main(args: Array[String]) {
sendDataFromMysqlWithID(1)
}
}
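For reference, the json4s DSL used in packResult chains key/value pairs into a JObject with the ~ operator, and compact(render(_)) serializes it into a single-line JSON string that becomes the Kafka message value. A tiny standalone example:

import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

object JsonDslExample extends App {
  val json = ("product_id" -> 42L) ~ ("device_id" -> "abc123") ~ ("ip" -> "10.0.0.1")
  println(compact(render(json)))  // prints {"product_id":42,"device_id":"abc123","ip":"10.0.0.1"}
}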
The full demo is on GitHub:
https://github.com/baifanwudi/ScalaStats.git