Spark Data Statistics (Java version)



Spark version 2.1.2; covers Dataset usage and Spark Streaming statistics (a minimal streaming sketch appears at the end of this post).

Project repository: https://github.com/baifanwudi/big-data-analysis


Code examples

SparkSql demo: read a JSON file and write it into Hive

package com.adups.offline.hive.log;

import com.adups.base.AbstractSparkSql;
import com.adups.config.FlumePath;
import com.adups.util.DateUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;



public class OtaAppLog extends AbstractSparkSql {

    private Logger logger = LoggerFactory.getLogger(OtaAppLog.class);

    @Override
    public void executeProgram(String pt, String path, SparkSession spark) throws IOException {

        int partitionNum = 4;
        String ptWithPre= DateUtil.pathPtWithPre(pt);
        String appLogPath= FlumePath.APP_LOG_PATH+ptWithPre;
        if(!existsPath(appLogPath)){
            return;
        }

        Dataset<Row> otaAppLog= spark.read().schema(produceSchema()).json(appLogPath).distinct().repartition(partitionNum);
        otaAppLog.createOrReplaceTempView("OtaAppLog");
        beforePartition(spark);
        String sql = "insert overwrite table ota_app_log partition(pt='"+pt+"') " +
                "select mid,ip,version,deviceId,productId,continentEn,continentZh,countryEn,countryZh,provinceEn,provinceZh,cityEn,cityZh," +
                "networktype,lac,cid,mcc,mnc,rxlev,num,goType,createTime,dataType from OtaAppLog";
        logger.warn("executing sql is :" + sql);
        spark.sql(sql);
    }

    public StructType produceSchema(){
        List<StructField> inputFields=new ArrayList<>();
        String splitSeq=",";
        String stringType="mid,ip,version,continentEn,continentZh,countryEn,countryZh,provinceEn,provinceZh," +
                "cityEn,cityZh,networktype,deviceId,lac,cid,mcc,mnc,rxlev,dataType";
        String timeType="createTime";
        String longType="productId";
        String integerType="num,goType";
        for(String stringTmp:stringType.split(splitSeq)){
            inputFields.add(DataTypes.createStructField(stringTmp,DataTypes.StringType,true));
        }
        inputFields.add(DataTypes.createStructField(timeType,DataTypes.TimestampType,false));
        for(String integerTmp:integerType.split(splitSeq)){
            inputFields.add(DataTypes.createStructField(integerTmp,DataTypes.IntegerType,true));
        }
        for(String longTmp:longType.split(splitSeq)){
            inputFields.add(DataTypes.createStructField(longTmp,DataTypes.LongType,false));
        }
        return DataTypes.createStructType(inputFields);
    }

    public static void main(String[] args) throws Exception {
        String pt= DateUtil.producePtOrYesterday(args);
        OtaAppLog otaAppLog =new OtaAppLog();
        otaAppLog.runAll(pt);
    }
}
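
OtaAppLog leans on a couple of helpers that are not listed in this post, notably DateUtil and the FlumePath constants. Below is a rough sketch of what DateUtil might look like, purely for orientation; the partition-path format and the yesterday fallback are assumptions, not the repository's code.

package com.adups.util;

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

/**
 * Hypothetical sketch of the date helpers used above; the real class may differ.
 */
public class DateUtil {

    private static final DateTimeFormatter FMT = DateTimeFormatter.ofPattern("yyyy-MM-dd");

    /** Today's date as yyyy-MM-dd, used as the pt partition value (assumed format). */
    public static String nowPtDay() {
        return LocalDate.now().format(FMT);
    }

    /** args[0] if a date was passed on the command line, otherwise yesterday (assumed behavior). */
    public static String producePtOrYesterday(String[] args) {
        if (args != null && args.length > 0) {
            return args[0];
        }
        return LocalDate.now().minusDays(1).format(FMT);
    }

    /** Partition suffix appended to an HDFS directory, e.g. "/pt=2017-10-11" (assumed format). */
    public static String pathPtWithPre(String pt) {
        return "/pt=" + pt;
    }
}

The shared base class AbstractSparkSql, which OtaAppLog extends, is listed next.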

package com.adups.base;


import com.adups.config.HiveConfig;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;

/**
 * @author allen
 * Created by allen on 04/08/2017.
 */
public abstract class AbstractSparkSql extends AbstractFileSystem {

    private  Logger logger = LoggerFactory.getLogger(this.getClass());

    /**
     * Runs the body of the Spark job.
     * @param pt partition date, formatted as pt=2017-10-11
     * @param path HDFS source path (may be null when no path check is needed)
     * @param spark the active SparkSession
     * @throws IOException if HDFS cannot be accessed
     */
    public abstract void  executeProgram(String pt,String path,SparkSession spark) throws IOException;

    public boolean existsPath(String... pathList) throws IOException {
        for (String path : pathList) {
            if (!fileSystem.exists(new Path(path))) {
                logger.error(" the path:" + path + " is not existed");
                return false;
            }else{
                logger.warn("executing the path is : " + path);
            }
        }
        return true;
    }

    public void runAll(String pt,String path,Boolean isHiveSupport) throws IOException {
        if(path!=null && !existsPath(path)) {
            logger.error("the src path is not existed:" + path);
            return;
        }
        executeSpark(pt,path,isHiveSupport);
    }

    /**
     * No path check; Hive support is enabled by default.
     */
    public void runAll(String pt) throws IOException {
        runAll(pt,null,true);
    }

    public void runAll(String pt,String path) throws IOException {
        runAll(pt,path,true);
    }

    public void runAll(String pt,Boolean isHiveSupport) throws IOException {
        runAll(pt,null,isHiveSupport);
    }

    private void executeSpark(String pt,String path,Boolean isHiveSupport) throws IOException {

        SparkSession spark ;
        String appName=this.getClass().getSimpleName();
        if(isHiveSupport) {
            spark = SparkSession.builder().appName(appName).enableHiveSupport().getOrCreate();
            logger.info("spark enable hive, begin to execute the program");
            useDataBase(spark);
        }else{
            spark = SparkSession.builder().appName(appName).getOrCreate();
            logger.info("spark begin to execute the program");
        }
        executeProgram(pt,path,spark);
        logger.info("spark has finished the program ");
    }

    private void useDataBase(SparkSession spark){
        logger.info("before the sql : "+HiveConfig.SQL_DATABASE );
        spark.sql(HiveConfig.SQL_DATABASE);
    }

    public void beforePartition(SparkSession spark){
        spark.sql(HiveConfig.HIVE_PARTITION);
    }
}
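
AbstractSparkSql extends AbstractFileSystem, which is not listed in this post but supplies the fileSystem handle used by existsPath(). A minimal sketch of what that base class might look like, assuming it only wires up a Hadoop FileSystem from the cluster configuration:

package com.adups.base;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import java.io.IOException;

/**
 * Hypothetical sketch: exposes a Hadoop FileSystem handle so subclasses
 * can check whether HDFS paths exist before running a job.
 */
public abstract class AbstractFileSystem {

    /** Shared handle used by existsPath() in AbstractSparkSql. */
    protected FileSystem fileSystem;

    public AbstractFileSystem() {
        try {
            // Picks up core-site.xml / hdfs-site.xml from the classpath.
            fileSystem = FileSystem.get(new Configuration());
        } catch (IOException e) {
            throw new IllegalStateException("cannot initialize HDFS FileSystem", e);
        }
    }
}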

Spark demo: compute device and area statistics and write the results to MySQL

package com.adups.online.flume;

import com.adups.base.AbstractSparkSql;
import com.adups.bean.out.DeviceArea;
import com.adups.config.OnlineOfflinePath;
import com.adups.common.ReadTable;
import com.adups.common.sql.flume.DeviceAreaOnlineSave;
import com.adups.util.CommonUtil;
import com.adups.util.DateUtil;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.collection.Seq;
import static org.apache.spark.sql.functions.*;
import java.io.IOException;

/**
 * @author allen
 * Created by allen on 03/08/2017.
 */
public class DeviceAreaOnline extends AbstractSparkSql {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    @Override
    public void executeProgram(String pt, String path, SparkSession spark) throws IOException {
        String prePath = DateUtil.pathPtWithPre(pt);
        String nowPt = DateUtil.nowPtDay();

        String beginTime = nowPt + " 00:00:00";
        String endTime = nowPt + " 23:59:59";

        String deviceTotal = OnlineOfflinePath.OFFLINE_DEVICE_NEW_TOTAL_PATH + prePath;
        String deviceAreaTotal = OnlineOfflinePath.OFFLINE_DEVICE_AREA_NEW_TOTAL_PATH + prePath;
        String originAreaPath = OnlineOfflinePath.ONLINE_DEVICE_AREA_NEW_TOTAL_PATH;

        if (!existsPath(deviceTotal, deviceAreaTotal)) {
            return;
        }

        String where = "(select product_id as productId,device_id as deviceId,country_zh as country,province_zh as province from iot_register.device_info " +
                "where create_time between '" + beginTime + "' and '" + endTime + "' ) as device_time_filter";

        Dataset<Row> todayDevice = new ReadTable().loadTable(spark, where).coalesce(1);
        Dataset<Row> yesterdayStats = spark.read().parquet(deviceTotal).select("productId", "totalNum");
        Dataset<Row> totalIncrement = todayDevice.groupBy("productId").agg(functions.countDistinct("deviceId").as("newNum"));

        Seq<String> seq = CommonUtil.columnNames("productId");
        Seq<String> naFillZero = CommonUtil.columnNames("newNum,totalNum");
        Dataset<Row> result = yesterdayStats.join(totalIncrement, seq, "outer").na().fill(0, naFillZero)
                .select(col("productId"), col("newNum"), col("newNum").plus(col("totalNum")).as("totalNum"))
                .withColumn("pt", lit(nowPt)).coalesce(1);

        Dataset<Row> yesterdayAreaStatistics = spark.read().parquet(deviceAreaTotal).select("productId", "country", "province", "totalNum").toDF();

        Dataset<Row> areaIncrement = todayDevice.groupBy("productId", "country", "province").agg(functions.countDistinct("deviceId").as("newNum"));

        seq = CommonUtil.columnNames("productId,country,province");
        Dataset<Row>  areaResult = yesterdayAreaStatistics.join(areaIncrement, seq, "outer").na().fill(0, naFillZero)
                .select(col("productId"), col("country"), col("province"), col("newNum"),
                        col("newNum").plus(col("totalNum")).as("totalNum")).withColumn("pt", lit(nowPt)).coalesce(1);

        Dataset<DeviceArea> deltaArea;

        if (existsPath(originAreaPath)) {
            try {
                Dataset<Row> originBase = spark.read().parquet(originAreaPath);
                deltaArea = areaResult.except(originBase).coalesce(1).as(new DeviceArea().produceBeanEncoder());
            } catch (Exception e) {
                logger.error(e.getMessage());
                deltaArea = areaResult.as(new DeviceArea().produceBeanEncoder());
            }
        } else {
            deltaArea = areaResult.as(new DeviceArea().produceBeanEncoder());
        }

        try {
            insertToMysql(deltaArea);
        } catch (Exception e) {
            logger.error(e.getMessage());
        }
        areaResult.write().mode("overwrite").format("parquet").save(originAreaPath);
        result.write().mode("overwrite").format("parquet").save(OnlineOfflinePath.ONLINE_DEVICE_NEW_TOTAL_PATH);

    }

    public void insertToMysql(Dataset<DeviceArea> dataSet) {
        dataSet.foreachPartition(data -> {
            String sql = "insert into stats_device_area(product_id,country,province,new_num,total_num,pt)" +
                    "values (?,?,?,?,?,?) on duplicate key update new_num=?,total_num=?";
            new DeviceAreaOnlineSave().putDataBatch(data, sql);
        });
    }

    public static void main(String[] args) throws IOException {
        String pt = DateUtil.producePtOrYesterday(args);
        DeviceAreaOnline deviceAreaOnline = new DeviceAreaOnline();
        deviceAreaOnline.runAll(pt, false);
    }
}
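
Two helpers used above, ReadTable and DeviceAreaOnlineSave, are not listed in this post. The sketches below show one plausible shape for each, assuming the former wraps Spark's built-in JDBC reader and the latter runs a batched PreparedStatement upsert; the connection settings and the DeviceArea getters are placeholders, not the repository's code.

package com.adups.common;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import java.util.Properties;

/**
 * Hypothetical sketch: loads a MySQL table, or a "(select ...) as alias"
 * subquery as used above, through Spark's JDBC data source.
 */
public class ReadTable {

    private static final String URL = "jdbc:mysql://127.0.0.1:3306/iot_register"; // assumed

    public Dataset<Row> loadTable(SparkSession spark, String table) {
        Properties props = new Properties();
        props.put("user", "user");               // assumed credentials
        props.put("password", "password");
        props.put("driver", "com.mysql.jdbc.Driver");
        return spark.read().jdbc(URL, table, props);
    }
}

package com.adups.common.sql.flume;

import com.adups.bean.out.DeviceArea;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.Iterator;

/**
 * Hypothetical sketch of the MySQL upsert helper called from foreachPartition;
 * the DeviceArea getters below are assumed from the column names.
 */
public class DeviceAreaOnlineSave {

    private static final String URL = "jdbc:mysql://127.0.0.1:3306/stats"; // assumed
    private static final String USER = "user";
    private static final String PASSWORD = "password";

    public void putDataBatch(Iterator<DeviceArea> data, String sql) throws Exception {
        try (Connection conn = DriverManager.getConnection(URL, USER, PASSWORD);
             PreparedStatement ps = conn.prepareStatement(sql)) {
            while (data.hasNext()) {
                DeviceArea row = data.next();
                ps.setObject(1, row.getProductId());
                ps.setString(2, row.getCountry());
                ps.setString(3, row.getProvince());
                ps.setObject(4, row.getNewNum());
                ps.setObject(5, row.getTotalNum());
                ps.setString(6, row.getPt());
                ps.setObject(7, row.getNewNum());   // on duplicate key update new_num=?
                ps.setObject(8, row.getTotalNum()); // total_num=?
                ps.addBatch();
            }
            ps.executeBatch();
        }
    }
}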

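The intro mentions Spark Streaming statistics, but no streaming job is listed in this post. Below is a minimal, self-contained counting sketch against the Spark 2.1.x streaming API; the socket source, the 10-second batch interval, and the key extraction are placeholders rather than the repository's code.

package com.adups.streaming;

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

/**
 * Hypothetical sketch: counts events per key in 10-second micro-batches
 * read from a socket source.
 */
public class StreamingCountSketch {

    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setAppName("StreamingCountSketch");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));

        // Placeholder source; the repository presumably reads from Flume or Kafka instead.
        JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);

        // Count occurrences of the first comma-separated field within each batch.
        JavaPairDStream<String, Long> counts = lines
                .mapToPair(line -> new Tuple2<>(line.split(",")[0], 1L))
                .reduceByKey(Long::sum);

        counts.print();

        jssc.start();
        jssc.awaitTermination();
    }
}
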

Reposted from blog.csdn.net/baifanwudi/article/details/80003202