Spark in Action (10): Three Ways to Save Data to a Database with Spark SQL

1. When the fields to save are known

   The official Spark documentation gives several ways to read from and write to a database over JDBC. When the fields to save are fixed in advance, this API is very convenient. The code is as follows:

##  Read data
val jdbcDF = spark.read
  .format("jdbc")
  .option("url", "jdbc:postgresql:dbserver")
  .option("dbtable", "schema.tablename")
  .option("user", "username")
  .option("password", "password")
  .load()

val connectionProperties = new Properties()
connectionProperties.put("user", "username")
connectionProperties.put("password", "password")

val jdbcDF2 = spark.read
  .jdbc("jdbc:postgresql:dbserver", "schema.tablename", connectionProperties)
// Specifying the custom data types of the read schema
connectionProperties.put("customSchema", "id DECIMAL(38, 0), name STRING")

val jdbcDF3 = spark.read
  .jdbc("jdbc:postgresql:dbserver", "schema.tablename", connectionProperties)
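
   For large tables, the same read API also supports partitioned (parallel) reads. Below is a minimal sketch, assuming the table has a numeric column named id with roughly known bounds (the column name and bounds here are illustrative assumptions):

// Parallel read: Spark issues one query per partition over the id range
val jdbcPartitionedDF = spark.read
  .format("jdbc")
  .option("url", "jdbc:postgresql:dbserver")
  .option("dbtable", "schema.tablename")
  .option("user", "username")
  .option("password", "password")
  .option("partitionColumn", "id") // assumed numeric column
  .option("lowerBound", "1")
  .option("upperBound", "1000000")
  .option("numPartitions", "10")
  .load()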


##  Save data

// Saving data to a JDBC source
jdbcDF.write
  .format("jdbc")
  .option("url", "jdbc:postgresql:dbserver")
  .option("dbtable", "schema.tablename")
  .option("user", "username")
  .option("password", "password")
  .save()

jdbcDF2.write
  .jdbc("jdbc:postgresql:dbserver", "schema.tablename", connectionProperties)

// Specifying create table column data types on write
jdbcDF.write
  .option("createTableColumnTypes", "name CHAR(64), comments VARCHAR(1024)")
  .jdbc("jdbc:postgresql:dbserver", "schema.tablename", connectionProperties)
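
   Note that by default a JDBC write fails if the target table already exists. The save mode and write-side options can be set explicitly; here is a minimal sketch, where the batchsize value is only an illustrative assumption:

import org.apache.spark.sql.SaveMode

// Append to an existing table instead of failing when it already exists
jdbcDF.write
  .mode(SaveMode.Append)
  .format("jdbc")
  .option("url", "jdbc:postgresql:dbserver")
  .option("dbtable", "schema.tablename")
  .option("user", "username")
  .option("password", "password")
  .option("batchsize", "10000") // rows per JDBC batch insert; the default is 1000
  .save()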

2. When the fields are not fixed: writing manually over JDBC

   When the fields to save are not fixed, you need to obtain a connection through the JDBC driver yourself and write the data with a PreparedStatement:

import java.sql.{Connection, DriverManager, PreparedStatement}

object MySQLUtils {

  /**
    * Get a database connection
    * @return
    */
  def getConnection(): Connection = {
    DriverManager.getConnection("jdbc:mysql://localhost:3306/hellospark?user=root&password=123456")
  }

  /**
    * Release the database connection and statement resources
    * @param connection
    * @param pstmt
    */
  def release(connection: Connection, pstmt: PreparedStatement): Unit = {
    try {
      if (pstmt != null) {
        pstmt.close()
      }
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      if (connection != null) {
        connection.close()
      }
    }
  }
}


import java.sql.{Connection, PreparedStatement}

import scala.collection.mutable.ListBuffer

object StatDAO {

  /**
    * Save records to the database in a batch
    */
  def insertDayVideoAccessTopN(list: ListBuffer[DataStat]): Unit = {

    var connection: Connection = null
    var pstmt: PreparedStatement = null

    try {
      connection = MySQLUtils.getConnection()
      connection.setAutoCommit(false) // disable auto-commit, commit manually
      val sql = "insert into data_stat(day,cms_id,times) values (?,?,?)"

      pstmt = connection.prepareStatement(sql)

      for (ele <- list) {
        pstmt.setString(1, ele.day)
        pstmt.setLong(2, ele.cmsId)
        pstmt.setLong(3, ele.times)
        pstmt.addBatch()
      }

      pstmt.executeBatch() // execute the batch
      connection.commit()  // manual commit

    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      MySQLUtils.release(connection, pstmt)
    }
  }
}
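
   For completeness, here is a minimal sketch of how the DAO above could be driven from a DataFrame. The DataStat case class, the StatWriter helper, and the column names day/cms_id/times are assumptions inferred from the insert statement above, not part of the original code:

// Assumed record shape, matching the insert statement in StatDAO (hypothetical)
case class DataStat(day: String, cmsId: Long, times: Long)

object StatWriter {

  // Gather the rows of each partition into a ListBuffer and write them as one batch
  def saveStats(statDF: org.apache.spark.sql.DataFrame): Unit = {
    statDF.rdd.foreachPartition { partition =>
      val list = new scala.collection.mutable.ListBuffer[DataStat]
      partition.foreach { row =>
        list.append(DataStat(
          row.getAs[String]("day"),
          row.getAs[Long]("cms_id"),
          row.getAs[Long]("times")))
      }
      StatDAO.insertDayVideoAccessTopN(list)
    }
  }
}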

3. When working with RDDs: convert to a DataFrame first

   If the data comes from text files or logs, it first needs to be cleaned; the required fields are then converted into a DataFrame, which is finally saved:

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object ProvinceConvertUtil {

  val structType = StructType(
    Array(
      StructField("CODE", StringType),
      StructField("TNAME", StringType),
      StructField("TSTATUS", StringType),
      StructField("NAME_SHORT", StringType),
      StructField("REGION_CODE", StringType),
      StructField("PARETN_REGION_CODE", StringType),
      StructField("TLEVEL", StringType)
    )
  )

  /**
    * Parse one tab-separated line (region code, name) into a Row.
    */
  def praseLocation(provinceStr: String, code: Int) = {
    val splits = provinceStr.split("\t")
    var parentRegionCode = ""
    val tname = splits(1)
    var tlevel = ""
    val tstatus = "1"
    val nameShort = ""
    val regionCode = splits(0)
    if (splits(0).charAt(2).toString == "0" && splits(0).charAt(3).toString == "0") {
      // province level
      parentRegionCode = ""
      tlevel = "1"
    } else if (splits(0).charAt(4).toString == "0" && splits(0).charAt(5).toString == "0") {
      // city (prefecture) level
      parentRegionCode = splits(0).substring(0, 2) + "0000"
      tlevel = "2"
    } else {
      // district (county) level
      parentRegionCode = splits(0).substring(0, 4) + "00"
      tlevel = "3"
    }
    Row(code.toString, tname, tstatus, nameShort, regionCode, parentRegionCode, tlevel)
  }
}
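
   As a quick check of the parsing logic, assuming each input line is a tab-separated region code and name (the sample values below are illustrative):

// Province level: the 3rd and 4th digits of the code are zero -> tlevel "1", no parent code
ProvinceConvertUtil.praseLocation("110000\tBeijing", 100001)
// => Row(100001, Beijing, 1, , 110000, , 1)

// City level: the 5th and 6th digits are zero -> tlevel "2", parent code 110000
ProvinceConvertUtil.praseLocation("110100\tBeijing City", 100002)
// => Row(100002, Beijing City, 1, , 110100, 110000, 2)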
	

import java.util.Properties

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SaveMode, SparkSession}

object ProvinceFormat {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().appName("TopNStatJob")
      .config("spark.sql.parquet.compression.codec", "gzip")
      .master("local[2]").getOrCreate()

    val accessRdd = transfer(spark, "D://item//mukewang//data//province.txt")

    program(accessRdd, spark)

    spark.stop()
  }

  /**
    * Read a GBK-encoded txt file as an RDD of strings
    * @param sc
    * @param path
    * @return
    */
  def transfer(sc: SparkSession, path: String): RDD[String] = {
    sc.sparkContext.hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], 1)
      .map(p => new String(p._2.getBytes, 0, p._2.getLength, "GBK"))
  }

  /**
    * Apply the schema, then save to MySQL over JDBC and to parquet
    * @param rdd
    * @param spark
    */
  def program(rdd: RDD[String], spark: SparkSession): Unit = {

    // Connection properties for the MySQL sink (same credentials as MySQLUtils)
    val connectionProperties = new Properties()
    connectionProperties.put("user", "root")
    connectionProperties.put("password", "123456")

    // Assign each row a unique code; zipWithIndex avoids a mutable counter captured
    // inside the closure, which would not increment correctly across executors
    val infoDF = spark.createDataFrame(
      rdd.zipWithIndex().map { case (line, idx) =>
        ProvinceConvertUtil.praseLocation(line, (100001 + idx).toInt)
      },
      ProvinceConvertUtil.structType)

    infoDF.show(40)
    infoDF.write.mode(SaveMode.Append).jdbc("jdbc:mysql://localhost:3306/hellospark", "region", connectionProperties)
    infoDF.coalesce(1).write.format("parquet").mode(SaveMode.Overwrite).save("D://item//mukewang//log//out//clean")
  }
}
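
   After the job runs, the written data can be read back from either sink to verify the result, assuming an active SparkSession named spark and the same connection properties as above:

// Read the rows back from the MySQL region table
val regionDF = spark.read.jdbc("jdbc:mysql://localhost:3306/hellospark", "region", connectionProperties)
regionDF.show(10)

// Or read the cleaned parquet output
val cleanDF = spark.read.parquet("D://item//mukewang//log//out//clean")
cleanDF.printSchema()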