This article shows how to read various data sources with Spark using Scala: Hive tables, JSON files, and Parquet files.
1 Reading Hive data
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext

/**
 * @author jhp
 * Reading Hive data with Spark
 */
object HiveDataSource {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("HiveDataSource")
    val sc = new SparkContext(conf)
    // HiveContext (Spark 1.x) provides HiveQL support and access to the Hive metastore
    val hiveContext = new HiveContext(sc)

    // Recreate the student_infos table and load it from a local file
    hiveContext.sql("DROP TABLE IF EXISTS student_infos")
    hiveContext.sql("CREATE TABLE IF NOT EXISTS student_infos (name STRING, age INT)")
    hiveContext.sql("LOAD DATA "
      + "LOCAL INPATH '/usr/local/spark-study/resources/student_infos.txt' "
      + "INTO TABLE student_infos")

    // Recreate the student_scores table and load it from a local file
    hiveContext.sql("DROP TABLE IF EXISTS student_scores")
    hiveContext.sql("CREATE TABLE IF NOT EXISTS student_scores (name STRING, score INT)")
    hiveContext.sql("LOAD DATA "
      + "LOCAL INPATH '/usr/local/spark-study/resources/student_scores.txt' "
      + "INTO TABLE student_scores")

    // Join the two tables and keep only students scoring 80 or above
    val goodStudentsDF = hiveContext.sql("SELECT si.name, si.age, ss.score "
      + "FROM student_infos si "
      + "JOIN student_scores ss ON si.name=ss.name "
      + "WHERE ss.score>=80")

    // Save the result as a new Hive table
    hiveContext.sql("DROP TABLE IF EXISTS good_student_infos")
    goodStudentsDF.write.saveAsTable("good_student_infos")

    // Read the table back and print each row
    val goodStudentRows = hiveContext.table("good_student_infos").collect()
    for (goodStudentRow <- goodStudentRows) {
      println(goodStudentRow)
    }
  }
}
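The example above uses the Spark 1.x HiveContext API. On Spark 2.x and later, the same flow goes through a SparkSession with Hive support enabled. Below is a minimal sketch of the equivalent setup; the object name HiveDataSourceV2 is just an illustrative placeholder, and the table name is carried over from the example above.

import org.apache.spark.sql.SparkSession

// Sketch only: Spark 2.x+ equivalent of the HiveContext-based example above
object HiveDataSourceV2 {
  def main(args: Array[String]): Unit = {
    // SparkSession replaces SparkConf/SparkContext/HiveContext
    val spark = SparkSession.builder()
      .appName("HiveDataSource")
      .enableHiveSupport()  // required for Hive metastore access
      .getOrCreate()

    // The same SQL statements work unchanged through spark.sql(...)
    val goodStudentsDF = spark.sql(
      "SELECT si.name, si.age, ss.score " +
      "FROM student_infos si " +
      "JOIN student_scores ss ON si.name=ss.name " +
      "WHERE ss.score>=80")

    // Persist the result as a Hive table and read it back
    goodStudentsDF.write.mode("overwrite").saveAsTable("good_student_infos")
    spark.table("good_student_infos").show()
  }
}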
2 Reading JSON data
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * @author jhp
 * Reading JSON files with Spark
 */
object JSONDataSource {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("JSONDataSource")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Create the student-scores DataFrame from a JSON file on HDFS
    val studentScoresDF = sqlContext.read.json("hdfs://spark1:9000/spark-study/students.json")

    // Query the names and scores of students scoring 80 or above
    studentScoresDF.registerTempTable("student_scores")
    val goodStudentScoresDF = sqlContext.sql("select name,score from student_scores where score>=80")
    val goodStudentNames = goodStudentScoresDF.rdd.map { row => row(0) }.collect()

    // Create the student-info DataFrame from an in-memory array of JSON strings
    val studentInfoJSONs = Array(
      "{\"name\":\"Leo\", \"age\":18}",
      "{\"name\":\"Marry\", \"age\":17}",
      "{\"name\":\"Jack\", \"age\":19}")
    val studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs, 3)
    val studentInfosDF = sqlContext.read.json(studentInfoJSONsRDD)

    // Query the basic info of the qualifying students,
    // building the IN clause from the collected names
    studentInfosDF.registerTempTable("student_infos")
    var sql = "select name,age from student_infos where name in ("
    for (i <- 0 until goodStudentNames.length) {
      sql += "'" + goodStudentNames(i) + "'"
      if (i < goodStudentNames.length - 1) {
        sql += ","
      }
    }
    sql += ")"
    val goodStudentInfosDF = sqlContext.sql(sql)

    // Join the scores of the qualifying students with their basic info
    val goodStudentsRDD = goodStudentScoresDF.rdd.map { row =>
        (row.getAs[String]("name"), row.getAs[Long]("score"))
      }
      .join(goodStudentInfosDF.rdd.map { row =>
        (row.getAs[String]("name"), row.getAs[Long]("age"))
      })

    // Convert the joined RDD back to a DataFrame with an explicit schema
    val goodStudentRowsRDD = goodStudentsRDD.map(
      info => Row(info._1, info._2._1.toInt, info._2._2.toInt))
    val structType = StructType(Array(
      StructField("name", StringType, true),
      StructField("score", IntegerType, true),
      StructField("age", IntegerType, true)))
    val goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType)

    // Save the DataFrame to HDFS as JSON
    goodStudentsDF.write.format("json").save("hdfs://spark1:9000/spark-study/good-students-scala")
  }
}
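Building the IN clause by string concatenation works, but the DataFrame API can express the same logic without assembling SQL by hand: filter once, then join on name. Here is a minimal sketch of that alternative, reusing studentScoresDF and studentInfosDF from the example above.

// Sketch: filter + join instead of a hand-built IN clause.
// goodScoresDF keeps only rows with score >= 80; the inner join then
// drops any student_infos row whose name has no qualifying score.
val goodScoresDF = studentScoresDF.filter(studentScoresDF("score") >= 80)
val goodStudentsJoinedDF = goodScoresDF
  .join(studentInfosDF, goodScoresDF("name") === studentInfosDF("name"))
  .select(goodScoresDF("name"), goodScoresDF("score"), studentInfosDF("age"))
goodStudentsJoinedDF.show()

This avoids collecting the names to the driver and sidesteps quoting issues in the generated SQL string.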
3 Reading Parquet data
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.{SQLContext, SaveMode}

/**
 * @author jhp
 * Working with Parquet data in Spark (schema merging)
 */
object ParquetMergeSchema {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("ParquetMergeSchema")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    import sqlContext.implicits._

    // Create a DataFrame of basic student info and append it to a parquet directory
    val studentsWithNameAge = Array(("leo", 23), ("jack", 25)).toSeq
    val studentsWithNameAgeDF = sc.parallelize(studentsWithNameAge, 2).toDF("name", "age")
    studentsWithNameAgeDF.write.mode(SaveMode.Append)
      .parquet("hdfs://spark1:9000/spark-study/students")

    // Create a second DataFrame of student grades and append it to the same directory
    val studentsWithNameGrade = Array(("marry", "A"), ("tom", "B")).toSeq
    val studentsWithNameGradeDF = sc.parallelize(studentsWithNameGrade, 2).toDF("name", "grade")
    studentsWithNameGradeDF.write.mode(SaveMode.Append)
      .parquet("hdfs://spark1:9000/spark-study/students")

    // The two DataFrames have different schemas: one has name and age,
    // the other has name and grade. With the mergeSchema option enabled,
    // reading the directory merges the metadata of both files, so the
    // resulting table has three columns: name, age, and grade.
    val students = sqlContext.read.option("mergeSchema", "true")
      .parquet("hdfs://spark1:9000/spark-study/students")
    students.printSchema()
    students.show()
  }
}
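Instead of passing the option on every read, schema merging can also be enabled globally through the spark.sql.parquet.mergeSchema configuration (it defaults to false because merging has to inspect extra file footers, which costs I/O). A minimal sketch, assuming the same sqlContext and HDFS path as above:

// Sketch: enable parquet schema merging globally for this SQLContext
sqlContext.setConf("spark.sql.parquet.mergeSchema", "true")

// Subsequent parquet reads now merge schemas without the per-read option
val studentsMerged = sqlContext.read
  .parquet("hdfs://spark1:9000/spark-study/students")
studentsMerged.printSchema()  // expected: name, age, grade after the merge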