1. Creating a DataFrame from a JSON file
The JSON file looks like this:
{"name":"Fanbingbing", "score":100}
{"name":"Xuruyun", "score":99}
{"name":"Liangyongqi", "score":74}
Java code:
package demo.java.cn;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
public class DataFrameFromJson
{
public static void main(String[] args)
{
SparkConf conf = new SparkConf();
conf.setMaster("local").setAppName("jsonfile");
SparkContext sc = new SparkContext(conf);
//create the SQLContext
SQLContext sqlContext = new SQLContext(sc);
//read the JSON file
DataFrame df = sqlContext.read().format("json").load("star.json");
df.show();//display the DataFrame's contents; pass a row count to show(n) to display more than the default 20 rows
df.printSchema();//print the schema
//register the DataFrame as a temporary table; it lives only in memory and is never written to disk
df.registerTempTable("startable");
DataFrame sqlDf = sqlContext.sql("select * from startable where score >80");
sqlDf.show();
sc.stop();
}
}
Output:
+-----------+-----+
| name|score|
+-----------+-----+
|Fanbingbing| 100|
| Xuruyun| 99|
|Liangyongqi| 74|
+-----------+-----+
root
|-- name: string (nullable = true)
|-- score: long (nullable = true)
+-----------+-----+
| name|score|
+-----------+-----+
|Fanbingbing| 100|
| Xuruyun| 99|
+-----------+-----+
Scala code:
package demo.scala.cn
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object DataFrameFromJson {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local").setAppName("jsonfile")
val sc=new SparkContext(conf)
val sqlContext=new SQLContext(sc)
val df=sqlContext.read.json("star.json")
df.show() //display the DataFrame's contents; pass a row count to show(n) to display more than the default 20 rows
df.printSchema() //print the schema
df.registerTempTable("startable")
val sqlDf=sqlContext.sql("select * from startable where score >80")
sqlDf.show()
sc.stop()
}
}
2. Creating a DataFrame from a non-JSON file
The data file looks like this:
Fanbingbing,100
Xuruyun,99
Liangyongqi,74
Java code:
package demo.java.cn;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.Arrays;
import java.util.List;
public class DataFrameFromFile
{
public static void main(String[] args)
{
SparkConf conf = new SparkConf();
conf.setMaster("local").setAppName("rddStruct");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
//JavaRDD<String> linesRDD = sc.textFile("star.txt");
JavaRDD<String> linesRDD = sc.textFile("hdfs://localhost:9000/user/root/modelNames/part-00000");
JavaRDD<Row> rowRDD = linesRDD.map(new Function<String, Row>()
{
private static final long serialVersionUID = 1L;
public Row call(String s) throws Exception
{
String[] split = s.split(",");
return RowFactory.create(//the field order here must match the StructField list below
String.valueOf(split[0]),
Integer.valueOf(split[1])
);
}
});
List<StructField> asList = Arrays.asList(
DataTypes.createStructField("name", DataTypes.StringType, true),
DataTypes.createStructField("score", DataTypes.IntegerType, true)
);
StructType schema = DataTypes.createStructType(asList);
DataFrame df = sqlContext.createDataFrame(rowRDD, schema);
df.show();
//convert the DataFrame back to an RDD
JavaRDD<Row> rowRDD2 = df.javaRDD();
rowRDD2.foreach(new VoidFunction<Row>()
{
public void call(Row row) throws Exception
{
System.out.print(row.getString(0));
System.out.println(","+row.getInt(1));
}
});
sc.stop();
}
}
Scala code:
package demo.scala.cn
import org.apache.spark.sql.{RowFactory, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
object DataFrameFromFile {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local").setAppName("rddStruct")
val sc = new SparkContext(conf)
val sqlContext=new SQLContext(sc)
val linesRDD = sc.textFile("star.txt")
val rowRDD = linesRDD.map { x => {
val split = x.split(",")
RowFactory.create(split(0), Integer.valueOf(split(1)))
}}
val schema = StructType(List(
StructField("name", StringType, true),
StructField("score", IntegerType, true)
))
val df=sqlContext.createDataFrame(rowRDD,schema)
df.show()
df.printSchema()
sc.stop()
}
}
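The Java example above also converts the DataFrame back to an RDD of Row; the same step in Scala, as a minimal sketch that assumes the df built above, looks like this:
//convert the DataFrame back to an RDD[Row] and print each row
val rowRDD2 = df.rdd
rowRDD2.foreach { row =>
println(row.getString(0) + "," + row.getInt(1))
}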
3. Saving a DataFrame as a Parquet file. There are two ways to write Parquet (a short Scala sketch follows the list):
- df.write().mode(SaveMode.Overwrite).format("parquet").save("./sparksql/parquet");
- df.write().mode(SaveMode.Overwrite).parquet("./sparksql/parquet");
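For example, a minimal end-to-end Scala sketch (assuming the star.json file from section 1; the object name SaveAsParquet and the app name are just for illustration) that writes the DataFrame out with both variants:
Scala code:
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}
object SaveAsParquet {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("saveparquet")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
val df = sqlContext.read.json("star.json")
//variant 1: go through the generic writer and name the format explicitly
df.write.mode(SaveMode.Overwrite).format("parquet").save("./sparksql/parquet")
//variant 2: the parquet shortcut; both variants produce the same files
df.write.mode(SaveMode.Overwrite).parquet("./sparksql/parquet")
sc.stop()
}
}
Section 4 below reads the same ./sparksql/parquet directory back into a DataFrame.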
4. Creating a DataFrame from a Parquet file
Java code:
package demo.java.cn;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
public class DataFrameFromParquet
{
public static void main(String[] args)
{
SparkConf conf = new SparkConf();
conf.setMaster("local").setAppName("fromparquet");
SparkContext sc = new SparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
DataFrame df = sqlContext.read().parquet("./sparksql/parquet");
df.show();
sc.stop();
}
}
Scala code:
package demo.scala.cn
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object DataFrameFromParquet {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local").setAppName("fromparquet")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
val df = sqlContext.read.parquet("./sparksql/parquet")
df.show()
sc.stop()
}
}
5. Creating a DataFrame from data in MySQL
The data in MySQL is as follows:
mysql> select * from Star;
+-------------+-------+
| name | score |
+-------------+-------+
| Fanbingbing | 100 |
| Xuruyun | 99 |
| Liangyongqi | 74 |
+-------------+-------+
3 rows in set (0.00 sec)
Java code:
package demo.java.cn;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import java.util.HashMap;
import java.util.Map;
public class DataFrameFromMysql
{
public static void main(String[] args)
{
SparkConf conf = new SparkConf();
conf.setMaster("local").setAppName("mysql");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(sc);
Map<String, String> options = new HashMap<String, String>();
options.put("url", "jdbc:mysql://master.cn:3306/db_spark");
options.put("driver", "com.mysql.jdbc.Driver");
options.put("user", "root");
options.put("password", "123456");
options.put("dbtable", "Star");
DataFrame df = sqlContext.read().format("jdbc").options(options).load();
df.show();
sc.stop();
}
}
Scala code:
package demo.scala.cn
import java.util
import java.util.Properties
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}
object DataFrameFromMysql {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setMaster("local").setAppName("mysql")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
val options = new util.HashMap[String, String]()
options.put("url", "jdbc:mysql://master.cn:3306/db_spark")
options.put("driver", "com.mysql.jdbc.Driver")
options.put("user", "root")
options.put("password", "123456")
options.put("dbtable", "Star")
val df = sqlContext.read.format("jdbc").options(options).load()
df.show()
//insert the DataFrame's data into MySQL
val properties = new Properties()
properties.setProperty("user","root")
properties.setProperty("password","123456");
df.write.mode(SaveMode.Append).jdbc("jdbc:mysql://master.cn:3306/db_spark","result",properties)
sc.stop()
}
}
6. Creating a DataFrame from data in Hive
The data in Hive is as follows:
hive> select * from Star;
+-------------+-------+
| name | score |
+-------------+-------+
| Fanbingbing | 100 |
| Xuruyun | 99 |
| Liangyongqi | 74 |
+-------------+-------+
3 rows in set (0.00 sec)
Java code:
SparkConf conf = new SparkConf();
conf.setAppName("hive");
JavaSparkContext sc = new JavaSparkContext(conf);
//HiveContext is a subclass of SQLContext.
HiveContext hiveContext = new HiveContext(sc);
hiveContext.sql("USE spark");
hiveContext.sql("DROP TABLE IF EXISTS student_infos");
//create the student_infos table in Hive
hiveContext.sql("CREATE TABLE IF NOT EXISTS student_infos (name STRING,age INT) row format delimited fields terminated by '\t' ");
hiveContext.sql("load data local inpath '/root/test/student_infos' into table student_infos");
hiveContext.sql("DROP TABLE IF EXISTS student_scores");
hiveContext.sql("CREATE TABLE IF NOT EXISTS student_scores (name STRING, score INT) row format delimited fields terminated by '\t'");
hiveContext.sql("LOAD DATA "
+ "LOCAL INPATH '/root/test/student_scores'"
+ "INTO TABLE student_scores");
/**
* Query the tables to produce a DataFrame
*/
DataFrame goodStudentsDF = hiveContext.sql("SELECT si.name, si.age, ss.score "
+ "FROM student_infos si "
+ "JOIN student_scores ss "
+ "ON si.name=ss.name "
+ "WHERE ss.score>=80");
hiveContext.sql("DROP TABLE IF EXISTS good_student_infos");
goodStudentsDF.registerTempTable("goodstudent");
DataFrame result = hiveContext.sql("select * from goodstudent");
result.show();
/**
* Save the results to the Hive table good_student_infos
*/
goodStudentsDF.write().mode(SaveMode.Overwrite).saveAsTable("good_student_infos");
Row[] goodStudentRows = hiveContext.table("good_student_infos").collect();
for(Row goodStudentRow : goodStudentRows) {
System.out.println(goodStudentRow);
}
sc.stop();
Scala code:
val conf = new SparkConf()
conf.setAppName("HiveSource")
val sc = new SparkContext(conf)
/**
* HiveContext is a subclass of SQLContext.
*/
val hiveContext = new HiveContext(sc)
hiveContext.sql("use spark")
hiveContext.sql("drop table if exists student_infos")
hiveContext.sql("create table if not exists student_infos (name string,age int) row format delimited fields terminated by '\t'")
hiveContext.sql("load data local inpath '/root/test/student_infos' into table student_infos")
hiveContext.sql("drop table if exists student_scores")
hiveContext.sql("create table if not exists student_scores (name string,score int) row format delimited fields terminated by '\t'")
hiveContext.sql("load data local inpath '/root/test/student_scores' into table student_scores")
val df = hiveContext.sql("select si.name,si.age,ss.score from student_infos si,student_scores ss where si.name = ss.name and ss.score >= 80")
hiveContext.sql("drop table if exists good_student_infos")
/**
* Write the results into the Hive table
*/
df.write.mode(SaveMode.Overwrite).saveAsTable("good_student_infos")
sc.stop()