1. Java version
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class WordCountLocal {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("wordCountLocal").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Data source: parallelize an in-memory collection into an RDD
        JavaRDD<String> sourceRdd = sc.parallelize(Arrays.asList("hello world", "hello you"));
        // Split each line into words (Spark 1.x API: flatMap returns an Iterable)
        JavaRDD<String> splitRdd = sourceRdd.flatMap(new FlatMapFunction<String, String>() {
            private static final long serialVersionUID = 1L;
            public Iterable<String> call(String row) throws Exception {
                return Arrays.asList(row.split(" "));
            }
        });
        // Map: turn each word into a (word, 1) pair
        JavaPairRDD<String, Integer> mapRdd = splitRdd
                .mapToPair(new PairFunction<String, String, Integer>() {
                    private static final long serialVersionUID = 1L;
                    public Tuple2<String, Integer> call(String row) throws Exception {
                        return new Tuple2<String, Integer>(row, 1);
                    }
                });
        // Reduce: sum the counts for each word
        JavaPairRDD<String, Integer> reduceRdd = mapRdd
                .reduceByKey(new Function2<Integer, Integer, Integer>() {
                    private static final long serialVersionUID = 1L;
                    public Integer call(Integer v1, Integer v2) throws Exception {
                        return v1 + v2;
                    }
                });
        // Print each (word, count) pair
        reduceRdd.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            private static final long serialVersionUID = 1L;
            public void call(Tuple2<String, Integer> t) throws Exception {
                System.out.println("word:" + t._1 + ",count:" + t._2);
            }
        });
        // Shut down the context
        sc.close();
    }
}
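The anonymous inner classes above are verbose. As a minimal sketch of the same three steps, assuming Java 8 and the same Spark 1.x Java API (where flatMap returns an Iterable; on Spark 2.x+ the lambda would have to return an Iterator instead), the whole pipeline can be chained with lambdas. The class name WordCountLambda is just an illustration:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class WordCountLambda {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("wordCountLambda").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Same split -> (word, 1) -> sum-per-key steps as the version above
        JavaPairRDD<String, Integer> counts = sc
                .parallelize(Arrays.asList("hello world", "hello you"))
                .flatMap(line -> Arrays.asList(line.split(" "))) // Iterable on Spark 1.x
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((v1, v2) -> v1 + v2);
        counts.foreach(t -> System.out.println("word:" + t._1 + ",count:" + t._2));
        sc.close();
    }
}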
2. Scala version
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Read the input file from HDFS, with a single partition
    val lines = sc.textFile("hdfs://spark1:9000/spark.txt", 1)
    val words = lines.flatMap { line => line.split(" ") }
    val pairs = words.map { word => (word, 1) }
    val wordCounts = pairs.reduceByKey(_ + _)
    wordCounts.foreach(wordCount => println(wordCount._1 + " appeared " + wordCount._2 + " times."))
    sc.stop()
  }
}
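Unlike the Java example, this version does not call setMaster and reads its input from HDFS, so it is meant to be packaged (with sbt or Maven) and launched on a cluster via spark-submit rather than run locally. One design point worth noting: reduceByKey aggregates values locally within each partition before shuffling, so only the partial sums cross the network; the next section digs into this execution flow.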
3. How it works