map 的输出传入 reduce 时是按键排序的,但每个键对应的值序列是无序的。如果要构造有序的值序列,就需要二次排序:构造组合中间键,让 MapReduce 框架代为执行排序。
//中间键的排序顺序
DateTemperaturePair类
// Composite ("secondary sort") key: natural key = yearMonth, secondary key = temperature.
// The framework sorts on the full composite key, so reducer values arrive ordered by temperature.
// NOTE(review): WritableComparable already extends Writable, so listing both is redundant.
public class DateTemperaturePair
implements Writable, WritableComparable<DateTemperaturePair>{
private Text yearMonth = new Text(); // natural key (YYYYMM)
private Text day =new Text();
private IntWritable temperature =new IntWritable(); // secondary key
...
@Override
// Comparator controlling the sort order of composite keys:
// primary order by yearMonth, ties broken by temperature.
public int compareTo(DateTemperaturePair pair){
int compareValue= this.yearMonth.compareTo(pair.getYearMonth());
if (compareValue ==0){
// Same yearMonth: fall back to the secondary key.
compareValue = temperature.compareTo(pair.getTemperature());
}
return compareValue; // ascending order
}
....
}
定制分区器:控制哪个归约器处理哪些键。分区器基于自然键(而不是二次组合键)进行分区,确保具有相同自然键的记录发送给同一个归约器。
DateTemperaturePartitioner 类
/**
 * Custom partitioner: partitions on the NATURAL key (yearMonth) only, so every
 * record sharing a yearMonth is routed to the same reducer regardless of its
 * temperature (the secondary part of the composite key).
 */
public class DateTemperaturePartitioner
        extends Partitioner<DateTemperaturePair, Text> {

    @Override
    public int getPartition(DateTemperaturePair pair,
                            Text text,
                            int numberOfPartitions) {
        // Fixed typo: getYeatMonth() -> getYearMonth().
        // Mask the sign bit instead of Math.abs(): this is the standard Hadoop
        // idiom and guarantees a non-negative partition index for any hashCode.
        return (pair.getYearMonth().hashCode() & Integer.MAX_VALUE) % numberOfPartitions;
    }
}
分组比较器 控制哪些键分组到一个reduce函数调用
DateTemperatureGroupingComparator类
pulic class DateTemperatureGroupingComparator
extends WritableComparator{
public DateTemperatureGroupingComparator(){
super(DateTemperaturePair.class,true)}
@Override
//控制那些键要分组到一个reduce方法调用
public int compare(WritableComparable wc1,WritableComparable wc2){
DateTemperaturePair pair=(DateTemperaturePair) wc1;
DateTemperaturePair pair2 = (DateTemperaturePair) wc2;
return pair.getYearMonth().compareTo(pair2.getYearMonth());
}
}
Hadoop二次排序解决方案 完整的MapReduce实现
// Mapper (pseudocode): parses one CSV line and emits the composite key so the
// framework sorts reducer input by (yearMonth, temperature).
map(key, value) {
    // Input line format: YYYY,MM,DD,temperature  (fixed comment typo: was "YYY")
    String[] tokens = value.split(",");
    String yearMonth = tokens[0] + tokens[1];
    String day = tokens[2];
    // Fixed typo: Inter.parseInt -> Integer.parseInt.
    int temperature = Integer.parseInt(tokens[3]);
    DateTemperaturePair reduceKey = new DateTemperaturePair();
    reduceKey.setYearMonth(yearMonth);
    // Fixed: setDay(Day) and setTemperature(Temperature) referenced
    // undefined capitalized variables; use the locals declared above.
    reduceKey.setDay(day);
    reduceKey.setTemperature(temperature);
    emit(reduceKey, temperature);
}
reduce 函数
// Reducer (pseudocode): thanks to the composite key + grouping comparator, the
// incoming values are already sorted by temperature; just join them with commas.
reduce(key, value) {
    // Fixed: variable was declared 'SortedTemperatureList' but used as
    // 'sortedTemperatureList' (case mismatch = compile error).
    StringBuilder sortedTemperatureList = new StringBuilder();
    for (Integer temperature : value) {
        // Separator guard avoids the trailing comma the original produced.
        if (sortedTemperatureList.length() > 0) {
            sortedTemperatureList.append(",");
        }
        sortedTemperatureList.append(temperature);
    }
    emit(key, sortedTemperatureList);
}
spark
// Driver outline for the Spark secondary-sort solution; the numbered steps are
// expanded in the code that follows this skeleton.
public class SecondarySort{
//1 read the input arguments and validate them
//2 create a JavaSparkContext object to connect to the Spark master
//3 use ctx to create a JavaRDD
//4 build key-value pairs from the RDD
//5 validate the previous step's data and print it
//6 group the RDD elements by key
//7 collect all RDD values and print them
//8 sort the reducer values
//9 validate the reducer output by printing the RDD values
ctx.close();
System.exit(0);
}
// Step 1: read the input file path (on HDFS) from the command line and validate it.
if (args.length < 1) {
    System.err.println("usage:secondarysort<file>");
    System.exit(1);
}
String inputPath = args[0];
System.out.println("args[0]:<file>=" + args[0]);

// Step 2: connect to the Spark master. The JavaSparkContext initializes the
// application driver, registers it with the cluster manager, and obtains a
// list of executors to run the application.
final JavaSparkContext ctx = new JavaSparkContext();

// Step 3: use the JavaSparkContext to create a JavaRDD by reading an HDFS file.
// Fixed typos: JavaRDD<string> -> JavaRDD<String>, inputpath -> inputPath.
JavaRDD<String> lines = ctx.textFile(inputPath, 1);
//从RDD创建键值对
JavaPairRDD<String,Tuple2<Integer, Integer>> pairs=
lines.mapToPair(new PairFunction<String,String,Tuple2<Integer,Integer>>){
public Tuple2<String, Tuple2<Integer,Integer>> call(String s){
String[] tokens=s.split(",");
System.out.println(tokens[0]+","+tokens[1]+","+tokens[2]);
Integer time = new Integer(tokens[1]);
Integer value = new Integer(tokens[2]);
Tuple2<Integer,Integer> timevalue=
new Tuple2<Integer, Integer>(time, value);
return new Tuple2<String,Tuple2<Integer,Integer>>(tokens[0],timevalue);
new Tuple2}}
// Step 6: group the JavaPairRDD entries by key (name), producing one
// Iterable of (time, value) tuples per key.
JavaPairRDD<String,Iterable<Tuple2<Integer,Integer>>> groups =pairs.groupByKey();
// Step 8: sort each reducer's values in memory (per-key, inside mapValues).
// NOTE(review): this statement appears to continue past this excerpt (the
// closing ');' of mapValues(...) is not visible). Defects to confirm/fix at the
// source: 'mapvalues' should be 'mapValues'; the anonymous Function needs '()'
// before '{' (i.e. '>>() {'); and 'newList==' should be 'newList ='.
JavaPairRDD<String,Iterable<Tuple2<Integer,Integer>>> sorted=groups.mapvalues(
new Function<Iterable<Tuple2<Integer,Integer>>,
Iterable<Tuple2<Integer,Integer>>>){
// Copies the grouped values into a list, sorts with TupleComparator, returns it.
public Iterable<Tuple2<Integer,Integer>> call (Iterable<Tuple2<Integer, Integer>> s){
List<Tuple2<Integer,Integer>> newList== new ArrayList<Tuple2<Integer,Integer>>(s);
Collections.sort(newList,new TupleComparator());
return newList;}
}