spark(10)-spark高级排序(course19)

1.内容
基础排序算法实战
二次排序算法实战
更高级排序算法
排序算法内幕解密

//修改一下log级别
scala> sc.setLogLevel("WARN")

2.二次排序:排序时同时考虑两个维度,先按第一个维度排序;当第一个维度相同时,再按第二个维度排序。

例如数据源:
SecondSort.txt

2 3 
4 1 
3 2 
4 3 
9 7 
2 1

3.编写自定义的key(JavaBean),实现 scala.math.Ordered 接口(定义排序规则)和 java.io.Serializable 接口:

/*
SecondSortKey.java
*/
package cn.whbing.spark.SparkApps.cores;

import java.io.Serializable;

import scala.math.Ordered;

/**
 * Composite key for a secondary sort: keys are ordered by {@code first}, and
 * ties on {@code first} are broken by {@code second}.
 *
 * <p>The ordering contract implemented here is Scala's {@code scala.math.Ordered},
 * not a plain Java {@code Comparable}/{@code Comparator}. Every relational
 * operator delegates to {@link #compare(SecondSortKey)} so that all of them are
 * guaranteed to agree with each other and with {@link #equals(Object)}.
 *
 * <p>NOTE(review): the class is {@code Serializable}, presumably so Spark can
 * ship keys between tasks during a sort — confirm against the Spark version in use.
 */
public class SecondSortKey implements Ordered<SecondSortKey>,Serializable{

    private static final long serialVersionUID = 1L;

    // The two sort dimensions: primary, then tie-breaker.
    private int first;
    private int second;

    /**
     * Creates a key from its two sort dimensions.
     *
     * @param first  primary sort dimension
     * @param second secondary sort dimension, used only when {@code first} ties
     */
    public SecondSortKey(int first,int second) {
        this.first = first;
        this.second = second;
    }

    /** @return the primary sort dimension */
    public int getFirst() {
        return first;
    }

    /** @param first the new primary sort dimension */
    public void setFirst(int first) {
        this.first = first;
    }

    /** @return the secondary sort dimension */
    public int getSecond() {
        return second;
    }

    /** @param second the new secondary sort dimension */
    public void setSecond(int second) {
        this.second = second;
    }

    /** @return {@code true} if this key sorts strictly after {@code other} */
    @Override
    public boolean $greater(SecondSortKey other) {
        return compare(other) > 0;
    }

    /** @return {@code true} if this key sorts after {@code other} or ties with it */
    @Override
    public boolean $greater$eq(SecondSortKey other) {
        return compare(other) >= 0;
    }

    /** @return {@code true} if this key sorts strictly before {@code other} */
    @Override
    public boolean $less(SecondSortKey other) {
        return compare(other) < 0;
    }

    /** @return {@code true} if this key sorts before {@code other} or ties with it */
    @Override
    public boolean $less$eq(SecondSortKey other) {
        // The previous hand-written version compared this.first against
        // other.getSecond(); delegating to compare() removes that mistake.
        return compare(other) <= 0;
    }

    /**
     * Compares by {@code first}, then by {@code second}.
     *
     * @param other the key to compare against
     * @return a negative value, zero, or a positive value when this key sorts
     *         before, equal to, or after {@code other}
     */
    @Override
    public int compare(SecondSortKey other) {
        // Integer.compare instead of subtraction: a difference of two ints can
        // overflow and flip the sign of the result.
        int byFirst = Integer.compare(this.first, other.first);
        if (byFirst != 0) {
            return byFirst;
        }
        return Integer.compare(this.second, other.second);
    }

    /** Same ordering as {@link #compare(SecondSortKey)}. */
    @Override
    public int compareTo(SecondSortKey other) {
        return compare(other);
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + second;
        return result;
    }

    /** Two keys are equal exactly when both dimensions match. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        SecondSortKey other = (SecondSortKey) obj;
        return first == other.first && second == other.second;
    }
}
/*SecondSortApp.java*/

package cn.whbing.spark.SparkApps.cores;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Secondary-sort example driver.
 *
 * <ol>
 *   <li>Define a custom key ({@code SecondSortKey}) that implements Ordered and Serializable.</li>
 *   <li>Load the file to be sorted into a {@code <key, value>} pair RDD, keeping
 *       the original line as the value.</li>
 *   <li>Run {@code sortByKey} so the custom key ordering performs the secondary sort.</li>
 *   <li>Drop the key and keep only the sorted original lines.</li>
 * </ol>
 */
public class SecondSortApp {

    /** Input file used when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "D://javaTools//EclipseWork2//SparkApps//SecondSort.txt";

    /**
     * Sorts the lines of the input file by their first and then second integer
     * column and prints them.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     */
    public static void main(String[] args) {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;

        SparkConf conf = new SparkConf();
        conf.setAppName("SecondSort").setMaster("local");

        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            sc.setLogLevel("WARN");
            JavaRDD<String> lines = sc.textFile(inputPath);

            // Blank lines carry no key; skip them instead of failing while parsing.
            JavaRDD<String> records = lines.filter(new Function<String, Boolean>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Boolean call(String line) throws Exception {
                    return !line.trim().isEmpty();
                }
            });

            // Input: one text line. Output: (SecondSortKey parsed from the line, the untouched line).
            JavaPairRDD<SecondSortKey, String> pairs = records.mapToPair(new PairFunction<String, SecondSortKey, String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<SecondSortKey, String> call(String line) throws Exception {
                    // Trim and split on any whitespace run so trailing spaces,
                    // tabs and repeated separators are tolerated.
                    String[] fields = line.trim().split("\\s+");
                    if (fields.length < 2) {
                        throw new IllegalArgumentException(
                                "Expected two integers per line but got: '" + line + "'");
                    }
                    SecondSortKey key = new SecondSortKey(
                            Integer.parseInt(fields[0]), Integer.parseInt(fields[1]));
                    return new Tuple2<SecondSortKey, String>(key, line);
                }
            });

            // The secondary sort itself: ordering comes from SecondSortKey.
            JavaPairRDD<SecondSortKey, String> sorted = pairs.sortByKey();

            // Drop the synthetic key and keep the original line.
            JavaRDD<String> secondSorted = sorted.map(new Function<Tuple2<SecondSortKey,String>, String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public String call(Tuple2<SecondSortKey, String> sortedContent) throws Exception {
                    return sortedContent._2;
                }
            });

            secondSorted.foreach(new VoidFunction<String>() {

                private static final long serialVersionUID = 1L;

                @Override
                public void call(String sorted) throws Exception {
                    System.out.println(sorted);
                }
            });
        } finally {
            // Always release the context, even when the job fails.
            sc.stop();
        }
    }
}

结果:

2 1
2 3 
3 2 
4 1 
4 3 
9 7 

4.小结:
对于待排序的每一行原始数据,先将其中参与排序的字段封装成一个实现了Ordered接口的JavaBean,作为key;
原来的整行内容作为value;
对上述(k,v)执行sortByKey操作,即可按自定义的规则完成排序;
最后去掉自定义的key,只保留value(原始行)即可。

猜你喜欢

转载自blog.csdn.net/answer100answer/article/details/78774783