Big Data Series: Hadoop Notes (9) - Typical MapReduce Examples: Total Sort, Inverted Sort, and Secondary Sort

The implementations below all run locally rather than on a distributed cluster; to run them distributed, just package the code into a JAR.

1. Total Sort

A total sort means that the keys are in ascending order across all of the result files, not just within a single output file (partial order): reading from the first output file through to the last, the keys run from smallest to largest.

Ways to implement it:

      1. Use a single reducer.
      2. Define a custom partition function and set the partition ranges by hand.
      3. Use Hadoop's sampling mechanism.

A sampler generates a partition file; Hadoop's TotalOrderPartitioner then uses it to assign each key to a partition.

As an example, let's randomly generate 6,000 weather records, each pairing a year with a temperature, and use a total sort to output the maximum temperature for each year.

Methods 1 and 2 are not general-purpose; method 3 is the one implemented below.
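
For reference, a hand-rolled version of method 2 might look like the sketch below (this is not in the original post). The year boundaries 2003 and 2036 are arbitrary assumptions for the 1970-2069 data generated later; they split the key space into three ascending ranges, one per reducer.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Method 2 sketch: partition by hand-picked year ranges so that every key
 * in partition 0 is smaller than every key in partition 1, and so on.
 * Assumes exactly 3 reducers; the boundaries are guesses tied to this data set.
 */
public class YearRangePartitioner extends Partitioner<IntWritable, IntWritable> {
    @Override
    public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
        int year = key.get();
        if (year < 2003) {
            return 0;
        } else if (year < 2036) {
            return 1;
        }
        return 2;
    }
}

Fixed boundaries like these break as soon as the key distribution shifts, which is exactly why method 3 lets a sampler pick the boundaries from the data instead.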

Data preparation:

import org.junit.Test;

import java.io.FileWriter;
import java.io.IOException;
import java.util.Random;

/**
 * @author zpx
 * @Description: Randomly generate weather data (year and temperature)
 * @date 2018/7/3 16:24
 */
public class PrepareTempData{
    @Test
    public void makeData() throws IOException {
        FileWriter fw=new FileWriter("f:/hadoop-mr/temp.txt");
        for (int i = 0; i < 6000; i++) {
            int year = 1970 + new Random().nextInt(100);   //year in [1970, 2069]
            int temp = -30 + new Random().nextInt(100);    //temperature in [-30, 69]
            fw.write(year + " " + temp + "\r\n");
        }
        fw.close();
    }

}
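
Note that the total-sort driver below reads a SequenceFile of (IntWritable year, IntWritable temperature) pairs from F://hadoop-mr//seq//temp, while the generator above writes plain text. The original post does not show the conversion step; a minimal sketch, assuming the local paths used above, could be:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

import java.io.BufferedReader;
import java.io.FileReader;

/**
 * Converts the generated text data into the SequenceFile
 * (IntWritable year -> IntWritable temperature) that MaxTempApp expects.
 */
public class Text2Seq {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(new Path("f:/hadoop-mr/seq/temp/part-0")),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(IntWritable.class));
        BufferedReader br = new BufferedReader(new FileReader("f:/hadoop-mr/temp.txt"));
        String line;
        while ((line = br.readLine()) != null) {
            String[] arr = line.trim().split(" ");
            writer.append(new IntWritable(Integer.parseInt(arr[0])),
                          new IntWritable(Integer.parseInt(arr[1])));
        }
        br.close();
        writer.close();
    }
}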

Mapper implementation:

package com.hadoop.mr.allosort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @author zpx
 * @Description: Mapper for the total-sort max-temperature job
 * @date 2018/6/28 11:13
 */
public class MaxTempMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void map(IntWritable key, IntWritable value, Context context) throws IOException, InterruptedException {
        //identity map: the SequenceFile already holds (year, temperature) pairs
        context.write(key, value);
    }
}

Reducer implementation:

package com.hadoop.mr.allosort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @author zpx
 * @Description: Reducer for the total-sort max-temperature job
 * @date 2018/7/3 16:37
 */
public class MaxTempReducer extends Reducer<IntWritable,IntWritable,IntWritable,IntWritable>{
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        //keep the highest temperature seen for this year
        for (IntWritable v : values) {
            max = Math.max(max, v.get());
        }
        context.write(key, new IntWritable(max));
    }
}

Main class:

package com.hadoop.mr.allosort;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

/**
 * Driver for the total-sort max-temperature job.
 */
public class MaxTempApp {
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS","file:///");
        Job job = Job.getInstance(conf);
        //configure the job
        job.setJobName("MaxTempApp");                           //job name
        job.setJarByClass(MaxTempApp.class);                    //class used to locate the jar
        job.setInputFormatClass(SequenceFileInputFormat.class); //input format
        //input path
        FileInputFormat.addInputPath(job,new Path("F://hadoop-mr//seq//temp"));
        //output path
        FileOutputFormat.setOutputPath(job,new Path("F://hadoop-mr//out"));
        job.setMapperClass(MaxTempMapper.class);                //mapper class
        job.setReducerClass(MaxTempReducer.class);              //reducer class
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);
        //create a random sampler
        //freq: probability of each key being chosen
        //numSamples: total number of samples to draw
        //maxSplitsSampled: maximum number of input splits to sample
        InputSampler.Sampler<IntWritable, IntWritable> sampler =
                new InputSampler.RandomSampler<IntWritable, IntWritable>(0.5, 3000, 3);
        job.setNumReduceTasks(3);                               //number of reducers = number of partitions
        //write the sampled partition boundaries to the partition file
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),new Path("f:/hadoop-mr/seq/par.lst"));
        //use the total-order partitioner
        job.setPartitionerClass(TotalOrderPartitioner.class);
        InputSampler.writePartitionFile(job, sampler);
        job.waitForCompletion(true);
    }
}

2. Inverted Sort

Swap the key and the value, as sketched below.
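
The original leaves it at that one line, so here is a minimal sketch (an assumption, not the author's code): since MapReduce sorts on keys, you sort by value simply by emitting (value, key) in the mapper. Assuming word-count style input lines of the form "word\tcount":

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Inverted-sort sketch: swap K and V so the framework sorts on the former value.
 * Input lines are assumed to look like "word\tcount".
 */
public class SwapKVMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] arr = value.toString().split("\t");
        //emit (count, word): counts become the keys, so output is ordered by count
        context.write(new IntWritable(Integer.parseInt(arr[1])), new Text(arr[0]));
    }
}

With a single reducer (or a total-order setup like the one above), the output then comes back ordered by the original value.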

3. Secondary Sort

MapReduce sorts by key on the map side by default, but sometimes we also need to order by value. Let's use a secondary sort to implement the same weather task: generate 6,000 records of random years and temperatures, then output each year's maximum temperature.

(1) Define a custom composite key

package com.hadoop.mr.secondarysort;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Custom composite key: (year, temperature)
 */
public class ComboKey implements WritableComparable<ComboKey> {
    private int year ;
    private int temp ;

    public int getYear() {
        return year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public int getTemp() {
        return temp;
    }

    public void setTemp(int temp) {
        this.temp = temp;
    }

    /**
     * Key comparison: years ascending, temperatures descending within a year.
     * (Plain subtraction is safe here because years and temperatures are small.)
     */
    public int compareTo(ComboKey o) {
        int y0 = o.getYear();
        int t0 = o.getTemp();
        //same year: compare temperatures in descending order
        if (year == y0) {
            return -(temp - t0);
        } else {
            //different years: ascending
            return year - y0;
        }
    }

    /**
     * Serialization: write the fields in a fixed order.
     */
    public void write(DataOutput out) throws IOException {
        out.writeInt(year);   //year
        out.writeInt(temp);   //temperature
    }

    public void readFields(DataInput in) throws IOException {
        //deserialization: read the fields back in the same order
        year = in.readInt();
        temp = in.readInt();
    }

    public String toString() {
        return year + ":" + temp ;
    }
}

(2) Custom partitioner: partition by year

package com.hadoop.mr.secondarysort;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * @author 邹培贤
 * @Description: Custom partitioner: partition by year
 * @date 2018/7/3 17:10
 */
public class YearPartitioner extends Partitioner<ComboKey,NullWritable>{
    @Override
    public int getPartition(ComboKey key, NullWritable nullWritable, int numPartitions){
        int year = key.getYear();
        //years are positive here, so this is a valid partition index in [0, numPartitions)
        return year % numPartitions;
    }
}

(3) Custom grouping comparator

package com.hadoop.mr.secondarysort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * @author 邹培贤
 * @Description: Custom grouping comparator: group by year
 * @date 2018/7/3 17:13
 */
public class YearGroupComparator extends WritableComparator{
    protected YearGroupComparator() {
        super(ComboKey.class, true);   //true: instantiate keys so compare() can deserialize them
    }

    public int compare(WritableComparable a, WritableComparable b) {
        ComboKey k1 = (ComboKey) a;
        ComboKey k2 = (ComboKey) b;
        //keys with the same year fall into the same reduce group
        return k1.getYear() - k2.getYear();
    }

}

(4) Key sort comparator

package com.hadoop.mr.secondarysort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * @author 邹培贤
 * @Description: Key sort comparator: within a year, the highest temperature sorts first
 * @date 2018/7/3 17:17
 */
public class ComboKeyComparator extends WritableComparator {

    protected ComboKeyComparator() {
        super(ComboKey.class, true);   //true: instantiate keys so compare() can deserialize them
    }

    public int compare(WritableComparable a, WritableComparable b) {
        ComboKey k1 = (ComboKey) a;
        ComboKey k2 = (ComboKey) b;
        //delegate to ComboKey.compareTo: year ascending, temperature descending
        return k1.compareTo(k2);
    }
}
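
Strictly speaking, this comparator only delegates to ComboKey.compareTo, which is what Hadoop falls back to anyway for a WritableComparable with no registered raw comparator; setting it explicitly in the driver just makes the intended sort order visible.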

(5) Mapper

package com.hadoop.mr.secondarysort;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @author 邹培贤
 * @Description: Mapper for the secondary-sort max-temperature job
 * @date 2018/7/3 17:06
 */
public class MaxTempMapper extends Mapper<LongWritable,Text,ComboKey,NullWritable>{
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //parse a "year temperature" line into a composite key
        String[] arr = value.toString().split(" ");
        ComboKey keyOut = new ComboKey();
        keyOut.setYear(Integer.parseInt(arr[0]));
        keyOut.setTemp(Integer.parseInt(arr[1]));
        context.write(keyOut, NullWritable.get());
    }
}

(6) Reducer

package com.hadoop.mr.secondarysort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @author 邹培贤
 * @Description: Reducer for the secondary-sort max-temperature job
 * @date 2018/7/3 17:18
 */
public class MaxTempReducer extends Reducer<ComboKey,NullWritable,IntWritable,IntWritable>{
    @Override
    protected void reduce(ComboKey key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        //within a group keys are sorted highest-temperature-first, so the lead key
        //already carries this year's maximum temperature
        context.write(new IntWritable(key.getYear()), new IntWritable(key.getTemp()));
    }
}

(7) Main class

package com.hadoop.mr.secondarysort;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the secondary-sort max-temperature job.
 */
public class MaxTempApp {
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS","file:///");

        Job job = Job.getInstance(conf);

        //configure the job
        job.setJobName("SecondarySortApp");                   //job name
        job.setJarByClass(MaxTempApp.class);                  //class used to locate the jar
        job.setInputFormatClass(TextInputFormat.class);       //input format

        //input path
        FileInputFormat.addInputPath(job,new Path("F://hadoop-mr//temp.txt"));
        //output path
        FileOutputFormat.setOutputPath(job,new Path("F://hadoop-mr//out"));

        job.setMapperClass(MaxTempMapper.class);              //mapper class
        job.setReducerClass(MaxTempReducer.class);            //reducer class

        //map output types
        job.setMapOutputKeyClass(ComboKey.class);
        job.setMapOutputValueClass(NullWritable.class);

        //reduce output types
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        //partitioner: spread years across reducers
        job.setPartitionerClass(YearPartitioner.class);
        //grouping comparator: one reduce group per year
        job.setGroupingComparatorClass(YearGroupComparator.class);
        //sort comparator: year ascending, temperature descending
        job.setSortComparatorClass(ComboKeyComparator.class);

        job.setNumReduceTasks(3);                             //number of reducers
        job.waitForCompletion(true);
    }
}
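
To trace through what this produces: YearPartitioner sends each year to reducer year % 3, YearGroupComparator folds all keys with the same year into one reduce group, and ComboKeyComparator puts that year's highest temperature at the head of the group, so each of the three part files ends up with one "year max-temperature" line per year it owns.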


Reprinted from blog.csdn.net/u011444062/article/details/80902832