Map/Reduce二次排序与分组分区用法

我对二次排序的定义是：先按照Key值排序，在Key相同的时候再按照Value的值排序。

数据来源于某程序网的测试数据集

输入

1,mr1,3234
2,mr2,123
3,mr3,9877
4,mr4,348
5,mr5,12345
6,mr6,6646
7,mr7,98
8,mr8,12345
1,mr1,334
2,mr2,3123
3,mr3,97
4,mr4,231
5,mr5,122
6,mr6,3455
7,mr7,1222
8,mr8,12345
4,mr4,123

输出

mr1	3234,334
mr2	3123,123
mr3	9877,97
mr4	348,231,123
mr5	12345,122
mr6	6646,3455
mr7	1222,98
mr8	12345,12345

解释一下输入数据三个字段的含义：第一个是id号，第二个是名字，第三个是花费金额。要求输出每个人的名字及其花费的所有金额，且这些金额按照从大到小排序。

代码：

import java.io.*;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
/**
 * Secondary-sort MapReduce job.
 *
 * <p>Input lines have the form {@code id,name,cost}. For every name the job
 * emits one line {@code name<TAB>cost1,cost2,...} with the costs listed from
 * largest to smallest.
 *
 * <p>How it works:
 * <ul>
 *   <li>the mapper emits a composite key ({@link textpair}: name + cost) so
 *       that the shuffle sorts by name ascending, then by cost descending;</li>
 *   <li>{@link MyPartitioner} routes records by name only, so every record of
 *       one name reaches the same reducer;</li>
 *   <li>{@link GroupingComparator} compares by name only, so one
 *       {@code reduce()} call receives all costs of a name.</li>
 * </ul>
 */
public class ThreeSort extends Configured implements Tool{

	/** Job counters. */
	enum Counter
	{
		/** Number of input lines skipped because they could not be parsed. */
		LINKSKIP,
	}

	/**
	 * Composite map-output key: a name together with one cost.
	 * Natural ordering is name ascending, then cost descending.
	 */
	public static class textpair implements WritableComparable<textpair>
	{
		String name;
		int cost;

		public textpair(String name,int cost)
		{
			this.name=name;
			this.cost=cost;
		}

		/** No-arg constructor; fields are filled in later by {@link #readFields}. */
		public textpair(){}

		/** Reads the fields in the same order {@link #write} emits them. */
		public void readFields(DataInput arg0) throws IOException {
			name=arg0.readUTF();
			cost=arg0.readInt();
		}

		/** Serializes the name (modified UTF-8) followed by the cost. */
		public void write(DataOutput arg0) throws IOException {
			arg0.writeUTF(name);
			arg0.writeInt(cost);
		}

		/**
		 * Orders by name ascending; for equal names the larger cost sorts first.
		 *
		 * @return negative, zero or positive as this key sorts before, equal to
		 *         or after {@code o}
		 */
		public int compareTo(textpair o) {
			int byName=name.compareTo(o.name);
			if(byName!=0)
				return byName;
			// Explicit comparison instead of subtraction: no overflow risk.
			if(cost==o.cost)
				return 0;
			return cost>o.cost?-1:1;
		}

		public String getname()
		{
			return this.name;
		}

		public int getcost()
		{
			return this.cost;
		}

		/** Combines both fields so that the hash is consistent with {@link #equals}. */
		@Override
		public int hashCode() {
			int nameHash=(name==null)?0:name.hashCode();
			return 31*nameHash+cost;
		}

		/** Two keys are equal when both the name and the cost match. */
		@Override
		public boolean equals(Object o)
		{
			if(this==o)
				return true;
			if(!(o instanceof textpair))
				return false;
			textpair other=(textpair)o;
			boolean sameName=(name==null)?other.name==null:name.equals(other.name);
			return sameName&&cost==other.cost;
		}
	}

	/**
	 * Partitions by name only. Partitioning on the cost (or on the whole key)
	 * would scatter the records of one name over several reducers, and the
	 * grouping comparator could then no longer bring them together.
	 */
	public static class MyPartitioner extends Partitioner<textpair,IntWritable>
	{

		@Override
		public int getPartition(textpair key, IntWritable value, int numPartitions) {
			// Mask the sign bit so the result is always in [0, numPartitions).
			return (key.getname().hashCode()&Integer.MAX_VALUE)%numPartitions;
		}

	}

	/**
	 * Groups reducer input by name only, ignoring the cost part of the key,
	 * so that all costs of one name are delivered to a single reduce() call.
	 */
	public static class GroupingComparator extends WritableComparator
	{
		public GroupingComparator()
		{
			// true: let the base class create key instances for deserialization.
			super(textpair.class,true);
		}

		@Override
		@SuppressWarnings("rawtypes") // signature is dictated by WritableComparator
		public int compare(WritableComparable a,WritableComparable b)
		{
			String left=((textpair)a).getname();
			String right=((textpair)b).getname();
			return left.compareTo(right);
		}
	}

	/**
	 * Parses {@code id,name,cost} lines and emits ({@code textpair(name,cost)}, cost).
	 * Lines with fewer than three fields or with a non-numeric id/cost are
	 * counted in {@link Counter#LINKSKIP} and skipped.
	 */
	public static class map extends Mapper<Object,Text,textpair,IntWritable>
	{
		private IntWritable cost=new IntWritable();

		@Override
		public void map(Object key,Text value,Context context)throws IOException,InterruptedException
		{
			String[] fields=value.toString().split(",");
			if(fields.length<3)
			{
				context.getCounter(Counter.LINKSKIP).increment(1);
				return;
			}
			try
			{
				// The id is not part of the output; it is parsed only to reject malformed rows.
				Integer.parseInt(fields[0]);
				int amount=Integer.parseInt(fields[2]);
				cost.set(amount);
				context.write(new textpair(fields[1],amount), cost);
			}
			catch(NumberFormatException e)
			{
				// A bad number should not fail the whole task; count the line and move on.
				context.getCounter(Counter.LINKSKIP).increment(1);
			}
		}
	}

	/**
	 * Joins all costs of one name into a comma-separated list, in the order
	 * they are delivered (descending, per {@link textpair#compareTo}).
	 */
	public static class reduce extends Reducer<textpair,IntWritable,Text,Text>
	{
		public Text okey=new Text();
		public Text ovalue=new Text();

		@Override
		public void reduce(textpair key,Iterable<IntWritable> values,Context context)throws IOException,InterruptedException
		{
			StringBuilder joined=new StringBuilder();
			for(IntWritable value:values)
			{
				if(joined.length()>0)
					joined.append(',');
				joined.append(value.get());
			}
			okey.set(key.getname());
			ovalue.set(joined.toString());
			context.write(okey, ovalue);
		}
	}

	/**
	 * Configures and runs the job.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @return 0 on success, 1 if the job failed, 2 if the arguments are missing
	 */
	public int run(String[] args)throws Exception
	{
		if(args.length<2)
		{
			System.err.println("Usage: ThreeSort <input path> <output path>");
			return 2;
		}
		Configuration conf=getConf();
		// NOTE(review): newer Hadoop releases prefer Job.getInstance(conf, name);
		// the constructor is kept for compatibility with older clusters.
		Job job=new Job(conf,"ThreeSort");
		job.setJarByClass(ThreeSort.class);

		job.setMapperClass(map.class);
		job.setMapOutputKeyClass(textpair.class);
		job.setMapOutputValueClass(IntWritable.class);

		job.setPartitionerClass(MyPartitioner.class);
		job.setGroupingComparatorClass(GroupingComparator.class);

		job.setReducerClass(reduce.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		FileInputFormat.addInputPath(job, new Path(args[0]));
		// The output directory is not cleaned up here; it must not exist yet.
		FileOutputFormat.setOutputPath(job,new Path(args[1]));

		return job.waitForCompletion(true)?0:1;
	}

	public static void main(String[] args)throws Exception
	{
		int res=ToolRunner.run(new Configuration(),new ThreeSort(), args);
		System.exit(res);
	}
}

Map/Reduce二次排序与分组分区用法

猜你喜欢