Hadoop中split数量计算法则（源码跟踪）

　　从前面的文章（MapReduce运行原理【源码跟踪】）我们知道，计算切片的部分在JobSubmitter类中。在IntelliJ IDEA中通过View -> Tool Windows -> Structure查看该类的结构，我们很轻易地就能找到有关split的方法。

我们可以在writeSplits方法中打一个断点，随便运行一个计数程序Debug跟踪查看。

这里给出一下计数程序

WCMapper

 1 package com.qin.MapReduce;
 2 
 3 import org.apache.hadoop.io.IntWritable;
 4 import org.apache.hadoop.io.LongWritable;
 5 import org.apache.hadoop.io.Text;
 6 import org.apache.hadoop.mapreduce.Mapper;
 7 
 8 import java.io.IOException;
 9 
10 public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
11 
12     protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
13         Text outText = new Text();
14         IntWritable valueOut = new IntWritable();
15         String[] split = value.toString().split(" ");
16         for (String  str: split ){
17             outText.set(str);
18             valueOut.set(1);
19             context.write(outText,valueOut);
20         }
21     }
22     
23 }

WCreducer

package com.qin.MapReduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;


public class WCreducer extends Reducer<Text, IntWritable, Text, IntWritable>{

    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable value : values){
            count = value.get() + count;
        }

        context.write(key, new IntWritable(count));
    }
}

WCapp

package com.qin.MapReduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WCapp {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS","file:///");

        Job job = Job.getInstance(conf);
        //Job的各种属性
        job.setJobName("WCapp");        //设置作业名称
        job.setJarByClass(WCapp.class); //设置搜索类
        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCreducer.class);

        job.setNumReduceTasks(1);

        //添加输入路径
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.waitForCompletion(true);   //是否打印出详细信息

    }
}