The essence of running multiple MapReduce jobs is that job2 depends on job1's result: job1's output path is job2's input path.
Launching job2 depends on the completion status job1 returns when it finishes.
From my own experience with triggering multiple jobs serially or in parallel, I still prefer to write them as separate programs and control the execution order with a script, which makes the programs easier to debug and manage.
Of course, which approach to use also depends on the requirements of the project.
The code for chaining multiple dependent jobs is as follows:
package more_job;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class moreJob {

    private static final LongWritable num = new LongWritable(1);

    // Job1: count page views (PV) per baidu.com URL.
    public static class MMap extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] line = value.toString().split("\t");
            String url = line[3];
            if (url.contains("baidu.com")) {
                context.write(new Text(url), num);
            }
        }
    }

    public static class MRed extends Reducer<Text, LongWritable, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Use a local counter: a shared static field would keep
            // accumulating across keys and produce wrong counts.
            long pv = 0;
            for (LongWritable i : values) {
                pv += i.get();
            }
            context.write(key, new Text(Long.toString(pv)));
        }
    }

    // Job2: reads job1's output; line[0] is the URL job1 wrote.
    public static class MMap2 extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] line = value.toString().split("\t");
            // Keep the part of the URL before "baidu.com" (the subdomain prefix).
            String url = line[0].split("baidu.com")[0];
            context.write(new Text(url), num);
        }
    }

    public static class MRed2 extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long pv = 0;
            for (LongWritable i : values) {
                pv += i.get();
            }
            context.write(key, new LongWritable(pv));
        }
    }

    // args: <raw input> <job1 output / job2 input> <job2 output>
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        // Configure job1.
        Job job1 = Job.getInstance(conf, "Job1");
        job1.setJarByClass(moreJob.class);
        job1.setMapperClass(MMap.class);
        job1.setReducerClass(MRed.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(LongWritable.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job1, new Path(args[0]));
        FileOutputFormat.setOutputPath(job1, new Path(args[1]));

        FileSystem fs = new Path(args[1]).getFileSystem(conf);
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        job1.setMaxMapAttempts(4);
        job1.setNumReduceTasks(50);

        /*
         * job1's output path is job2's input path.
         * Check the status job1 returns on completion: only launch job2
         * if job1 finished successfully.
         * job2 depends only on job1's result path, not on the key/value
         * types of job1's output.
         */
        if (job1.waitForCompletion(true)) {
            // Configure job2.
            Job job2 = Job.getInstance(conf, "Job2");
            job2.setJarByClass(moreJob.class);
            job2.setMapperClass(MMap2.class);
            job2.setReducerClass(MRed2.class);
            job2.setMapOutputKeyClass(Text.class);
            job2.setMapOutputValueClass(LongWritable.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(LongWritable.class);
            FileInputFormat.addInputPath(job2, new Path(args[1]));
            FileOutputFormat.setOutputPath(job2, new Path(args[2]));
            if (fs.exists(new Path(args[2]))) {
                fs.delete(new Path(args[2]), true);
            }
            job2.setMaxMapAttempts(4);
            job2.setNumReduceTasks(50);
            System.exit(job2.waitForCompletion(true) ? 0 : 1);
        }
        // job1 failed, so job2 never ran: exit with a non-zero status.
        System.exit(1);
    }
}
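For the serial or parallel triggering mentioned earlier, Hadoop also ships a JobControl/ControlledJob facility in org.apache.hadoop.mapreduce.lib.jobcontrol that declares the same dependency instead of hand-chaining waitForCompletion calls, and runs independent jobs in parallel. Below is a minimal sketch, assuming Hadoop 2.x; the JobChain class name, the "pv-chain" group name, and the buildJob helper are illustrative placeholders, not part of the code above:

package more_job;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;

public class JobChain {

    // Hypothetical helper: in a real program this would apply the same
    // mapper/reducer/path configuration as job1 and job2 in moreJob above.
    private static Job buildJob(Configuration conf, String name) throws IOException {
        return Job.getInstance(conf, name);
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job1 = buildJob(conf, "Job1");
        Job job2 = buildJob(conf, "Job2");

        ControlledJob cjob1 = new ControlledJob(conf);
        cjob1.setJob(job1);
        ControlledJob cjob2 = new ControlledJob(conf);
        cjob2.setJob(job2);
        // Declare the dependency: cjob2 starts only after cjob1 succeeds.
        cjob2.addDependingJob(cjob1);

        JobControl control = new JobControl("pv-chain");
        control.addJob(cjob1);
        control.addJob(cjob2);

        // JobControl is a Runnable: run it on its own thread and poll
        // until every job in the group has finished.
        Thread t = new Thread(control);
        t.start();
        while (!control.allFinished()) {
            Thread.sleep(1000);
        }
        control.stop();
        System.exit(control.getFailedJobList().isEmpty() ? 0 : 1);
    }
}

Compared with chaining waitForCompletion calls by hand, JobControl tracks each job's state and launches a job only after everything it depends on has succeeded, which scales better once the dependency graph grows beyond two jobs.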