环境:ubuntu、hadoop-2.6.0、jdk-1.6
Demo说明:此demo改编自hadoop权威指南一书;
1、存在一个气温记录的txt文件,记录每一年每一个月的气温值(此处数据伪造,记录的为1990 - 1991年数据),如下:
图中+号表示正数温度,-号表示零下。
demo意图是期望通过hadoop计算出每一年的最高气温,结果期望如下:
开始构建hadoop应用,首先此处用到的核心类是Mapper、Reducer、Combiner(这并不是一个具体的类,此对象也是Reducer的一个子类,用于实现hadoop中的合并函数)这三个类,Mapper类是用于从每一行中获取出year(年份)和tempper(气温值),将输出参数的year和tempper作为Combiner的输入参数,Combiner计算出此数据块的最大值(因为在分布式计算中,同一个年份的数据可能被分割在不同的数据块中,所以,合并函数显得非常重要),将合并函数计算的year和tempper传入Reducer,reducer输出结果到HDFS;(hadoop的mapper函数的输入是基于标准流,对文件逐行读取,逐行提交给mapper)
架构如下:
NewMaxTemperMapper >> Mapper的子类;
NewMaxTemperReducer >> Reducer的子类:
NewMaxTempperCombiner >> Reducer的子类:
NewDomain >> 作业驱动类,负责配置并提交Job(需在其中注册上述Mapper、Combiner、Reducer);
NewMaxTemperMapper.java
import java.io.IOException; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; public class NewMaxTemperMapper extends Mapper { private static final int MISSING = 9999; @Override protected void map(LongWritable key, Text value, Context context) throws IOException,InterruptedException { System.out.println("start to work in map method ..."); String line = value.toString(); String year = line.substring(1, 5); int airTempper; if (line.charAt(6) == '+') { airTempper = Integer.parseInt(line.substring(7, 10)); } else { airTempper = Integer.parseInt(line.substring(6, 10)); } if (airTempper != MISSING) { context.write(new Text(year), new IntWritable(airTempper)); } } }
NewMaxTemperReducer.java
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reduce phase: receives all temperatures recorded for one year and writes
 * the maximum to the job output.
 *
 * Bug fix: the supertype and the values parameter were declared raw
 * ({@code extends Reducer}, {@code Iterable values}); with a raw supertype the
 * {@code @Override reduce(...)} below does not override {@code Reducer.reduce}
 * and the class fails to compile. The generic type parameters are restored.
 */
public class NewMaxTemperReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Emits (year, max temperature) for one key group.
     *
     * @param key     the year
     * @param values  every temperature observed for that year
     * @param context sink for the final (year, max) pair
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        System.out.println("start to work in reducer method ...");
        int maxValue = Integer.MIN_VALUE;
        for (IntWritable intWrit : values) {
            maxValue = Math.max(maxValue, intWrit.get());
        }
        context.write(key, new IntWritable(maxValue));
    }
}
NewMaxTempperCombiner.java
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Combiner: runs on the map side and pre-computes the per-split maximum for
 * each year, so only one value per (split, year) crosses the network. Taking
 * the maximum is associative and commutative, which is what makes it safe to
 * run zero or more times before the real reducer.
 *
 * Bug fixes: the supertype and the values parameter were declared raw
 * ({@code extends Reducer}, {@code Iterable values}), so the {@code @Override}
 * below did not compile — the generic type parameters are restored. The
 * redundant {@code new Text(key)} copy on output is also removed; the key can
 * be written as-is.
 */
public class NewMaxTempperCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Emits (year, local max temperature) for one key group of this split.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        System.out.println("NewMaxTempperCombiner - reduce() - do combiner...");
        int maxValue = Integer.MIN_VALUE;
        for (IntWritable intWrit : values) {
            maxValue = Math.max(maxValue, intWrit.get());
        }
        context.write(key, new IntWritable(maxValue));
    }
}
NewDomain.java
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver: configures and submits the max-temperature job.
 *
 * Usage: hadoop jar max-temper.jar NewDomain &lt;input path&gt; &lt;output path&gt;
 *
 * Bug fix: the write-up's whole point is the combiner, yet the original driver
 * never registered it, so NewMaxTempperCombiner was dead code and no map-side
 * pre-aggregation happened. {@code setCombinerClass} is now called.
 */
public class NewDomain {

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        // Exactly two arguments are required: input path and output path.
        if (args == null || args.length != 2) {
            System.err.println("input is not legal.");
            System.exit(-1);
        }

        Job job = new Job();
        // Lets Hadoop locate the jar containing this class on the cluster.
        job.setJarByClass(NewDomain.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(NewMaxTemperMapper.class);
        // Register the combiner so each split's maximum is computed map-side
        // before the shuffle (safe because max is associative/commutative).
        job.setCombinerClass(NewMaxTempperCombiner.class);
        job.setReducerClass(NewMaxTemperReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Block until the job finishes; exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
OK 将项目导出成一个可运行jar
将此jar放到单机模式或者伪分布式模式下,通过hadoop/bin下面的hadoop执行