Hadoop helloworld with Java

Environment: Ubuntu, hadoop-2.6.0, JDK 1.6

About the demo: it is adapted from the book Hadoop: The Definitive Guide.
There is a txt file of temperature records, one reading per month for each year (the data here is fabricated and covers 1990 - 1991), like so:
[Screenshot in the original post: the sample temperature records]

In the figure, a + sign indicates a temperature above zero and a - sign indicates one below zero.
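The original screenshot did not survive this repost, but the record layout can be read off the substring offsets in the mapper below: characters 1-4 hold the year, character 6 holds the sign, and the temperature digits follow. The concrete lines here are hypothetical samples in that layout, not the original data:

01990 +012      -> year 1990, temperature +12
01990 -015      -> year 1990, temperature -15
01991 +025      -> year 1991, temperature +25
01991 9999      -> missing reading (9999), skipped by the mapper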
The goal of the demo is to have Hadoop compute each year's maximum temperature; the expected result is as follows:
[Screenshot in the original post: the expected per-year maximum temperatures]
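With the default TextOutputFormat, each result is written as a tab-separated line of year and maximum temperature; for the hypothetical samples above the output would read:

1990    12
1991    25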

Now let's build the Hadoop application. The three core classes used here are Mapper, Reducer, and Combiner (the combiner is not a separate concrete class; it is just another subclass of Reducer, used to implement Hadoop's combine function). The Mapper extracts the year and the temperature value from each input line and emits them as a (year, temperature) pair, which becomes the Combiner's input. The Combiner computes the maximum over its local data block: in a distributed computation, records for the same year may be split across different blocks, so the combine function matters a great deal, since it shrinks the data that must be shuffled to the reducers. Because taking a maximum is associative and commutative, combining per block first and then reducing across blocks yields the same answer as reducing everything at once. The (year, local maximum) pairs produced by the combine function are passed on to the Reducer, which writes the final result to HDFS. (The framework reads the input file line by line and invokes the mapper once per line.)
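Schematically, the data flow through the job is:

(byte offset, line of text)             --map-->      (year, temperature)
(year, [temperatures in one block])     --combine-->  (year, local maximum)
(year, [local maximums of all blocks])  --reduce-->   (year, overall maximum)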

The structure is as follows:
NewMaxTemperMapper         >> subclass of Mapper;
NewMaxTemperReducer        >> subclass of Reducer;
NewMaxTempperCombiner      >> subclass of Reducer, used as the combine function;
NewDomain                  >> the driver class that configures and submits the job;
NewDomainWithCombiner      >> a driver variant that also registers the combiner (a sketch follows NewDomain.java below).




NewMaxTemperMapper.java


import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


public class NewMaxTemperMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // sentinel value used in the data for a missing temperature reading
    private static final int MISSING = 9999;

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        System.out.println("start to work in map method ...");
        // the key is the byte offset of the line in the file; the value is one line of input
        String line = value.toString();
        // characters 1-4 of each record hold the year
        String year = line.substring(1, 5);

        int airTempper;

        // character 6 carries the sign: skip an explicit '+', but keep a '-' so parseInt sees it
        if (line.charAt(6) == '+') {
            airTempper = Integer.parseInt(line.substring(7, 10));
        } else {
            airTempper = Integer.parseInt(line.substring(6, 10));
        }

        // 9999 marks a missing reading; emit (year, temperature) only for real values
        if (airTempper != MISSING) {
            context.write(new Text(year), new IntWritable(airTempper));
        }
       
    }

}

 



NewMaxTemperReducer.java

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;


public class NewMaxTemperReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
   
   
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        System.out.println("start to work in reducer method ...");
        // scan every temperature reported for this year and keep the maximum
        int maxValue = Integer.MIN_VALUE;

        for (IntWritable intWrit : values) {
            maxValue = Math.max(maxValue, intWrit.get());
        }

        // emit (year, maximum temperature) as the final output record
        context.write(key, new IntWritable(maxValue));
    }
   
   
}

 


NewMaxTempperCombiner.java

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class NewMaxTempperCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
   
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        System.out.println("NewMaxTempperCombiner - reduce() - do combiner...");

        // compute the local maximum for this year over a single mapper's output
        int maxValue = Integer.MIN_VALUE;

        for (IntWritable intWrit : values) {
            maxValue = Math.max(maxValue, intWrit.get());
        }

        // the incoming key can be reused as-is; forward (year, local max) to the reducer
        context.write(key, new IntWritable(maxValue));
       
    }
   
}

 

NewDomain.java

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NewDomain {
   
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        if (args.length != 2) {
            System.err.println("Usage: NewDomain <input path> <output path>");
            System.exit(-1);
        }

        // Job.getInstance() supersedes the Job() constructor, which is deprecated in Hadoop 2.x
        Job job = Job.getInstance();
        job.setJarByClass(NewDomain.class);
       
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
       
        job.setMapperClass(NewMaxTemperMapper.class);
        // note: this basic driver does not register the combiner; see NewDomainWithCombiner below
        job.setReducerClass(NewMaxTemperReducer.class);
       
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
       
        // submit the job and block until it finishes; exit with 0 on success
        System.exit(job.waitForCompletion(true) ? 0 : 1);
       
    }
   
}
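
The class list above also mentions NewDomainWithCombiner, whose source is missing from this repost. Based on the description, it should differ from NewDomain only by registering the combiner through job.setCombinerClass(); a minimal sketch under that assumption:

NewDomainWithCombiner.java

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NewDomainWithCombiner {

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        if (args.length != 2) {
            System.err.println("Usage: NewDomainWithCombiner <input path> <output path>");
            System.exit(-1);
        }

        Job job = Job.getInstance();
        job.setJarByClass(NewDomainWithCombiner.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(NewMaxTemperMapper.class);
        // run the max computation locally on each mapper's output before the shuffle
        job.setCombinerClass(NewMaxTempperCombiner.class);
        job.setReducerClass(NewMaxTemperReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because map, combine, and reduce all emit (Text, IntWritable) here, setting only the output key/value classes is enough; no separate map-output types need to be declared.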

 

OK, export the project as a runnable jar.

Copy the jar onto a standalone or pseudo-distributed Hadoop installation and run it with the hadoop script under hadoop/bin, for example:
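Assuming the exported jar is named max-temper.jar and the sample file was uploaded to input/temperature.txt (both names are made up for this example), the invocation looks like:

$ bin/hadoop jar max-temper.jar NewDomain input/temperature.txt output

Note that the output directory must not already exist; Hadoop refuses to run the job rather than overwrite it.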


Reposted from boyuliu.iteye.com/blog/2193274