A Simple MapReduce Example on Hadoop 1.x: WordCount


1. Preface

Today I came across this example in my old notes, so I'm sharing it here.

**First, the requirement for this example: count the number of occurrences of each distinct word in the files under the /hello directory on HDFS, and output the result.**
MapReduce is a distributed computing model that consists of two phases, Map and Reduce. The user only needs to implement the map() and reduce() functions; data is passed between the two phases as key/value pairs.

(Figure: the MapReduce process)
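
Since the diagram from the original post is not reproduced here, the following is a rough sketch of how data flows through this particular example. Assume, purely for illustration, that the /hello directory contains a single file with two tab-separated lines, "hello\tworld" and "hello\thadoop":

```text
(k1, v1) read by TextInputFormat  ->  (0, "hello\tworld"), (12, "hello\thadoop")
(k2, v2) emitted by map()         ->  ("hello", 1), ("world", 1), ("hello", 1), ("hadoop", 1)
grouped by the shuffle            ->  ("hadoop", [1]), ("hello", [1, 1]), ("world", [1])
(k3, v3) emitted by reduce()      ->  ("hadoop", 1), ("hello", 2), ("world", 1)
```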

2. Code


```java
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class WordCountApp {

    static final String INPUT_PATH = "hdfs://hadoop1:9000/hello";
    static final String OUT_PATH = "hdfs://hadoop1:9000/out";

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        final Job job = new Job(conf,WordCountApp.class.getSimpleName());
        // If the output path already exists, delete it so the job can run again
        FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        Path outPath = new Path(OUT_PATH);
        if(fileSystem.exists(outPath)){
            fileSystem.delete(outPath, true);
        }


        // Specify the input directory
        FileInputFormat.setInputPaths(job, INPUT_PATH);

        // Specify the class that parses the input data (one line per record)
        job.setInputFormatClass(TextInputFormat.class);

        // Specify the custom Mapper class
        job.setMapperClass(MyMapper.class);
        // Specify the Mapper output <k,v> types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Partitioning (HashPartitioner is the default)
        job.setPartitionerClass(HashPartitioner.class);
        // Set the number of reduce tasks
        job.setNumReduceTasks(1);

        // Specify the custom Reducer class and its output <k,v> types
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Specify the output path
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
        // Specify the output format class
        job.setOutputFormatClass(TextOutputFormat.class);
        // Submit the whole job to the JobTracker and wait for it to finish
        job.waitForCompletion(true);



    }
    /**
     * k1    the byte offset at which each line starts
     * v1    the text content of the line
     * k2    each word in the line
     * v2    the count emitted for each word, always the fixed value 1
     * @author mademin
     *
     */
    static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable>{

        @Override
        protected void map(LongWritable k1, Text v1,
                Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // Split each line of text on tab characters
            String[] splited = v1.toString().split("\t");
            for(String word : splited){

                context.write(new Text(word), new LongWritable(1L));
            }


        }
    }

    /**
     * k2      each word output by the mapper
     * v2s     the counts (each 1) emitted for that word
     * k3      each distinct word in the whole input
     * v3      the total number of occurrences of that word
     * @author mademin
     *
     */
    static class MyReduce extends Reducer<Text, LongWritable, Text, LongWritable>{

        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long sum = 0L;

            for(LongWritable v2 : v2s){

                sum += v2.get();
            }

            context.write(k2, new LongWritable(sum));
        }
    }

}
```
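
A common refinement that the code above does not include is to register the reducer class as a combiner, so that the (word, 1) pairs are pre-aggregated on the map side before being shuffled across the network. A minimal sketch of the extra driver line, assuming it is added next to the other job.set...() calls above:

```java
// Pre-aggregate map output locally before the shuffle. Reusing MyReduce as the
// combiner is safe here because summing counts is associative and commutative,
// so combining partial sums does not change the final result.
job.setCombinerClass(MyReduce.class);
```

After the job completes, the single reducer writes its result to /out/part-r-00000; with TextOutputFormat each output line contains a word and its count separated by a tab, and it can be inspected with `hadoop fs -cat /out/part-r-00000`.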
