大数据之mapreduce小实战

手写wordcount的程序

1、pom.xml


  
     
   
   <dependencies>

  
  
  
      
   
   <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs-client -->

  
  
  
          
   
   <dependency>

  
  
  
              
   
   <groupId>org.apache.hadoop
   
   </groupId>

  
  
  
              
   
   <artifactId>hadoop-client
   
   </artifactId>

  
  
  
              
   
   <version>2.7.3
   
   </version>

  
  
  
          
   
   </dependency>

  
  
  
      
   
   </dependencies>

2、新建Mapper类


  
  
   
   package com.hsiehchou.wordcount;

  
  
  
  
   
   import org.apache.hadoop.io.
   
   IntWritable;

  
  
  
  
   
   import org.apache.hadoop.io.
   
   LongWritable;

  
  
  
  
   
   import org.apache.hadoop.io.
   
   Text;

  
  
  
  
   
   import org.apache.hadoop.mapreduce.
   
   Mapper;

  
  
  
  
   
   import java.io.
   
   IOException;

  
  
  
  
   
   /**

  
  
  
  
   
    * 海量数据

  
  
  
  
   
    *

  
  
  
  
   
    * hello hsiehchou

  
  
  
  
   
    * nihao

  
  
  
  
   
    *

  
  
  
  
   
    * 数据的输入与输出以Key value进行传输

  
  
  
  
   
    * keyIN:LongWritable(Long) 数据的起始偏移量

  
  
  
  
   
    * valuewIN:具体数据

  
  
  
  
   
    *

  
  
  
  
   
    * mapper需要把数据传递到reducer阶段（<hello,1>）

  
  
  
  
   
    * keyOut:单词 Text

  
  
  
  
   
    * valueOut:出现的次数IntWritable

  
  
  
  
   
    *

  
  
  
  
   
    */

  
  
  
  
   
   public 
   
   class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

  
  
  
      
   
   //对数据进行打散 ctrl+o

  
  
  
      
   
   @Override

  
  
  
      
   
   protected void map(
   
   LongWritable key, 
   
   Text value, 
   
   Context context) 
   
   throws 
   
   IOException, 
   
   InterruptedException {

  
  
  
          
   
   //1、接入数据 hello nihao

  
  
  
          
   
   String line = value.toString();

  
  
  
          
   
   //2、对数据进行切分

  
  
  
          
   
   String[] words = line.split(
   
   " ");

  
  
  
          
   
   //3、写出以<hello,1>

  
  
  
          
   
   for (
   
   String w:words){

  
  
  
              
   
   //写出reducer端

  
  
  
  
   
               context.write(
   
   new 
   
   Text(w), 
   
   new 
   
   IntWritable(
   
   1));

  
  
  
  
   
           }

  
  
  
  
   
       }

  
  
  
  
   
   }

3、新建Reducer类


  
  
   
   package com.hsiehchou.wordcount;

  
  
  
  
   
   import org.apache.curator.framework.recipes.locks.
   
   InterProcessReadWriteLock;

  
  
  
  
   
   import org.apache.hadoop.io.
   
   IntWritable;

  
  
  
  
   
   import org.apache.hadoop.io.
   
   Text;

  
  
  
  
   
   import org.apache.hadoop.mapreduce.
   
   Reducer;

  
  
  
  
   
   import java.io.
   
   IOException;

  
  
  
  
   
   /**

  
  
  
  
   
    * reducer阶段接收的是Mapper输出的数据

  
  
  
  
   
    * mapper的输出是reducer输入

  
  
  
  
   
    *

  
  
  
  
   
    * keyIn:mapper输出的key的类型

  
  
  
  
   
    * valueIn:mapper输出的value的类型

  
  
  
  
   
    *

  
  
  
  
   
    * reducer端输出的数据类型，想要一个什么样的结果<hello,1888>

  
  
  
  
   
    * keyOut:Text

  
  
  
  
   
    * valueOut:IntWritalble

  
  
  
  
   
    *

  
  
  
  
   
    */

  
  
  
  
   
   public 
   
   class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

  
  
  
      
   
   //key-->单词  value-->次数

  
  
  
      
   
   @Override

  
  
  
      
   
   protected void reduce(
   
   Text key, 
   
   Iterable<
   
   IntWritable> values, 
   
   Context context) 
   
   throws 
   
   IOException, 
   
   InterruptedException {

  
  
  
          
   
   //1、记录出现的次数

  
  
  
  
   
           int sum = 
   
   0;

  
  
  
          
   
   for (
   
   IntWritable v:values){

  
  
  
  
   
               sum += v.get();

  
  
  
  
   
           }

  
  
  
          
   
   //2、l累加求和输出

  
  
  
  
   
           context.write(key, 
   
   new 
   
   IntWritable(sum));

  
  
  
  
   
       }

  
  
  
  
   
   }

4、新建驱动类


  
  
   
   package com.hsiehchou.wordcount;

  
  
  
  
   
   import org.apache.hadoop.conf.Configuration;

  
  
  
  
   
   import org.apache.hadoop.fs.Path;

  
  
  
  
   
   import org.apache.hadoop.io.IntWritable;

  
  
  
  
   
   import org.apache.hadoop.io.Text;

  
  
  
  
   
   import org.apache.hadoop.mapreduce.Job;

  
  
  
  
   
   import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

  
  
  
  
   
   import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

  
  
  
  
   
   import java.io.IOException;

  
  
  
  
   
   public 
   
   class WordCountDriver {

  
  
  
      
   
   public 
   
   static void main(
   
   String[] args) 
   
   throws 
   
   IOException, 
   
   ClassNotFoundException, 
   
   InterruptedException {

  
  
  
          
   
   //1、创建job任务

  
  
  
          
   
   Configuration conf = new 
   
   Configuration();

  
  
  
          
   
   Job job = 
   
   Job.getInstance(conf);

  
  
  
          
   
   //2、指定jar包位置

  
  
  
  
   
           job.setJarByClass(
   
   WordCountDriver.
   
   class);

  
  
  
          
   
   //3、关联使用的Mapper类

  
  
  
  
   
           job.setMapperClass(
   
   WordCountMapper.
   
   class);

  
  
  
          
   
   //4、关联使用的Reducer类

  
  
  
  
   
           job.setReducerClass(
   
   WordCountReducer.
   
   class);

  
  
  
          
   
   //5、设置mapper阶段输出的数据类型

  
  
  
  
   
           job.setMapOutputKeyClass(
   
   Text.
   
   class);

  
  
  
  
   
           job.setMapOutputValueClass(
   
   IntWritable.
   
   class);

  
  
  
          
   
   //6、设置reducer阶段输出的数据类型

  
  
  
  
   
           job.setOutputKeyClass(
   
   Text.
   
   class);

  
  
  
  
   
           job.setOutputValueClass(
   
   IntWritable.
   
   class);

  
  
  
          
   
   //7、设置数据输入的路径

  
  
  
          
   
   FileInputFormat.setInputPaths(job, new 
   
   Path(args[
   
   0]));

  
  
  
          
   
   //8设置数据输出的路径

  
  
  
          
   
   FileOutputFormat.setOutputPath(job, new 
   
   Path(args[
   
   1]));

  
  
  
          
   
   //9、提交任务

  
  
  
  
   
           boolean rs = job.waitForCompletion(
   
   true);

  
  
  
          
   
   System.exit(rs ? 
   
   0:
   
   1);

  
  
  
  
   
       }

  
  
  
  
   
   }

运行结果


  
  
   
   [root@hsiehchou121 ~]# hadoop jar mapreduce
   
   -1.0-SNAPSHOT.jar com.hsiehchou.wordcount.WordCountDriver /wc/in /wc/out

  
  
  
  
   
   [root@hsiehchou121 ~]# hdfs dfs -cat /wc/out/part-r
   
   -00000

  
  
  
  
   
   fd  
   
   1

  
  
  
  
   
   fdgs    
   
   1

  
  
  
  
   
   fdsbv   
   
   1

  
  
  
  
   
   gd  
   
   1

  
  
  
  
   
   hello   
   
   3

5、IDEA的相关使用

Ctrl+O导入相关未实现的方法
Maven中的Lifecycle的package可以直接打包成jar

案例分析

需求：运营商流量日志
10086
计算每个用户当前使用的总流量
思路？总流量 = 上行流量+下行流量
三个字段：手机号上行流量下行流量
技术选型：PB+
数据分析：海量数据(存储hdfs)
海量数据计算(分布式计算框架MapReduce)

实现

FlowBean类


  
  
   
   package com.hsiehchou.logs;

  
  
  
  
   
   import org.apache.hadoop.io.Writable;

  
  
  
  
   
   import java.io.DataInput;

  
  
  
  
   
   import java.io.DataOutput;

  
  
  
  
   
   import java.io.IOException;

  
  
  
  
   
   /**

  
  
  
  
   
    * 封装数据类型需要怎么做

  
  
  
  
   
    * hadoop数据类型实现了序列化接口

  
  
  
  
   
    * 如果自定义需要实现这个序列化接口

  
  
  
  
   
    */

  
  
  
  
   
   public 
   
   class FlowBean implements Writable {

  
  
  
      
   
   //定义属性：上行流量 下行流量 总流量总和

  
  
  
      
   
   private 
   
   long upFlow;

  
  
  
      
   
   private 
   
   long dfFlow;

  
  
  
      
   
   private 
   
   long flowsum;

  
  
  
  
   
   

  
  
  
      
   
   public FlowBean(){}

  
  
  
      
   
   public FlowBean(long upFlow, long dfFlow){

  
  
  
          
   
   this.upFlow = upFlow;

  
  
  
          
   
   this.dfFlow = dfFlow;

  
  
  
          
   
   this.flowsum = upFlow + dfFlow;

  
  
  
  
   
       }

  
  
  
      
   
   public long getUpFlow(){

  
  
  
          
   
   return upFlow;

  
  
  
  
   
       }

  
  
  
  
   
   

  
  
  
      
   
   public void setUpFlow(long upFlow){

  
  
  
          
   
   this.upFlow = upFlow;

  
  
  
  
   
       }

  
  
  
      
   
   public long getDfFlow(){

  
  
  
          
   
   return dfFlow;

  
  
  
  
   
       }

  
  
  
      
   
   public void setDfFlow(long dfFlow){

  
  
  
          
   
   this.dfFlow = dfFlow;

  
  
  
  
   
       }

  
  
  
      
   
   public long getFlowsum(){

  
  
  
          
   
   return flowsum;

  
  
  
  
   
       }

  
  
  
      
   
   public void setFlowsum(long flowsum){

  
  
  
          
   
   this.flowsum = flowsum;

  
  
  
  
   
       }

  
  
  
      
   
   //序列化

  
  
  
      
   
   public void write(DataOutput out) throws IOException {

  
  
  
  
   
           out.writeLong(upFlow);

  
  
  
  
   
           out.writeLong(dfFlow);

  
  
  
  
   
           out.writeLong(flowsum);

  
  
  
  
   
       }

  
  
  
      
   
   //反序列化

  
  
  
      
   
   public void readFields(DataInput in) throws IOException {

  
  
  
  
   
           upFlow = in.readLong();

  
  
  
  
   
           dfFlow = in.readLong();

  
  
  
  
   
           flowsum = in.readLong();

  
  
  
  
   
       }

  
  
  
      
   
   @Override

  
  
  
      
   
   public String toString() {

  
  
  
          
   
   return upFlow + 
   
   "\t" + dfFlow + 
   
   "\t" + flowsum;

  
  
  
  
   
       }

  
  
  
  
   
   }

FlowCountMapper类


  
  
   
   package com.hsiehchou.logs;

  
  
  
  
   
   import org.apache.hadoop.io.
   
   LongWritable;

  
  
  
  
   
   import org.apache.hadoop.io.
   
   Text;

  
  
  
  
   
   import org.apache.hadoop.mapreduce.
   
   Mapper;

  
  
  
  
   
   import java.io.
   
   IOException;

  
  
  
  
   
   /**

  
  
  
  
   
    * keyIN:

  
  
  
  
   
    * valueIN:

  
  
  
  
   
    *

  
  
  
  
   
    * 思路：根据想要的结果的kv类型  手机号  流量总和（上行+下行）自定义类

  
  
  
  
   
    * keyOut:

  
  
  
  
   
    * valueOut:

  
  
  
  
   
    */

  
  
  
  
   
   public 
   
   class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

  
  
  
      
   
   @Override

  
  
  
      
   
   protected void map(
   
   LongWritable key, 
   
   Text value, 
   
   Context context) 
   
   throws 
   
   IOException, 
   
   InterruptedException {

  
  
  
          
   
   //1、接入数据源

  
  
  
          
   
   String line = value.toString();

  
  
  
          
   
   //2、切割   \t

  
  
  
          
   
   String[] fields = line.split(
   
   "\t");

  
  
  
          
   
   //3、拿到关键字段

  
  
  
          
   
   String phoneNr = fields[
   
   1];

  
  
  
  
   
           long upFlow = 
   
   Long.parseLong(fields[fields.length - 
   
   3]);

  
  
  
  
   
           long dfFlow = 
   
   Long.parseLong(fields[fields.length - 
   
   2]);

  
  
  
          
   
   //4、写出到reducer

  
  
  
  
   
           context.write(
   
   new 
   
   Text(phoneNr), 
   
   new 
   
   FlowBean(upFlow,dfFlow));

  
  
  
  
   
       }

  
  
  
  
   
   }

FlowCountReducer类


  
  
   
   package com.hsiehchou.logs;

  
  
  
  
   
   import org.apache.hadoop.io.
   
   Text;

  
  
  
  
   
   import org.apache.hadoop.mapreduce.
   
   Reducer;

  
  
  
  
   
   import java.io.
   
   IOException;

  
  
  
  
   
   public 
   
   class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

  
  
  
      
   
   @Override

  
  
  
      
   
   protected void reduce(
   
   Text key, 
   
   Iterable<
   
   FlowBean> values, 
   
   Context context) 
   
   throws 
   
   IOException, 
   
   InterruptedException {

  
  
  
  
   
           long upFlow_sum = 
   
   0;

  
  
  
  
   
           long dfFlow_sum = 
   
   0;

  
  
  
          
   
   for (
   
   FlowBean v:values){

  
  
  
  
   
               upFlow_sum += v.getUpFlow();

  
  
  
  
   
               dfFlow_sum += v.getDfFlow();

  
  
  
  
   
           }

  
  
  
          
   
   FlowBean rsSum = 
   
   new 
   
   FlowBean(upFlow_sum, dfFlow_sum);

  
  
  
          
   
   //输出结果

  
  
  
  
   
           context.write(key, rsSum);

  
  
  
  
   
       }

  
  
  
  
   
   }

FlowCountDriver类


  
  
   
   package com.hsiehchou.logs;

  
  
  
  
   
   import org.apache.hadoop.conf.Configuration;

  
  
  
  
   
   import org.apache.hadoop.fs.Path;

  
  
  
  
   
   import org.apache.hadoop.io.Text;

  
  
  
  
   
   import org.apache.hadoop.mapreduce.Job;

  
  
  
  
   
   import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;

  
  
  
  
   
   import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

  
  
  
  
   
   import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

  
  
  
  
   
   import java.io.IOException;

  
  
  
  
   
   public 
   
   class FlowCountDriver {

  
  
  
      
   
   public 
   
   static void main(
   
   String[] args) 
   
   throws 
   
   IOException, 
   
   ClassNotFoundException, 
   
   InterruptedException {

  
  
  
          
   
   //1.创建job任务

  
  
  
          
   
   Configuration conf = new 
   
   Configuration();

  
  
  
          
   
   Job job = 
   
   Job.getInstance(conf);

  
  
  
          
   
   //2.指定kjar包位置

  
  
  
  
   
           job.setJarByClass(
   
   FlowCountDriver.
   
   class);

  
  
  
          
   
   //3.关联使用的Mapper

  
  
  
  
   
           job.setMapperClass(
   
   FlowCountMapper.
   
   class);

  
  
  
          
   
   //4.关联使用的Reducer类

  
  
  
  
   
           job.setReducerClass(
   
   FlowCountReducer.
   
   class);

  
  
  
          
   
   //5.设置mapper阶段输出的数据类型

  
  
  
  
   
           job.setMapOutputKeyClass(
   
   Text.
   
   class);

  
  
  
  
   
           job.setMapOutputValueClass(
   
   FlowBean.
   
   class);

  
  
  
          
   
   //6.设置reducer阶段输出的数据类型

  
  
  
  
   
           job.setOutputKeyClass(
   
   Text.
   
   class);

  
  
  
  
   
           job.setOutputValueClass(
   
   FlowBean.
   
   class);

  
  
  
          
   
   //优化含有大量小文件的数据

  
  
  
          
   
   //设置读取数据切片的类

  
  
  
  
   
           job.setInputFormatClass(
   
   CombineTextInputFormat.
   
   class);

  
  
  
          
   
   //最大切片大小8M

  
  
  
          
   
   CombineTextInputFormat.setMaxInputSplitSize(job, 
   
   8388608);

  
  
  
          
   
   //最小切片大小6M

  
  
  
          
   
   CombineTextInputFormat.setMinInputSplitSize(job, 
   
   6291456);

  
  
  
          
   
   //7.设置数据输入的路径

  
  
  
          
   
   FileInputFormat.setInputPaths(job, new 
   
   Path(args[
   
   0]));

  
  
  
          
   
   //8.设置数据输出的路径

  
  
  
          
   
   FileOutputFormat.setOutputPath(job, new 
   
   Path(args[
   
   1]));

  
  
  
          
   
   //9.提交任务

  
  
  
  
   
           boolean  rs = job.waitForCompletion(
   
   true);

  
  
  
          
   
   System.exit(rs? 
   
   0:
   
   1);

  
  
  
  
   
       }

  
  
  
  
   
   }

运行结果


  
  
   
   [root@hsiehchou121 ~]# hdfs dfs -mkdir -p /flow/in

  
  
  
  
   
   [root@hsiehchou121 ~]# hdfs dfs -put HTTP_20180313143750.dat /flow/in

  
  
  
  
   
   [root@hsiehchou121 ~]# hadoop jar mapreduce
   
   -1.0-SNAPSHOT.jar com.hsiehchou.logs.FlowCountDriver /flow/in /flow/out

  
  
  
  
   
   [root@hsiehchou121 ~]# hdfs dfs -cat /flow/out/part-r
   
   -00000

  
  
  
  
   
   13480253104    
   
   120       
   
   1320      
   
   1440

  
  
  
  
   
   13502468823    
   
   735       
   
   11349     
   
   12084

  
  
  
  
   
   13510439658    
   
   1116      
   
   954       
   
   2070

  
  
  
  
   
   13560436326    
   
   1136      
   
   94        
   
   1230

  
  
  
  
   
   13560436666    
   
   1136      
   
   94        
   
   1230

  
  
  
  
   
   13560439658    
   
   918       
   
   4938      
   
   5856

  
  
  
  
   
   13602846565    
   
   198       
   
   910       
   
   1108

  
  
  
  
   
   13660577991    
   
   660       
   
   690       
   
   1350

  
  
  
  
   
   13719199419    
   
   240       
   
   0         
   
   240

  
  
  
  
   
   13726130503    
   
   299       
   
   681       
   
   980

  
  
  
  
   
   13726238888    
   
   2481      
   
   24681     
   
   27162

  
  
  
  
   
   13760778710    
   
   120       
   
   120       
   
   240

  
  
  
  
   
   13822544101    
   
   264       
   
   0         
   
   264

  
  
  
  
   
   13884138413    
   
   4116      
   
   1432      
   
   5548

  
  
  
  
   
   13922314466    
   
   3008      
   
   3720      
   
   6728

  
  
  
  
   
   13925057413    
   
   11058     
   
   4243      
   
   15301

  
  
  
  
   
   13926251106    
   
   240       
   
   0         
   
   240

  
  
  
  
   
   13926435656    
   
   132       
   
   1512      
   
   1644

  
  
  
  
   
   15013685858    
   
   369       
   
   338       
   
   707

  
  
  
  
   
   15889002119    
   
   938       
   
   380       
   
   1318

  
  
  
  
   
   15920133257    
   
   316       
   
   296       
   
   612

  
  
  
  
   
   18212575961    
   
   1527      
   
   2106      
   
   3633

  
  
  
  
   
   18320173382    
   
   9531      
   
   212       
   
   9743

小文件优化

如果企业中存在海量的小文件数据
TextInputFormat按照文件规划切片，文件不管多小都是一个单独的切片，启动mapt
ask任务去执行，这样会产生大量的maptask，浪费资源。
优化手段：
小文件合并大文件，如果不动这个小文件内容。

大数据之mapreduce小实战

手写wordcount的程序

1、pom.xml

2、新建Mapper类

3、新建Reducer类

4、新建驱动类

运行结果

5、IDEA的相关使用

案例分析

实现

FlowBean类

FlowCountMapper类

FlowCountReducer类

FlowCountDriver类

运行结果

小文件优化

猜你喜欢