Use MapReduce to count the number of occurrences of each word in a file

1. Upload the wordcount.txt text file to the /data/ directory. The content of the wordcount.txt file is as follows:

red   black  green  yellow
red blue blue
black big small small   yellow
red red red red
blue 

Insert picture description here

2. Create a Java Maven project and add the HDFS and MapReduce dependencies to pom.xml, as follows

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.che</groupId>
  <artifactId>demo</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>demo</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
  
    <!-- Unit testing only; not packaged into the job jar (test scope). -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    
    <!-- Core Hadoop classes (Configuration, Writable types, etc.).
         All Hadoop artifacts must share the same version (2.7.0 here)
         and should match the version installed on the cluster. -->
    <dependency>
		<groupId>org.apache.hadoop</groupId>
		<artifactId>hadoop-common</artifactId>
		<version>2.7.0</version>
	</dependency>
	
	<!-- HDFS client, needed to read input from / write output to HDFS. -->
	<dependency>
		<groupId>org.apache.hadoop</groupId>
		<artifactId>hadoop-hdfs</artifactId>
		<version>2.7.0</version>
	</dependency>
	
    <!-- MapReduce APIs: Mapper, Reducer, Job, input/output formats. -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.7.0</version>
    </dependency>
    <!-- Aggregate client artifact for submitting jobs to the cluster. -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.0</version>
    </dependency>  
    
  </dependencies>
</project>

3. The code is as follows:

3.1 WordCount Mapper implementation class WordCountMapper.java

package com.che.demo.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * WordCount Mapper实现类
 */
/**
 * WordCount Mapper: emits a {@code (word, 1)} pair for every word on each input line.
 *
 * <p>Input key is the byte offset of the line (unused); input value is the line text.
 * Output types (Text / LongWritable) must match the job's
 * {@code setMapOutputKeyClass} / {@code setMapOutputValueClass} settings.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // Reuse output objects across records — standard Hadoop idiom that avoids
    // allocating two objects per emitted pair.
    private final Text outKey = new Text();
    private final LongWritable one = new LongWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split on runs of whitespace. The sample input contains multiple
        // consecutive spaces ("red   black"); the original split(" ") produced
        // empty-string tokens there, which were then counted as a word "".
        String[] words = value.toString().split("\\s+");

        for (String word : words) {
            if (word.isEmpty()) {
                continue; // leading whitespace yields one empty token — skip it
            }
            outKey.set(word);
            context.write(outKey, one);
        }
    }
}

3.2 WordCount Reducer implementation class WordCountReducer.java

package com.che.demo.mapreduce;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * WordCount Reducer实现类
 */
/**
 * WordCount Reducer: sums the counts emitted for each word.
 *
 * <p>Fixed a type mismatch: the mapper emits {@code LongWritable} values and the
 * driver configures {@code LongWritable} output classes, but this class was
 * declared over {@code IntWritable}, which fails at runtime with a
 * {@code ClassCastException}. The value types here must match the mapper's output.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    // Reused output object — avoids one allocation per distinct word.
    private final LongWritable result = new LongWritable();

    @Override
    protected void reduce(Text word, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        // Sum the values rather than merely counting them, so the result stays
        // correct if a combiner is configured later (combined values can be > 1).
        for (LongWritable value : values) {
            count += value.get();
        }
        result.set(count);
        context.write(word, result);
    }
}

3.3 WordCount Main method implementation class WordCountJob.java

package com.che.demo.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * WordCount Main方法实现类
 */
/**
 * WordCount driver: configures and submits the MapReduce job.
 *
 * <p>Reads input from {@code /user/che/1021001/input} on HDFS and writes results
 * to {@code /user/che/1021001/output}. The output directory must not already exist,
 * or the job will fail on submission.
 */
public class WordCountJob {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job wcjob = Job.getInstance(conf, "word count");

        // Lets Hadoop locate the jar containing these classes on the cluster.
        wcjob.setJarByClass(WordCountJob.class);

        wcjob.setMapperClass(WordCountMapper.class);
        wcjob.setReducerClass(WordCountReducer.class);

        // Intermediate (map-side) key/value types.
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(LongWritable.class);

        // Final (reduce-side) key/value types. The original code called
        // setMapOutputKeyClass a second time here, so the job's final output
        // key class was never configured.
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(LongWritable.class);

        // Location of the data to process.
        FileInputFormat.setInputPaths(wcjob, "/user/che/1021001/input");
        // Location where the results are saved (must not exist yet).
        FileOutputFormat.setOutputPath(wcjob, new Path("/user/che/1021001/output"));

        // Submit the job to the YARN cluster and wait for completion;
        // exit with a non-zero status on failure so shell scripts can detect it.
        boolean res = wcjob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}

3.4 Package the project into a jar file in Eclipse, as follows

Insert picture description here
Then there will be an extra jar file in the target directory of the project as shown below
Insert picture description here

3.5 After renaming demo-0.0.1-SNAPSHOT.jar to demo.jar, upload it to the /data/ directory on centos7

Insert picture description here

3.6 Upload the /data/wordcount.txt file in centos7 to the /user/che/1021001/input directory on hdfs

hdfs dfs -put /data/wordcount.txt /user/che/1021001/input

Insert picture description here

3.7 Execute with hadoop jar command

hadoop jar /data/demo.jar com.che.demo.mapreduce.WordCountJob

Insert picture description here
Insert picture description here

3.8 View the output result, the output result is in /user/che/1021001/output

Insert picture description here

Guess you like

Origin blog.csdn.net/ytangdigl/article/details/109222110