1. Upload the wordcount.txt text file to the /data/ directory. The content of the wordcount.txt file is as follows:
red black green yellow
red blue blue
black big small small yellow
red red red red
blue
2. Create a Java Maven project and add the HDFS and MapReduce dependencies to pom.xml, as follows:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.che</groupId>
<artifactId>demo</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>demo</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.7.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.0</version>
</dependency>
</dependencies>
</project>
3. The code is as follows:
3.1 WordCount Mapper implementation class WordCountMapper.java
package com.che.demo.mapreduce;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
* WordCount Mapper实现类
*/
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // Reusable Writable instances: map() runs once per input line, so
    // allocating fresh objects per token creates avoidable GC pressure.
    private final Text outKey = new Text();
    private static final LongWritable ONE = new LongWritable(1);

    /**
     * Emits a (word, 1) pair for every space-separated token in the line.
     *
     * @param key     byte offset of this line within the input split (unused)
     * @param value   one line of input text
     * @param context used to emit (Text, LongWritable) pairs to the shuffle
     * @throws IOException          on emit failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Convert the Text value to a String and split it into words.
        String line = value.toString();
        for (String word : line.split(" ")) {
            // Skip empty tokens produced by leading or consecutive spaces,
            // which the original code would have counted as the word "".
            if (!word.isEmpty()) {
                outKey.set(word);
                context.write(outKey, ONE);
            }
        }
    }
}
3.2 WordCount Reducer implementation class WordCountReducer.java
package com.che.demo.mapreduce;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
* WordCount Reducer实现类
*/
/**
 * Sums the per-word counts emitted by {@code WordCountMapper}.
 *
 * <p>Bug fix: the mapper emits {@code LongWritable} values (and the job
 * declares {@code LongWritable} as the map output value class), but this
 * class was declared with {@code IntWritable}, which fails at runtime with
 * a type-mismatch error. The value types now match the mapper.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    // Reused for every output record to avoid per-key allocation.
    private final LongWritable result = new LongWritable();

    /**
     * Writes (word, total) where total is the sum of all counts for the word.
     *
     * <p>Summing {@code value.get()} — rather than merely counting how many
     * values arrive — keeps the result correct even if a combiner is later
     * configured, since a combiner emits partial sums greater than 1.
     *
     * @param word    the word being aggregated
     * @param values  the counts emitted for this word
     * @param context used to emit the (word, total) result
     * @throws IOException          on emit failure
     * @throws InterruptedException if the task is interrupted
     */
    @Override
    protected void reduce(Text word, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long count = 0;
        for (LongWritable value : values) {
            count += value.get();
        }
        result.set(count);
        context.write(word, result);
    }
}
3.3 WordCount Main method implementation class WordCountJob.java
package com.che.demo.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* WordCount Main方法实现类
*/
/**
 * Configures and submits the word-count job to the cluster.
 *
 * <p>Bug fix: the original called {@code setMapOutputKeyClass} twice and
 * never set the final output key class; the duplicate call is replaced
 * with {@code setOutputKeyClass}.
 */
public class WordCountJob {

    /** Default HDFS locations used when no command-line arguments are given. */
    private static final String DEFAULT_INPUT = "/user/che/1021001/input";
    private static final String DEFAULT_OUTPUT = "/user/che/1021001/output";

    /**
     * Entry point. Paths may optionally be supplied on the command line,
     * falling back to the tutorial defaults for backward compatibility.
     *
     * @param args optional: args[0] = input path, args[1] = output path
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job wcjob = Job.getInstance(conf, "word count");
        wcjob.setJarByClass(WordCountJob.class);
        wcjob.setMapperClass(WordCountMapper.class);
        wcjob.setReducerClass(WordCountReducer.class);
        // Types of the intermediate (map-side) output.
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(LongWritable.class);
        // Types of the final (reduce-side) output.
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(LongWritable.class);
        // Location of the data to process.
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        // Location where the results are saved after completion.
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;
        FileInputFormat.setInputPaths(wcjob, input);
        FileOutputFormat.setOutputPath(wcjob, new Path(output));
        // Submit this job to the YARN cluster and wait for completion.
        boolean res = wcjob.waitForCompletion(true);
        System.out.println(res ? 0 : 1);
        // Propagate success/failure to the shell so scripts can check $?.
        System.exit(res ? 0 : 1);
    }
}
3.4 Package the project into a jar file in Eclipse; the steps are as follows
Then there will be an extra jar file in the target directory of the project as shown below
3.5 After renaming demo-0.0.1-SNAPSHOT.jar to demo.jar, upload it to the /data/ directory on centos7
3.6 Upload the /data/wordcount.txt file in centos7 to the /user/che/1021001/input directory on hdfs
hdfs dfs -put /data/wordcount.txt /user/che/1021001/input
3.7 Execute with hadoop jar command
hadoop jar /data/demo.jar com.che.demo.mapreduce.WordCountJob