大数据专栏 | ||
---|---|---|
上一篇 | 主目录 | 下一篇 |
目录
【前言】
Intellij IDEA打包MapReduce程序,并使用jar的方式在集群上运行
1 编写代码
1.1 创建Maven项目
1.2 添加依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.shane.hadoop</groupId>
<artifactId>bigdata</artifactId>
<version>1.0-SNAPSHOT</version>
<repositories>
<repository>
<id>apache</id>
<url>http://maven.apache.org</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.3</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
1.3 代码
WCMapper.java
package com.shane.mapreduce;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split(" ");
for(String word : split){
context.write(new Text(word), new IntWritable(1));
}
}
}
WCReducer.java
package com.shane.mapreduce;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WCReducer extends Reducer<Text, IntWritable, Text, LongWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for(IntWritable value : values){
sum += value.get();
}
context.write(key, new LongWritable(sum));
}
}
WordCountMR.java
package com.shane.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountMR {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://192.168.65.101:9000");
Job job = Job.getInstance(conf);
// job.setJar("/home/hadoop/wc.jar");
job.setJarByClass(WordCountMR.class);
/**
* 设置 该 job 的 mapper和 reducer组件
*/
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
job.setNumReduceTasks(1); // reduceTask的编号是: 0 1 2
FileInputFormat.setInputPaths(job, args[0]);
Path outputPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(outputPath)){
fs.delete(outputPath, true);
}
FileOutputFormat.setOutputPath(job, outputPath);
boolean isDone = job.waitForCompletion(true);
System.exit(isDone ? 0 : 1);
}
}
2 项目打包
3 hdfs集群运行
3.1 上传到Linux服务器
打开文件地址
打开Xftp,拖拽上传
3.2 在hadoop01执行jar
hadoop jar bigdata-1.0-SNAPSHOT.jar com.shane.mapreduce.WordCountMR /wc/input /wc/output
其中:
- com.shane.mapreduce.WordCountMR 是主类WordCountMR.java的完整路径,在IDEA中右键->copy reference得到
- /wc/input /wc/output分别为输入输出路径
4 在本地IDEA执行MR
以上是将在idea写好的打成jar包,上传到hadoop01服务器上执行。本节是在idea中连接hadoop执行程序。
需要配置运行参数: