Step 1: Create the WordMapper class
package cn.zengy.mapreduce;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Turns K1,V1 (line offset, line text) into K2,V2 (word, 1).
 */
public class WordMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split V1 (one line of text) into individual words
        String[] split = value.toString().split(",");
        Text text = new Text();
        LongWritable longWritable = new LongWritable();
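        // Reuse the same Text and LongWritable objects for every record instead of
        // allocating new ones per word (the commented-out line below shows the simpler,
        // allocation-heavy alternative)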
        // Iterate over each word
        for (String s : split) {
            // context.write(new Text(s), new LongWritable(1));
            text.set(s);
            longWritable.set(1);
            // Write K2,V2 to the context
            context.write(text, longWritable);
        }
    }
}
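For example, assuming an input line (V1) of hello,world,hello, the mapper emits three K2,V2 pairs: (hello,1), (world,1), (hello,1). Note that the line is split on commas, so the input file is expected to contain comma-separated words.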
Step 2: Create the WordReduce class
package cn.zengy.mapreduce;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
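/**
 * Sums the grouped V2 values (the 1s emitted for each word), turning K2,V2 into K3,V3.
 */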
public class WordReduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        // Sum the grouped V2 values to produce V3
        for (LongWritable value : values) {
            count += value.get();
        }
        // Write K3,V3 to the context
        context.write(key, new LongWritable(count));
    }
}
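Continuing the example above, the shuffle groups the pairs by key, so the reducer receives (hello, [1,1]) and (world, [1]) and writes (hello, 2) and (world, 1) as K3,V3.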
Step 3: Create the JobMain class (wires WordMapper and WordReduce together)
package cn.zengy.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class JobMain extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        Job job = Job.getInstance(super.getConf(), "wordcount_mapreduce");
        job.setJarByClass(JobMain.class);
        // Step 1: set the input format class (parses the input into K1,V1) and the input path
        job.setInputFormatClass(TextInputFormat.class);
        // The input path is /wordcount in HDFS
        TextInputFormat.addInputPath(job, new Path("hdfs://node1:8020/wordcount"));
        // Step 2: set the Mapper class, which turns K1,V1 into K2,V2
        job.setMapperClass(WordMapper.class);
        // Set the output classes for K2 and V2
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Steps 3 to 6 (partition, sort, combine, group) are omitted; the Hadoop defaults are used
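        // Optional sketch (an addition, not in the original): a combiner could pre-aggregate
        // counts on the map side; reusing WordReduce as the combiner is safe here because
        // summing is associative. Uncomment to try it:
        // job.setCombinerClass(WordReduce.class);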
        // Step 7: set the Reducer class
        job.setReducerClass(WordReduce.class);
        // Set the output classes for K3 and V3
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Step 8: set the output format class
        job.setOutputFormatClass(TextOutputFormat.class);
        // The output path is /output/wordcount in HDFS
        // Note: choose a path that does not exist yet; the job fails if the path already exists
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node1:8020/output/wordcount"));
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new Configuration(), new JobMain(), args);
        System.exit(run);
    }
}
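Because the job fails when the output path already exists, repeated runs need the old /output/wordcount directory removed first. The sketch below is an assumption, not part of the original code (the OutputPathCleaner name is made up for illustration): it deletes the directory through the HDFS FileSystem API, and run() could call it just before TextOutputFormat.setOutputPath.
package cn.zengy.mapreduce;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
// Hypothetical helper class, sketched for illustration only
public class OutputPathCleaner {
    public static void deleteIfExists(Configuration conf, String uri) throws Exception {
        Path outputPath = new Path(uri);
        // Connect to the file system that owns the path (hdfs://node1:8020 here)
        FileSystem fileSystem = FileSystem.get(new URI(uri), conf);
        if (fileSystem.exists(outputPath)) {
            // true = delete the directory and its contents recursively
            fileSystem.delete(outputPath, true);
        }
    }
}
A call such as OutputPathCleaner.deleteIfExists(super.getConf(), "hdfs://node1:8020/output/wordcount") at the top of run() would make the job rerunnable.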
Running the MapReduce job
The job depends on a running Hadoop cluster, so launching it directly from Windows fails;
it has to be run on a Linux machine where Hadoop is already set up.
Step 4: Add the packaging declaration to pom.xml (a direct child of the <project> element):
<packaging>jar</packaging>
Step 5: In the Maven Lifecycle panel, run clean to clear out the previously built target output.
Step 6: In the Maven Lifecycle panel, click package to build the jar.
Step 7: Copy the built jar, target/original-hdfs_api_java_test-1.0-SNAPSHOT.jar, to the desktop.
Step 8: Upload original-hdfs_api_java_test-1.0-SNAPSHOT.jar from the desktop to the Linux machine:
cd /export/servers    # change into /export/servers
rz -E                 # upload the file (rz is provided by the lrzsz package)
Step 9: Run the job:
hadoop jar <jar file name> <fully qualified name of the class to run>
hadoop jar original-hdfs_api_java_test-1.0-SNAPSHOT.jar cn.zengy.mapreduce.JobMain
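If the job finishes successfully, the result should appear under /output/wordcount in HDFS, by default as a single part-r-00000 file in which TextOutputFormat writes each word and its count separated by a tab; it can be inspected with hadoop fs -cat /output/wordcount/part-r-00000.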