WordCount with the old API, and specifying arguments when running WordCountApp from the command line

Word count implemented with the Hadoop 0.x (old) API

package old;

import java.io.IOException;
import java.net.URI;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

/**
 * Word count implemented with the old API.
 *
 * In Hadoop 1.x the classes generally come from the mapreduce package;
 * in Hadoop 0.x they generally come from the mapred package.
 */
public class OldApp {

    static final String INPUT_PATH = "hdfs://chaoren:9000/hello";
    static final String OUT_PATH = "hdfs://chaoren:9000/out";

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }
        /*
         * Change 1: use JobConf instead of Job.
         * Change 2: the classes come from the mapred package instead of mapreduce.
         * Change 3: submit with JobClient.runJob(job) instead of job.waitForCompletion(true).
         */
        JobConf job = new JobConf(conf, OldApp.class);

        // 1.1 Specify where the input files are read from
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // Specify how the input is formatted: each line is parsed into a key/value pair
        // job.setInputFormat(TextInputFormat.class);

        // 1.2 Specify the custom map class
        job.setMapperClass(MyMapper.class);
        // The <k,v> types emitted by the map. If <k3,v3> has the same types as <k2,v2>, these can be omitted
        // job.setOutputKeyClass(Text.class);
        // job.setOutputValueClass(LongWritable.class);

        // 1.3 Partitioning
        // job.setPartitionerClass(org.apache.hadoop.mapred.lib.HashPartitioner.class);
        // Run a single reduce task
        // job.setNumReduceTasks(1);

        // 1.4 Sorting and grouping

        // 1.5 Combining

        // 2.2 Specify the custom reduce class
        job.setReducerClass(MyReducer.class);
        // Specify the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 2.3 Specify where the output is written
        FileOutputFormat.setOutputPath(job, outPath);
        // Specify the output format class
        // job.setOutputFormat(TextOutputFormat.class);

        // Submit the job to the jobtracker
        JobClient.runJob(job);
    }

    /**
     * New API: extends Mapper
     * Old API: extends MapReduceBase implements Mapper
     */
    static class MyMapper extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, LongWritable> {
        public void map(LongWritable k1, Text v1,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            String[] split = v1.toString().split("\t");
            for (String word : split) {
                collector.collect(new Text(word), new LongWritable(1));
            }
        }
    }

    static class MyReducer extends MapReduceBase implements
            Reducer<Text, LongWritable, Text, LongWritable> {
        public void reduce(Text k2, Iterator<LongWritable> v2s,
                OutputCollector<Text, LongWritable> collector, Reporter reporter)
                throws IOException {
            long times = 0L;
            while (v2s.hasNext()) {
                long temp = v2s.next().get();
                times += temp;
            }
            collector.collect(k2, new LongWritable(times));
        }
    }

}

View the result:
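To inspect the output from the shell, something like the following should work (a minimal sketch, assuming the default single reduce task, whose output file with the old API is normally named part-00000 under the output directory):

hadoop fs -cat hdfs://chaoren:9000/out/part-00000

Each output line is a word followed by its count, separated by a tab.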

Specifying arguments when running WordCountApp from the command line

1. Modify the earlier WordCountApp.java as follows:

package cmd;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountApp extends Configured implements Tool {
    static String INPUT_PATH = "";
    static String OUT_PATH = "";

    public int run(String[] args) throws Exception {
        INPUT_PATH = args[0];
        OUT_PATH = args[1];

        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        Path outPath = new Path(OUT_PATH);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        Job job = new Job(conf, WordCountApp.class.getSimpleName());

        // Must be set when the job is run from a packaged jar
        job.setJarByClass(WordCountApp.class);

        // 1.1 Specify where the input files are read from
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        // Specify how the input is formatted: each line is parsed into a key/value pair
        // job.setInputFormatClass(TextInputFormat.class);

        // 1.2 Specify the custom map class
        job.setMapperClass(MyMapper.class);
        // The <k,v> types emitted by the map. If <k3,v3> has the same types as <k2,v2>, these can be omitted
        // job.setOutputKeyClass(Text.class);
        // job.setOutputValueClass(LongWritable.class);

        // 1.3 Partitioning
        // job.setPartitionerClass(org.apache.hadoop.mapreduce.lib.partition.HashPartitioner.class);
        // Run a single reduce task
        // job.setNumReduceTasks(1);

        // 1.4 Sorting and grouping

        // 1.5 Combining

        // 2.2 Specify the custom reduce class
        job.setReducerClass(MyReducer.class);
        // Specify the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 2.3 Specify where the output is written
        FileOutputFormat.setOutputPath(job, outPath);
        // Specify the output format class
        // job.setOutputFormatClass(TextOutputFormat.class);

        // Submit the job to the jobtracker
        job.waitForCompletion(true);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new WordCountApp(), args);
    }

    /**
     * KEYIN   (K1): the byte offset of the line within the file
     * VALUEIN (V1): the text of the line
     * KEYOUT  (K2): a word appearing in the line
     * VALUEOUT(V2): the count for that occurrence, always 1
     */
    static class MyMapper extends
            Mapper<LongWritable, Text, Text, LongWritable> {
        protected void map(LongWritable k1, Text v1, Context context)
                throws java.io.IOException, InterruptedException {
            String[] splited = v1.toString().split("\t");
            for (String word : splited) {
                context.write(new Text(word), new LongWritable(1));
            }
        }
    }

    /**
     * KEYIN   (K2): a word appearing in a line
     * VALUEIN (V2): the counts emitted for that word
     * KEYOUT  (K3): a distinct word
     * VALUEOUT(V3): the total number of times that word appears
     */
    static class MyReducer extends
            Reducer<Text, LongWritable, Text, LongWritable> {
        protected void reduce(Text k2, Iterable<LongWritable> v2s,
                Context ctx) throws java.io.IOException, InterruptedException {
            long times = 0L;
            for (LongWritable count : v2s) {
                times += count.get();
            }
            ctx.write(k2, new LongWritable(times));
        }
    }

}

2. After making these changes, do not run the class from Eclipse. Instead, export it as a jar and copy it to the /usr/local directory on the Linux machine with WinSCP.


3. Run it from the Linux command line; once the job completes successfully, view the result.
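The exact jar name depends on how it was exported from Eclipse; assuming it was saved as /usr/local/wordcount.jar (a hypothetical name), the invocation and the result check look roughly like this, with the HDFS input and output paths passed as the two arguments that run() reads:

cd /usr/local
hadoop jar wordcount.jar cmd.WordCountApp hdfs://chaoren:9000/hello hdfs://chaoren:9000/out
hadoop fs -cat hdfs://chaoren:9000/out/part-r-00000

With the new (mapreduce) API the reduce output file is typically named part-r-00000.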



Reposted from blog.csdn.net/chenyuanshengboke/article/details/83928579