Java代码:
/**
 * Word-count mapper: emits a {@code (word, 1)} pair for every whitespace-separated
 * token found in an input line.
 */
public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** The constant count written alongside every word. */
    private static final IntWritable ONE = new IntWritable(1);

    /** Output key holder, refilled for each token instead of allocating a new Text. */
    private final Text outKey = new Text();

    /**
     * Splits one input line into words and writes each word with a count of 1.
     *
     * @param key     key supplied by the framework for this record; not used
     * @param value   the line of text to tokenize
     * @param context sink that receives the {@code (word, 1)} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // String has no zero-argument split(); split on runs of whitespace instead.
        String[] words = value.toString().split("\\s+");
        for (String word : words) {
            // A blank line or leading whitespace produces an empty first token; skip it
            // so that "" is never counted as a word.
            if (word.isEmpty()) {
                continue;
            }
            outKey.set(word);
            context.write(outKey, ONE);
        }
    }
}
/**
 * Word-count reducer: adds up every count received for a word and writes the total.
 *
 * <p>WCDriver registers this class as both the reducer and the combiner.
 */
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Writes {@code (word, total)} where total is the sum of all incoming counts.
     *
     * @param word    the word whose counts are being combined
     * @param value   the counts collected for {@code word}
     * @param context sink that receives the summed pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text word, Iterable<IntWritable> value, Context context)
            throws IOException, InterruptedException {
        IntWritable total = new IntWritable(sumCounts(value));
        context.write(word, total);
    }

    /** Returns the int sum of all the given counts. */
    private static int sumCounts(Iterable<IntWritable> counts) {
        int total = 0;
        for (IntWritable count : counts) {
            total += count.get();
        }
        return total;
    }
}
/**
 * Entry point that configures and submits the word-count job built from
 * {@code WCMapper} and {@code WCReducer}.
 */
public class WCDriver {

    /**
     * Configures the job, runs it, and exits with a status reflecting the outcome.
     *
     * <p>Exit status: 0 when the job reports success, 1 when it reports failure,
     * 2 when the required arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws IOException            propagated from job setup or execution
     * @throws ClassNotFoundException propagated from job execution
     * @throws InterruptedException   if interrupted while waiting for the job
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException
        // when the paths are not supplied.
        if (args.length < 2) {
            System.err.println("Usage: WCDriver <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCDriver.class);

        // Mapper, combiner and reducer; the reducer class doubles as the combiner.
        job.setMapperClass(WCMapper.class);
        job.setCombinerClass(WCReducer.class);
        job.setReducerClass(WCReducer.class);

        // Intermediate and final records are both (Text word, IntWritable count).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setNumReduceTasks(2);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
Python代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
def map():
    """Read lines from stdin and print ``word<TAB>1`` for every word.

    Words are the whitespace-separated tokens of each line; blank lines
    produce no output.
    """
    for line in sys.stdin:
        # split() with no argument already ignores leading/trailing whitespace
        # and collapses runs of whitespace, so no separate strip() is needed.
        for word in line.split():
            print('{}\t{}'.format(word, 1))
# Run the mapper only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    map()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from itertools import groupby
def from_stdin():
    """Yield ``(word, count)`` pairs parsed from tab-separated lines on stdin.

    Both elements are yielded as strings. Blank lines are skipped. A non-blank
    line that does not contain exactly one tab raises ``ValueError`` from the
    tuple unpacking.
    """
    for line in sys.stdin:
        record = line.strip()
        if not record:
            # A blank line carries no record; unpacking it would raise ValueError.
            continue
        word, count = record.split('\t')
        yield (word, count)
def reduce(pairs=None):
    """Print ``word<TAB>total`` for each run of consecutive pairs sharing a word.

    ``pairs`` is an iterable of ``(word, count)`` tuples whose counts are
    convertible with ``int``; when omitted, the pairs are read via
    ``from_stdin()``.

    ``groupby`` only merges *adjacent* equal keys, so the input must already
    be ordered so that identical words are next to each other; otherwise the
    same word is printed more than once with partial totals.
    """
    if pairs is None:
        pairs = from_stdin()
    for word, group in groupby(pairs, key=lambda pair: pair[0]):
        # Generator expression: sum the counts without building a temporary list.
        total = sum(int(count) for _, count in group)
        print('%s\t%s' % (word, total))
# Run the reducer only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    reduce()