统计数据集中每个单词出现的总次数。为避免大小写差异导致的统计错误,先将所有单词统一转为小写,并用正则表达式去除单词两侧的标点符号。
map阶段
import re
import sys

# Runs of word characters; raw string avoids accidental escape processing.
WORD_RE = re.compile(r'\w+')


def map_line(line):
    """Tokenize one input line for the word-count mapper.

    Splits *line* on whitespace, skips tokens shorter than 2 characters,
    keeps the first run of word characters of each remaining token
    (stripping punctuation), lower-cases it, and returns a list of
    (word, 1) pairs.

    Fix over the original: a token with no word characters at all
    (e.g. "--" or "!!") made `findall(...)[0]` raise IndexError;
    such tokens are now skipped.
    """
    pairs = []
    for token in line.strip().split():
        if len(token) < 2:
            continue
        matches = WORD_RE.findall(token)
        if not matches:  # pure punctuation — nothing to count
            continue
        pairs.append((matches[0].lower(), 1))
    return pairs


def main():
    # Hadoop-streaming mapper: read lines from stdin, emit "word\t1" pairs.
    for line in sys.stdin:
        for word, count in map_line(line):
            print('%s\t%d' % (word, count))


if __name__ == '__main__':
    main()
reduce阶段
import sys


def reduce_stream(lines):
    """Sum per-word counts from a sorted stream of "word\\tcount" lines.

    *lines* must be grouped by word (the usual Hadoop-streaming sort
    guarantee). Yields (word, total_count) tuples, one per distinct word.

    Fix over the original: on empty input the trailing emit printed
    "None\\t0"; an empty stream now yields nothing.
    """
    cur_word = None   # word currently being accumulated
    total = 0         # running count for cur_word
    for line in lines:
        word, count = line.strip().split('\t')
        if cur_word is None:
            cur_word = word
        if word != cur_word:
            # Word boundary reached: flush the finished group.
            yield cur_word, total
            cur_word = word
            total = 0
        total += int(count)
    if cur_word is not None:  # flush the last group (skip if input was empty)
        yield cur_word, total


def main():
    # Hadoop-streaming reducer: read mapper output from stdin, emit totals.
    for word, total in reduce_stream(sys.stdin):
        print('%s\t%d' % (word, total))


if __name__ == '__main__':
    main()
本地调试
# Local smoke test: feed the mapper directly, simulate the shuffle with
# sort, then pipe into the reducer.
python map.py < text.txt | sort -k1 | python red.py
提交集群运行
# Submit the word-count job via Hadoop Streaming.
# Fixes over the original:
#   - shell variable assignments must not have spaces around '=' (the
#     original "HADOOP_CMD = ..." would try to run a command named
#     HADOOP_CMD and leave the variable unset);
#   - "fs - rmr" had a stray space (must be "fs -rmr");
#   - the mapper/reducer scripts are shipped to the cluster with -file,
#     otherwise task nodes cannot find map.py / red.py.
HADOOP_CMD="/usr/local/src/hadoop-2.6.1/bin/hadoop"
STREAM_JAR_PATH="/usr/local/src/hadoop-2.6.1/share/hadoop/tools/lib/hadoop-streaming-2.6.1.jar"
INPUT_FILE_PATH="/test.txt"
OUTPUT_FILE_PATH="/output_wordcount"

# Remove any previous output directory (the job fails if it exists).
$HADOOP_CMD fs -rmr -skipTrash $OUTPUT_FILE_PATH

# Step 1. run the streaming word-count job.
$HADOOP_CMD jar $STREAM_JAR_PATH \
    -input $INPUT_FILE_PATH \
    -output $OUTPUT_FILE_PATH \
    -mapper "python map.py" \
    -reducer "python red.py" \
    -file ./map.py \
    -file ./red.py