MapReduce code example -- map-side join

package join;

import java.io.BufferedReader;
import java.io.FileReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Map-side join: the small table (user) is loaded into memory on each mapper.
 *
 * Input files (fields are tab-separated):
 *
 * user
 * 1 zhangshan
 * 2 lisi
 * 3 wangwu
 * 4 zhaoliu
 *
 * log
 * 1 login
 * 2 login
 * 1 login
 * 3 login
 *
 * Expected output (login count per user):
 * zhangshan 2
 * lisi 1
 * wangwu 1
 * zhaoliu 0
 *
 * @author Administrator
 *
 */
public class MapJoinApp {

    private static String INPUT_PATH = "hdfs://hadoop:9000/in/reducejoin";
    private static String CACHE_FILE = "hdfs://hadoop:9000/in/reducejoin/user";
    private static String OUT_PATH = "hdfs://hadoop:9000/out";

    /**
     * @param args
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        final FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
        fileSystem.delete(new Path(OUT_PATH), true);

        Job job = new Job(conf);
        job.setJarByClass(MapJoinApp.class);

        // The cache file must be registered on job.getConfiguration(), not on the
        // conf instance created above, or the job will not see it.
        DistributedCache.addCacheFile(new URI(CACHE_FILE), job.getConfiguration());
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(MyMapper.class);
        // Map-only job: the join is completed on the map side, so no reducers are needed.
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
        job.waitForCompletion(true);

        // Print the contents of the first map output file.
        final FSDataInputStream in = fileSystem.open(new Path(OUT_PATH + "/part-m-00000"));
        IOUtils.copyBytes(in, System.out, 1024, true);
    }

    public static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {

        Text k2 = new Text();
        Text v2 = new Text();
        // userId -> userName, loaded from the cached "user" file
        Map<String, String> userMap = new HashMap<String, String>();
        // userId -> login count, accumulated from the "log" records
        Map<String, Integer> logMap = new HashMap<String, Integer>();

        @Override
        protected void setup(Context context) throws java.io.IOException, InterruptedException {
            // Load the small "user" table from the distributed cache into memory.
            Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            for (Path path : localCacheFiles) {
                System.out.println(path.toString());
                if (path.toString().endsWith("user")) {
                    BufferedReader br = new BufferedReader(new FileReader(path.toString()));
                    String line = "";
                    while ((line = br.readLine()) != null) {
                        String[] split = line.split("\t");
                        userMap.put(split[0], split[1]);
                    }
                    br.close();
                }
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws java.io.IOException, InterruptedException {
            String[] splited = value.toString().split("\t");
            // Only count records coming from the "log" file; the "user" table is already in memory.
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String name = inputSplit.getPath().getName();
            System.out.println(name);
            if (name != null && name.contains("log")) {
                Integer times = logMap.get(splited[0]);
                if (times != null) {
                    logMap.put(splited[0], times + 1);
                } else {
                    logMap.put(splited[0], 1);
                }
            }
        }

        @Override
        protected void cleanup(Context context) throws java.io.IOException, InterruptedException {
            // Join in memory: emit <userName, loginCount>, defaulting to 0 for users with no log records.
            for (String id : userMap.keySet()) {
                Integer times = logMap.get(id);
                if (times == null) {
                    times = 0;
                }
                String name = userMap.get(id);
                k2.set(name);
                v2.set(times.toString());

                context.write(k2, v2);
            }
        }
    }
}
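
DistributedCache and the new Job(conf) constructor used above are deprecated in Hadoop 2.x. The sketch below shows the equivalent wiring with the newer mapreduce API (Job.getInstance, Job.addCacheFile, Context.getCacheFiles). It is only a sketch, not a drop-in listing: it reuses MapJoinApp's CACHE_FILE constant and MyMapper's userMap field from the code above, needs one extra import (java.io.InputStreamReader), and for simplicity reads the cached file straight from HDFS instead of the task-local copy.

// Driver side: replaces "new Job(conf)" and the DistributedCache.addCacheFile(...) call above.
Job job = Job.getInstance(conf);
job.setJarByClass(MapJoinApp.class);
job.addCacheFile(new URI(CACHE_FILE));

// Mapper side: replaces MyMapper.setup() above (userMap is the same in-memory table).
@Override
protected void setup(Context context) throws java.io.IOException, InterruptedException {
    for (URI uri : context.getCacheFiles()) {
        if (uri.getPath().endsWith("user")) {
            // For simplicity the cached file is opened directly from HDFS here;
            // the localized copy in the task working directory could be used instead.
            FileSystem fs = FileSystem.get(uri, context.getConfiguration());
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(uri))));
            String line;
            while ((line = br.readLine()) != null) {
                String[] fields = line.split("\t");
                userMap.put(fields[0], fields[1]);
            }
            br.close();
        }
    }
}

Whichever API is used, the design is the same: because the user table fits in memory, every mapper can complete the join locally, so the job needs no reduce phase and no shuffle.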

Reposted from jsh0401.iteye.com/blog/2111920