If the records being processed have a more complex structure, it is best to define a custom class to carry the data between the mapper and the reducer. Such a custom class needs to:
- implement the org.apache.hadoop.io.Writable interface;
- provide a no-argument constructor (the framework creates instances via reflection when deserializing);
- implement the interface's write and readFields methods, reading fields back in the same order they were written;
- override toString, since that is what gets written to the output files when the bean is used as an output value.

The following example, which aggregates per-phone network traffic, illustrates the pattern.
First, add the required Hadoop dependencies to pom.xml:
```xml
<dependencies>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.4.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.4.1</version>
    </dependency>
</dependencies>
```
FlowBean:
```java
package com.wange;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class FlowBean implements Writable {

    private long upflow;
    private long dflow;
    private long sumflow;

    // A no-argument constructor is required so the framework can
    // instantiate the bean via reflection during deserialization.
    public FlowBean() {}

    public FlowBean(long upflow, long dflow) {
        this.upflow = upflow;
        this.dflow = dflow;
        this.sumflow = upflow + dflow;
    }

    public long getUpflow() { return upflow; }
    public void setUpflow(long upflow) { this.upflow = upflow; }
    public long getDflow() { return dflow; }
    public void setDflow(long dflow) { this.dflow = dflow; }
    public long getSumflow() { return sumflow; }
    public void setSumflow(long sumflow) { this.sumflow = sumflow; }

    // Serialization: write the fields to be shipped as a byte stream.
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upflow);
        out.writeLong(dflow);
    }

    // Deserialization: restore the fields from the byte stream,
    // reading them in exactly the order they were written.
    @Override
    public void readFields(DataInput in) throws IOException {
        upflow = in.readLong();
        dflow = in.readLong();
        // Recompute the derived total so it stays consistent after deserialization.
        sumflow = upflow + dflow;
    }

    @Override
    public String toString() {
        return upflow + "\t" + dflow + "\t" + sumflow;
    }
}
```
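Before wiring the bean into a job, it can be worth sanity-checking that readFields mirrors write. The following is only a minimal local round-trip sketch, not part of the job; it assumes it sits in the same package as FlowBean:

```java
package com.wange;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

// Local sanity check: serialize a FlowBean to a byte array and read it back,
// confirming that readFields restores the fields written by write.
public class FlowBeanRoundTripCheck {
    public static void main(String[] args) throws Exception {
        FlowBean original = new FlowBean(100L, 200L);

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        FlowBean restored = new FlowBean();
        restored.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(original); // 100	200	300
        System.out.println(restored); // should print the same values
    }
}
```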
FlowCountMapper:
```java
package com.wange;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each input value is one line of the log file; split it on tabs.
        String line = value.toString();
        String[] fields = StringUtils.split(line, "\t");

        // The phone number is the second field; the upstream and downstream
        // traffic are the third- and second-to-last fields respectively.
        String phone = fields[1];
        long upflow = Long.parseLong(fields[fields.length - 3]);
        long dflow = Long.parseLong(fields[fields.length - 2]);

        FlowBean bean = new FlowBean(upflow, dflow);
        context.write(new Text(phone), bean);
    }
}
```
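The mapper assumes each input line is tab-separated, with the phone number in the second field and the upstream/downstream byte counts in the third- and second-to-last fields. The snippet below only illustrates that assumption with a made-up sample line; the real log layout may differ:

```java
import org.apache.commons.lang.StringUtils;

// Stand-alone illustration of the field positions the mapper relies on.
// The sample line is fabricated; only the tab-separated layout matters.
public class FlowLineParseDemo {
    public static void main(String[] args) {
        String line = "1363157985066\t13726230503\t120.196.100.82\twww.example.com\t24\t27\t2481\t24681\t200";
        String[] fields = StringUtils.split(line, "\t");

        String phone = fields[1];                                // second field
        long upflow = Long.parseLong(fields[fields.length - 3]); // third-to-last field
        long dflow = Long.parseLong(fields[fields.length - 2]);  // second-to-last field

        System.out.println(phone + "\t" + upflow + "\t" + dflow); // 13726230503  2481  24681
    }
}
```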
FlowCountReducer:
```java
package com.wange;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context)
            throws IOException, InterruptedException {
        // Sum the upstream and downstream traffic for one phone number.
        long upflowsum = 0;
        long dflowsum = 0;
        for (FlowBean value : values) {
            upflowsum += value.getUpflow();
            dflowsum += value.getDflow();
        }
        FlowBean bean = new FlowBean(upflowsum, dflowsum);
        context.write(key, bean);
    }
}
```
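One subtlety worth noting: the framework reuses the same FlowBean instance for every element of the `values` iterable, which is why the reducer above copies the primitive sums out instead of holding on to the beans. By contrast, a sketch of the pitfall (a fragment that would sit inside a reduce method; List and ArrayList are from java.util):

```java
// Anti-pattern sketch: every element of 'kept' ends up pointing at the same
// reused FlowBean object, so the list effectively holds only the last record.
List<FlowBean> kept = new ArrayList<FlowBean>();
for (FlowBean value : values) {
    kept.add(value);                                                // wrong: reference to a reused object
    // kept.add(new FlowBean(value.getUpflow(), value.getDflow())); // right: copy the fields out
}
```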
FlowCountSubmitter:
```java
package com.wange;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowCountSubmitter {

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(FlowCountSubmitter.class);

        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // args[0]: input directory, args[1]: output directory (must not exist yet)
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Block until the job finishes and exit with a non-zero code on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```
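One practical note: FileOutputFormat fails fast if the output directory already exists. If the job may be rerun, a small guard placed in main before setOutputPath can remove the stale directory. A sketch (FileSystem is org.apache.hadoop.fs.FileSystem):

```java
// Optional guard, placed before FileOutputFormat.setOutputPath(...):
// delete a leftover output directory so a rerun does not fail immediately.
FileSystem fs = FileSystem.get(job.getConfiguration());
Path outputPath = new Path(args[1]);
if (fs.exists(outputPath)) {
    fs.delete(outputPath, true); // true = recursive delete
}
```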
After that, package the project into a jar (e.g. with `mvn clean package`) and upload it to server 00. On server 00, create the input directory on HDFS and upload the data file into it. Do not create the output directory in advance: FileOutputFormat requires that it not exist and creates it when the job runs.

```
hadoop fs -mkdir -p /flow/srcdata
hadoop fs -put <data-file> /flow/srcdata
```
Run the job:

```
hadoop jar hadoop-mapreduce-customer-1.0.jar com.wange.FlowCountSubmitter /flow/srcdata /flow/output
```

The arguments are the jar, the fully qualified class containing the main method, the directory holding the files to analyze, and the directory for the analysis results. Once the job finishes, the results are ready to inspect.
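The reducer output normally lands in files named part-r-NNNNN inside the output directory, so the per-phone totals can be inspected with:

```
hadoop fs -ls /flow/output
hadoop fs -cat /flow/output/part-r-00000
```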
The YARN web UI is at http://hadoop-server-00:8088/cluster, and the HDFS directory browser is at http://hadoop-server-00:50070.