I. Environment configuration
The four files below (under $HADOOP_HOME/etc/hadoop) configure Hadoop for pseudo-distributed operation.
1. hdfs-site.xml
<configuration>
<property>
<!-- a single DataNode, so keep one replica per block -->
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<!-- expose the WebHDFS REST API -->
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<property>
<!-- NameNode web UI; pinned to the Hadoop 2 port (Hadoop 3 defaults to 9870) -->
<name>dfs.namenode.http-address</name>
<value>localhost:50070</value>
</property>
</configuration>
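With dfs.webhdfs.enabled turned on, the NameNode's REST API gives a quick sanity check once HDFS is actually running (see section II); a sample call, assuming the /user directory already exists:
curl "http://localhost:50070/webhdfs/v1/user?op=LISTSTATUS"
This returns a JSON listing of /user.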
2. yarn-site.xml
<configuration>
<property>
<!-- enable the shuffle service that MapReduce needs -->
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>localhost</value>
</property>
<property>
<!-- raise the disk-usage threshold so a nearly full local disk does not mark the NodeManager unhealthy -->
<name>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</name>
<value>98.0</value>
</property>
<property>
<!-- environment variables that containers are allowed to inherit -->
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
</configuration>
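Once YARN is up (section II), the ResourceManager serves its web UI on the hostname configured above, port 8088 by default; its REST API makes for a quick health check:
curl http://localhost:8088/ws/v1/cluster/info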
3. mapred-site.xml
<configuration>
<property>
<!-- run MapReduce jobs on YARN instead of the default local runner -->
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
4. core-site.xml
<configuration>
<property>
<!-- the default filesystem; with no port given, the NameNode RPC default (8020) is used -->
<name>fs.defaultFS</name>
<value>hdfs://localhost</value>
</property>
<property>
<!-- keep Hadoop's working data out of /tmp so it survives reboots -->
<name>hadoop.tmp.dir</name>
<value>/Users/lhy/hadoop/tmp</value>
<description>A base for other temporary directories.</description>
</property>
</configuration>
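To confirm the configuration is being picked up (no daemons need to be running yet):
hdfs getconf -confKey fs.defaultFS
This should print hdfs://localhost.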
II. Starting the processes a MapReduce job needs
1. Before using HDFS for the first time, format the NameNode:
bin/hdfs namenode -format
2. Start HDFS
sbin/start-dfs.sh
3. Start YARN
sbin/start-yarn.sh
4. Start the MapReduce job history daemon (the script below is deprecated in Hadoop 3 in favor of mapred --daemon start historyserver, but it still works)
sbin/mr-jobhistory-daemon.sh start historyserver
5. Create a home directory for your user
hadoop fs -mkdir -p /user/$USER
6. Check that everything came up
jps
ResourceManager, SecondaryNameNode, NameNode, DataNode, and NodeManager (plus JobHistoryServer if you started it) must all be listed for the startup to count as successful; otherwise, check the logs under $HADOOP_HOME/logs to diagnose the problem.
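A healthy pseudo-distributed setup looks roughly like this (the PIDs are illustrative):
12001 NameNode
12098 DataNode
12187 SecondaryNameNode
12254 ResourceManager
12321 NodeManager
12400 JobHistoryServer
12466 Jps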
III. Developing the MapReduce application
What follows is a MapReduce application that computes the maximum temperature recorded in a year from US NCDC weather data.
1. Add the dependencies
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.0</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<!-- MRUnit, for unit-testing mappers and reducers -->
<dependency>
<groupId>org.apache.mrunit</groupId>
<artifactId>mrunit</artifactId>
<version>1.1.0</version>
<classifier>hadoop2</classifier>
<scope>test</scope>
</dependency>
</dependencies>
2. Write the mapper
Every mapper extends the Mapper class, parameterized by its input and output key/value types.
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
// NCDC encodes a missing reading as 9999
private final static int MISSING_TEMPERATURE = 9999;
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
// NCDC records are fixed-width: the year sits at columns 15-19
String year = line.substring(15, 19);
// the air temperature (tenths of a degree Celsius) sits at columns 87-92, with a leading sign
int airTemperature;
if (line.charAt(87) == '+') {
// skip the explicit plus sign
airTemperature = Integer.parseInt(line.substring(88, 92));
} else {
airTemperature = Integer.parseInt(line.substring(87, 92));
}
// column 92 is the quality code; [01459] marks readings that are not suspect or erroneous
String quality = line.substring(92, 93);
if (airTemperature != MISSING_TEMPERATURE && quality.matches("[01459]")) {
context.write(new Text(year), new IntWritable(airTemperature));
}
}
}
3. Unit-test the mapper
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;
import com.yyh.mapper.MaxTemperatureMapper;
import java.io.IOException;
public class MaxTemperatureMapperTest {
@Test
public void processesValidRecord() throws IOException, InterruptedException {
Text value = new Text("0057332130999991950010103004+51317+028783FM-12+017199999V0203201N00721004501CN0100001N9-01281-01391102681");
new MapDriver<LongWritable, Text, Text, IntWritable>()
.withMapper(new MaxTemperatureMapper())
.withInput(new LongWritable(0), value)
.withOutput(new Text("1950"), new IntWritable(-128))
.runTest();
}
}
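The test can be run on its own through Maven's surefire test filter:
mvn test -Dtest=MaxTemperatureMapperTest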
4. Write the reducer
Note that the reducer's input key/value types must match the mapper's output key/value types.
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class MaxTemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
// track the running maximum across every reading for this year
int maxValue = Integer.MIN_VALUE;
for (IntWritable value : values) {
maxValue = Math.max(maxValue, value.get());
}
context.write(key, new IntWritable(maxValue));
}
}
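The reducer can be verified the same way with MRUnit's ReduceDriver; a minimal sketch along the lines of the mapper test above (the test class name is my own):
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;
import com.yyh.mapper.MaxTemperatureReducer;
import java.io.IOException;
import java.util.Arrays;
public class MaxTemperatureReducerTest {
@Test
public void returnsMaximumIntegerInValues() throws IOException {
new ReduceDriver<Text, IntWritable, Text, IntWritable>()
.withReducer(new MaxTemperatureReducer())
// two readings for 1950; the larger one should win
.withInput(new Text("1950"), Arrays.asList(new IntWritable(10), new IntWritable(5)))
.withOutput(new Text("1950"), new IntWritable(10))
.runTest();
}
}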
5. Configure the job
import com.yyh.mapper.MaxTemperatureMapper;
import com.yyh.mapper.MaxTemperatureReducer;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MaxTemperatureDriver extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
// the default is the local filesystem; point the job at the pseudo-distributed HDFS instead
getConf().set("fs.defaultFS", "hdfs://localhost");
if (args.length != 2) {
System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
Job job = Job.getInstance(getConf(), "Max temperature");
job.setJarByClass(getClass());
// MapReduce refuses to write into an existing output directory, so clear it first
FileSystem fileSystem = FileSystem.get(getConf());
Path outPath = new Path(args[1]);
if (fileSystem.exists(outPath)) {
fileSystem.delete(outPath, true);
}
// add the input path and set the output path
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MaxTemperatureMapper.class);
// max is associative and commutative, so the reducer can double as a combiner
job.setCombinerClass(MaxTemperatureReducer.class);
job.setReducerClass(MaxTemperatureReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
// for convenience, fall back to hardcoded paths; ncdc/test/1902 is already in HDFS
if (args.length == 0) {
args = new String[]{"ncdc/test/1902", "ncdc/test/output"};
}
int exitCode = ToolRunner.run(new MaxTemperatureDriver(), args);
System.exit(exitCode);
}
}
6. Package the application
Note: so the job can be run from a single jar, the dependencies and the main-class entry point are packaged in; the Maven plugins below take care of that.
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<mainClass>com.yyh.job.MaxTemperatureDriver</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
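Building is then just:
mvn clean package
With appendAssemblyId set to false, the assembly replaces the default jar, so target/ ends up with a single runnable MaxTemperatureJob-1.0-SNAPSHOT.jar (the jar name used in the next section).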
IV. Running the job on the pseudo-distributed cluster
1. Submit the job
hadoop jar MaxTemperatureJob-1.0-SNAPSHOT.jar com.yyh.job.MaxTemperatureDriver
The third token is the application jar and the fourth is the main class (since the manifest already names the main class, the class argument could be omitted); further arguments can be appended as needed.
(Screenshot of the console output omitted.)
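Because the driver runs through ToolRunner, Hadoop's generic options are accepted as well; for example, forcing a single reduce task (the paths are the defaults hardcoded in main):
hadoop jar MaxTemperatureJob-1.0-SNAPSHOT.jar com.yyh.job.MaxTemperatureDriver -D mapreduce.job.reduces=1 ncdc/test/1902 ncdc/test/output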
The jobs that have been run can then be inspected in the YARN ResourceManager web UI.
Applications there move through states such as NEW, NEW_SAVING, SUBMITTED, ACCEPTED, RUNNING, and FINISHED, and the UI has plenty more features worth exploring.
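The same information is available from the command line:
yarn application -list -appStates ALL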
V. Viewing the job results in HDFS
The files the job produced can be browsed in the HDFS web UI (screenshot omitted).
Running several such jobs on the cluster, one per year of weather data, produces a matching set of output files.
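The result can also be read straight from the command line; with a single reducer the output lands in one part file:
hadoop fs -cat ncdc/test/output/part-r-00000
Each line is a year and its maximum temperature in tenths of a degree Celsius, separated by a tab.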
P.S. The web UIs are excellent tools: all of the configuration, logs, and job history can be viewed there, and the filesystem can even be manipulated through the HDFS UI.