I. Environment configuration
The four files below (under $HADOOP_HOME/etc/hadoop) configure Hadoop for pseudo-distributed operation.
1. hdfs-site.xml
<configuration>
<property>
<!-- a single DataNode, so keep one replica per block -->
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<!-- expose the WebHDFS REST API -->
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<property>
<!-- NameNode web UI; pinned to the Hadoop 2 port (Hadoop 3 defaults to 9870) -->
<name>dfs.namenode.http-address</name>
<value>localhost:50070</value>
</property>
</configuration>
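With dfs.webhdfs.enabled turned on, the NameNode's REST API gives a quick sanity check once HDFS is actually running (see section II); a sample call, assuming the /user directory already exists:
curl "http://localhost:50070/webhdfs/v1/user?op=LISTSTATUS"
This returns a JSON listing of /user.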
2. yarn-site.xml
<configuration>
<property>
<!-- enable the shuffle service that MapReduce needs -->
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>localhost</value>
</property>
<property>
<!-- raise the disk-usage threshold so a nearly full local disk does not mark the NodeManager unhealthy -->
<name>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</name>
<value>98.0</value>
</property>
<property>
<!-- environment variables that containers are allowed to inherit -->
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
</configuration>
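Once YARN is up (section II), the ResourceManager serves its web UI on the hostname configured above, port 8088 by default; its REST API makes for a quick health check:
curl http://localhost:8088/ws/v1/cluster/info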
3. mapred-site.xml
<configuration>
<property>
<!-- run MapReduce jobs on YARN instead of the default local runner -->
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
4. core-site.xml
<configuration>
<property>
<!-- the default filesystem; with no port given, the NameNode RPC default (8020) is used -->
<name>fs.defaultFS</name>
<value>hdfs://localhost</value>
</property>
<property>
<!-- keep Hadoop's working data out of /tmp so it survives reboots -->
<name>hadoop.tmp.dir</name>
<value>/Users/lhy/hadoop/tmp</value>
<description>A base for other temporary directories.</description>
</property>
</configuration>
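To confirm the configuration is being picked up (no daemons need to be running yet):
hdfs getconf -confKey fs.defaultFS
This should print hdfs://localhost.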
II. Starting the processes a MapReduce job needs
1. Before using HDFS for the first time, format the NameNode:
bin/hdfs namenode -format
2. Start HDFS
sbin/start-dfs.sh
3. Start YARN
sbin/start-yarn.sh
4. Start the MapReduce job history daemon (the script below is deprecated in Hadoop 3 in favor of mapred --daemon start historyserver, but it still works)
sbin/mr-jobhistory-daemon.sh start historyserver
5. Create a home directory for your user
hadoop fs -mkdir -p /user/$USER
6. Check that everything came up
jps
ResourceManager, SecondaryNameNode, NameNode, DataNode, and NodeManager (plus JobHistoryServer if you started it) must all be listed for the startup to count as successful; otherwise, check the logs under $HADOOP_HOME/logs to diagnose the problem.
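A healthy pseudo-distributed setup looks roughly like this (the PIDs are illustrative):
12001 NameNode
12098 DataNode
12187 SecondaryNameNode
12254 ResourceManager
12321 NodeManager
12400 JobHistoryServer
12466 Jps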
III. Developing the MapReduce application
What follows is a MapReduce application that computes the maximum temperature recorded in a year from US NCDC weather data.
1. Add the dependencies
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.0</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<!-- MRUnit, for unit-testing mappers and reducers -->
<dependency>
<groupId>org.apache.mrunit</groupId>
<artifactId>mrunit</artifactId>
<version>1.1.0</version>
<classifier>hadoop2</classifier>
<scope>test</scope>
</dependency>
</dependencies>
2. Write the mapper
Every mapper extends the Mapper class, parameterized by its input and output key/value types.
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
// NCDC encodes a missing reading as 9999
private final static int MISSING_TEMPERATURE = 9999;
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
// NCDC records are fixed-width: the year sits at columns 15-19
String year = line.substring(15, 19);
// the air temperature (tenths of a degree Celsius) sits at columns 87-92, with a leading sign
int airTemperature;
if (line.charAt(87) == '+') {
// skip the explicit plus sign
airTemperature = Integer.parseInt(line.substring(88, 92));
} else {
airTemperature = Integer.parseInt(line.substring(87, 92));
}
// column 92 is the quality code; [01459] marks readings that are not suspect or erroneous
String quality = line.substring(92, 93);
if (airTemperature != MISSING_TEMPERATURE && quality.matches("[01459]")) {
context.write(new Text(year), new IntWritable(airTemperature));
}
}
}
3. Unit-test the mapper
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;
import com.yyh.mapper.MaxTemperatureMapper;
import java.io.IOException;
public class MaxTemperatureMapperTest {
@Test
public void processesValidRecord() throws IOException, InterruptedException {
Text value = new Text("0057332130999991950010103004+51317+028783FM-12+017199999V0203201N00721004501CN0100001N9-01281-01391102681");
new MapDriver<LongWritable, Text, Text, IntWritable>()
.withMapper(new MaxTemperatureMapper())
.withInput(new LongWritable(0), value)
.withOutput(new Text("1950"), new IntWritable(-128))
.runTest();
}
}
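The test can be run on its own through Maven's surefire test filter:
mvn test -Dtest=MaxTemperatureMapperTest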
4. Write the reducer
Note that the reducer's input key/value types must match the mapper's output key/value types.
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class MaxTemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
// track the running maximum across every reading for this year
int maxValue = Integer.MIN_VALUE;
for (IntWritable value : values) {
maxValue = Math.max(maxValue, value.get());
}
context.write(key, new IntWritable(maxValue));
}
}
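The reducer can be verified the same way with MRUnit's ReduceDriver; a minimal sketch along the lines of the mapper test above (the test class name is my own):
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;
import com.yyh.mapper.MaxTemperatureReducer;
import java.io.IOException;
import java.util.Arrays;
public class MaxTemperatureReducerTest {
@Test
public void returnsMaximumIntegerInValues() throws IOException {
new ReduceDriver<Text, IntWritable, Text, IntWritable>()
.withReducer(new MaxTemperatureReducer())
// two readings for 1950; the larger one should win
.withInput(new Text("1950"), Arrays.asList(new IntWritable(10), new IntWritable(5)))
.withOutput(new Text("1950"), new IntWritable(10))
.runTest();
}
}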
5. Configure the job
import com.yyh.mapper.MaxTemperatureMapper;
import com.yyh.mapper.MaxTemperatureReducer;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MaxTemperatureDriver extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
// the default is the local filesystem; point the job at the pseudo-distributed HDFS instead
getConf().set("fs.defaultFS", "hdfs://localhost");
if (args.length != 2) {
System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
Job job = Job.getInstance(getConf(), "Max temperature");
job.setJarByClass(getClass());
// MapReduce refuses to write into an existing output directory, so clear it first
FileSystem fileSystem = FileSystem.get(getConf());
Path outPath = new Path(args[1]);
if (fileSystem.exists(outPath)) {
fileSystem.delete(outPath, true);
}
// add the input path and set the output path
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(MaxTemperatureMapper.class);
// max is associative and commutative, so the reducer can double as a combiner
job.setCombinerClass(MaxTemperatureReducer.class);
job.setReducerClass(MaxTemperatureReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
return job.waitForCompletion(true) ? 0 : 1;
}
public static void main(String[] args) throws Exception {
// for convenience, fall back to hardcoded paths; ncdc/test/1902 is already in HDFS
if (args.length == 0) {
args = new String[]{"ncdc/test/1902", "ncdc/test/output"};
}
int exitCode = ToolRunner.run(new MaxTemperatureDriver(), args);
System.exit(exitCode);
}
}
6. Package the application
Note: so the job can be run from a single jar, the dependencies and the main-class entry point are packaged in; the Maven plugins below take care of that.
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<mainClass>com.yyh.job.MaxTemperatureDriver</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
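Building is then just:
mvn clean package
With appendAssemblyId set to false, the assembly replaces the default jar, so target/ ends up with a single runnable MaxTemperatureJob-1.0-SNAPSHOT.jar (the jar name used in the next section).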
IV. Running the job on the pseudo-distributed cluster
1. Submit the job
hadoop jar MaxTemperatureJob-1.0-SNAPSHOT.jar com.yyh.job.MaxTemperatureDriver
The third token is the application jar and the fourth is the main class (since the manifest already names the main class, the class argument could be omitted); further arguments can be appended as needed.
(Screenshot of the console output omitted.)
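Because the driver runs through ToolRunner, Hadoop's generic options are accepted as well; for example, forcing a single reduce task (the paths are the defaults hardcoded in main):
hadoop jar MaxTemperatureJob-1.0-SNAPSHOT.jar com.yyh.job.MaxTemperatureDriver -D mapreduce.job.reduces=1 ncdc/test/1902 ncdc/test/output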
The jobs that have been run can then be inspected in the YARN ResourceManager web UI.
Applications there move through states such as NEW, NEW_SAVING, SUBMITTED, ACCEPTED, RUNNING, and FINISHED, and the UI has plenty more features worth exploring.
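The same information is available from the command line:
yarn application -list -appStates ALL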
V. Viewing the job results in HDFS
The files the job produced can be browsed in the HDFS web UI (screenshot omitted).
Running several such jobs on the cluster, one per year of weather data, produces a matching set of output files.
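The result can also be read straight from the command line; with a single reducer the output lands in one part file:
hadoop fs -cat ncdc/test/output/part-r-00000
Each line is a year and its maximum temperature in tenths of a degree Celsius, separated by a tab.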
P.S. The web UIs are excellent tools: all of the configuration, logs, and job history can be viewed there, and the filesystem can even be manipulated through the HDFS UI.