Hadoop3.x使用CombineTextInputFormat实现小文件合并

携手创作，共同成长！这是我参与「掘金日新计划 · 8 月更文挑战」的第7天，点击查看活动详情

在hadoop中对文件进行切片时，默认使用TextInputFormat（继承自FileInputFormat）的切片机制，即按文件逐个进行切片：不管文件有多小，每个文件都至少会成为一个单独的切片（超过切片大小的大文件还会被进一步切分为多个切片），并且每个切片都会由一个单独的MapTask进行处理。这种切片机制会造成一个问题：如果有大量的小文件，就要分配大量的MapTask，但每个文件的数据量又很小，光是启动MapTask的过程就会消耗大量的资源，这样显然是不合理的。所以针对小文件过多的情况，我们可以尝试用CombineTextInputFormat来进行处理，它可以将众多的小文件从逻辑上划分为较少的切片，这样只需要启动较少的MapTask即可。

我们可以先不指定切片机制，使用它默认的TextInputFormat来测试一下看看有几个切片：

准备5个小文件并上传至HDFS

[root@hadoop301 testdata]# pwd
/usr/local/wyh/software/hadoop-3.1.3/testdata
[root@hadoop301 testdata]# ls -lr
total 20
-rw-r--r--. 1 root root 381 Jul 24 08:39 testcombine5.txt
-rw-r--r--. 1 root root 351 Jul 24 08:38 testcombine4.txt
-rw-r--r--. 1 root root  24 Jul 24 08:37 testcombine3.txt
-rw-r--r--. 1 root root  88 Jul 24 08:37 testcombine2.txt
-rw-r--r--. 1 root root  72 Jul 24 08:36 testcombine1.txt

[root@hadoop301 testdata]# hdfs dfs -mkdir /testcombine
[root@hadoop301 testdata]# hdfs dfs -put testcombine* /testcombine
复制代码

创建project：

引入pom依赖

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the TestCombine MapReduce example (Hadoop 3.1.3, Java 8). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>test.wyh</groupId>
    <artifactId>TestCombine</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <packaging>jar</packaging>
    <dependencies>
        <!-- Hadoop libraries; keep all versions aligned with the cluster version. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.1.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>3.1.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>3.1.3</version>
        </dependency>
        <!--
            Pinned to a concrete version: the "RELEASE" meta-version is deprecated and makes
            builds non-reproducible. Scoped to "test" so JUnit is not bundled into the job jar.
        -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13.2</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- Compile as Java 8 with UTF-8 sources (the sources contain non-ASCII comments). -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <!-- Build a single shaded jar during "package" so it can be submitted with "hadoop jar". -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <!-- Drops classes that are not referenced, to keep the jar small. -->
                            <minimizeJar>true</minimizeJar>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
复制代码

自定义Mapper

package test.wyh.testcombine;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Word-count mapper: splits every input line on commas and emits {@code (word, 1)}
 * for each resulting token.
 *
 * <p>The input key is the {@link LongWritable} supplied by the input format and is
 * not used; the input value is the text of one line.
 */
public class TestCombineMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    /** Constant count of one emitted for every token; only ever read, never modified. */
    private static final LongWritable ONE = new LongWritable(1);

    /**
     * Reusable output key. Reusing a single Writable instead of allocating a new one per
     * token follows the pattern of the official Hadoop WordCount example and avoids
     * creating one short-lived object per emitted record.
     */
    private final Text outputKey = new Text();

    /**
     * Emits one {@code (token, 1)} pair per comma-separated token of the line.
     *
     * @param key     input key provided by the input format (unused)
     * @param value   the text of the current line
     * @param context task context that receives the output pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line into tokens and emit each token as k2 with the fixed v2 of 1.
        for (String word : value.toString().split(",")) {
            outputKey.set(word);
            context.write(outputKey, ONE);
        }
    }
}
复制代码

自定义Reducer

package test.wyh.testcombine;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Word-count reducer: sums all counts received for a word and emits
 * {@code (word, total)}.
 */
public class TestCombineReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    /**
     * Reusable output value. A single instance is updated for each key instead of
     * allocating a new Writable per call, as in the official Hadoop WordCount example.
     */
    private final LongWritable total = new LongWritable();

    /**
     * Adds up every value grouped under {@code key} and writes the sum.
     *
     * @param key     the word (k2); it is written unchanged as the output key (k3)
     * @param values  the counts (v2) collected for this word
     * @param context task context that receives the output pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        // Accumulate the v2 values into the v3 total, starting from zero.
        long count = 0;
        for (LongWritable value : values) {
            count += value.get();
        }
        total.set(count);
        context.write(key, total);
    }

}
复制代码

自定义主类

package test.wyh.testcombine;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class TestCombineJobMain extends Configured implements Tool {

    public int run(String[] strings) throws Exception {
        /**
         *创建job任务对象，参数一为Configuration类型的对象，需要注意的是在同一个job任务中，上下文必须使用同一个Configuration对象，
         * 而下面的main()中已经创建了Configuration对象，所以必须要使用main()中的configuration，而这个对象在下面的run方法中
         * 其实已经被保存在了Configured类中，因为Configured类中有一个私有变量是Configuration对象。所以这里我们就是要想办法拿到Configured类中的configuration，
         * 而当前我们的自定义类WordCountMain又是Configured的子类，所以我们可以通过super对象来调用其父类Configured的configuration对象。
         * 参数二为自定义的job name。
         */
        Job job = Job.getInstance(super.getConf(), "testCombineJob");
        //!!!!!!!!!!    集群必须要设置    !!!!!!!!
        job.setJarByClass(TestCombineJobMain.class);
        //配置job具体要执行的任务步骤
        //指定要读取的文件的路径，这里写了目录，就会将该目录下的所有文件都读取到
        FileInputFormat.setInputPaths(job, new Path("hdfs://hadoop301:8020/testcombine"));
        //指定map处理逻辑类
        job.setMapperClass(TestCombineMapper.class);
        //指定map阶段输出的k2类型
        job.setMapOutputKeyClass(Text.class);
        //指定map阶段输出的v2类型
        job.setMapOutputValueClass(LongWritable.class);
        //指定reduce处理逻辑类
        job.setReducerClass(TestCombineReducer.class);
        //设置reduce之后输出的k3类型
        job.setOutputKeyClass(Text.class);
        //设置reduce之后输出的v3类型
        job.setOutputValueClass(LongWritable.class);
        //指定结果输出路径，该目录必须是不存在的目录（如已存在该目录，则会报错），它会自动帮我们创建
        FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop301:8020/testcombineoutput"));
        //返回执行状态
        boolean status = job.waitForCompletion(true);
        //使用三目运算，将布尔类型的返回值转换为整型返回值，其实这个地方的整型返回值就是返回给了下面main()中的runStatus
        return status ? 0:1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        /**
         * 参数一是一个Configuration对象，参数二是Tool的实现类对象，参数三是一个String类型的数组参数，可以直接使用main()中的参数args.
         * 返回值是一个整型的值，这个值代表了当前这个任务执行的状态.
         * 调用ToolRunner的run方法启动job任务.
         */
        int runStatus = ToolRunner.run(configuration, new TestCombineJobMain(), args);
        /**
         * 任务执行完成后退出，根据上面状态值进行退出，如果任务执行是成功的，那么就是成功退出，如果任务是失败的，就是失败退出
         */
        System.exit(runStatus);

    }


}
复制代码

在这里我们没有指定InputFormat，默认就会按照TextInputFormat来执行，对于我们刚才创建的5个小文件，应该会有5个切片。

打包并上传至服务器

运行jar

[root@hadoop301 testjar]# hadoop jar TestCombine-1.0-SNAPSHOT.jar test.wyh.testcombine.TestCombineJobMain
复制代码

控制台日志：

可以看到日志中显示有5个切片。

接下来我们再在主类中指定InputFormat为CombineTextInputFormat，并设置虚拟存储切片的最大值：

package test.wyh.testcombine;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class TestCombineJobMain extends Configured implements Tool {

    public int run(String[] strings) throws Exception {
        /**
         *创建job任务对象，参数一为Configuration类型的对象，需要注意的是在同一个job任务中，上下文必须使用同一个Configuration对象，
         * 而下面的main()中已经创建了Configuration对象，所以必须要使用main()中的configuration，而这个对象在下面的run方法中
         * 其实已经被保存在了Configured类中，因为Configured类中有一个私有变量是Configuration对象。所以这里我们就是要想办法拿到Configured类中的configuration，
         * 而当前我们的自定义类WordCountMain又是Configured的子类，所以我们可以通过super对象来调用其父类Configured的configuration对象。
         * 参数二为自定义的job name。
         */
        Job job = Job.getInstance(super.getConf(), "testCombineJob");
        //!!!!!!!!!!    集群必须要设置    !!!!!!!!
        job.setJarByClass(TestCombineJobMain.class);
        //设置使用CombineTextInputFormat
        job.setInputFormatClass(CombineTextInputFormat.class);
        //设置虚拟存储切片的最大值，这里设置的是4MB
        CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);
        //配置job具体要执行的任务步骤
        //指定要读取的文件的路径，这里写了目录，就会将该目录下的所有文件都读取到
        FileInputFormat.setInputPaths(job, new Path("hdfs://hadoop301:8020/testcombine"));
        //指定map处理逻辑类
        job.setMapperClass(TestCombineMapper.class);
        //指定map阶段输出的k2类型
        job.setMapOutputKeyClass(Text.class);
        //指定map阶段输出的v2类型
        job.setMapOutputValueClass(LongWritable.class);
        //指定reduce处理逻辑类
        job.setReducerClass(TestCombineReducer.class);
        //设置reduce之后输出的k3类型
        job.setOutputKeyClass(Text.class);
        //设置reduce之后输出的v3类型
        job.setOutputValueClass(LongWritable.class);
        //指定结果输出路径，该目录必须是不存在的目录（如已存在该目录，则会报错），它会自动帮我们创建
        FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop301:8020/testcombineoutput2"));
        //返回执行状态
        boolean status = job.waitForCompletion(true);
        //使用三目运算，将布尔类型的返回值转换为整型返回值，其实这个地方的整型返回值就是返回给了下面main()中的runStatus
        return status ? 0:1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        /**
         * 参数一是一个Configuration对象，参数二是Tool的实现类对象，参数三是一个String类型的数组参数，可以直接使用main()中的参数args.
         * 返回值是一个整型的值，这个值代表了当前这个任务执行的状态.
         * 调用ToolRunner的run方法启动job任务.
         */
        int runStatus = ToolRunner.run(configuration, new TestCombineJobMain(), args);
        /**
         * 任务执行完成后退出，根据上面状态值进行退出，如果任务执行是成功的，那么就是成功退出，如果任务是失败的，就是失败退出
         */
        System.exit(runStatus);

    }


}
复制代码

再次运行jar：

可以看到输入文件依然是5个，但切片数量变为了1。因为5个小文件的总大小没有达到我们设置的虚拟存储切片最大值（这里设置的是4MB），所以它们被合并为1个切片。需要说明的是，并不是所有小文件合并后都只有1个切片：如果合并过程中累计的大小达到了设置的虚拟存储切片最大值，就会开启一个新的切片，最终可能得到多个切片，但数量通常少于原有的文件数。我这里只有1个切片、1个MapTask，是因为这5个文件实在太小，合并后也远没有达到设置的值。

以上的例子就简单测试了CombineTextInputFormat在小文件切片时的应用。

Hadoop3.x使用CombineTextInputFormat实现小文件合并

猜你喜欢