Spring Boot integrates Hadoop to implement HDFS functionality

pom.xml:

        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.8.5</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.8.5</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.8.5</version>
        </dependency>

application.properties: 


server.port=8569

# hdfs
hadoop.name-node=hdfs://192.168.4.252:9000
hadoop.namespace=/bestdir

HadoopConfig:

package com.zkaw.hadoop.config;

import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.fs.FileSystem;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import java.net.URI;

@Configuration
@ConditionalOnProperty(name="hadoop.name-node")
@Slf4j
public class HadoopConfig {
    @Value("${hadoop.name-node}")
    private String nameNode;
    /**
     * When a Configuration object is created (new Configuration()), its constructor loads
     * two Hadoop configuration files by default: hdfs-site.xml and core-site.xml. These
     * files hold the parameters needed to access HDFS, most importantly fs.defaultFS
     * (formerly fs.default.name), which gives the HDFS address the client connects to.
     * In short, the Configuration object carries Hadoop's configuration information.
     * @return the HDFS FileSystem client
     */
    @Bean("fileSystem")
    public FileSystem createFs() throws Exception{
        //读取配置文件
        org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
        conf.set("fs.defaultFS", nameNode);
        conf.set("dfs.replication", "1");
        conf.set("dfs.client.use.datanode.hostname", "true");
        FileSystem fs = null;
        // 指定访问hdfs的客户端身份
        fs = FileSystem.get(new URI(nameNode), conf, "root");
        // 文件系统// 返回指定的文件系统,如果在本地测试,需要使用此种方法获取文件系统
        try {
            URI uri = new URI(nameNode.trim());
            fs = FileSystem.get(uri,conf,"root");
        } catch (Exception e) {
            log.error("", e);
        }
        System.out.println("fs.defaultFS: "+conf.get("fs.defaultFS"));
        return  fs;
    }
}
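
As a quick sanity check, the FileSystem bean can be exercised once at startup. The class below is a minimal sketch and not part of the original post: the HdfsStartupCheck name is invented, and it simply assumes the bean above was created (i.e. hadoop.name-node is set) and that the hadoop.namespace directory should exist.

package com.zkaw.hadoop.config;

import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.CommandLineRunner;
import org.springframework.stereotype.Component;

// Hypothetical startup check, not part of the original article
@Component
@Slf4j
public class HdfsStartupCheck implements CommandLineRunner {

    private final FileSystem fileSystem;

    @Value("${hadoop.namespace:/}")
    private String nameSpace;

    public HdfsStartupCheck(FileSystem fileSystem) {
        this.fileSystem = fileSystem;
    }

    @Override
    public void run(String... args) throws Exception {
        Path base = new Path(nameSpace);
        // Create the configured base directory if it does not exist yet
        if (!fileSystem.exists(base)) {
            fileSystem.mkdirs(base);
            log.info("Created HDFS directory {}", base);
        } else {
            log.info("HDFS directory {} already exists", base);
        }
    }
}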

HdfsController:

package com.zkaw.hadoop.controller;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.IOException;

/**
 * @Author: best_liu
 * @Description:
 * @Date Create in 14:20 2023/4/21
 * @Modified By:
 */
@RequestMapping("/hdfs")
@RestController
@Slf4j
public class HdfsController {
    @Value("${hadoop.name-node}")
    private String nameNode;
    @Value("${hadoop.namespace:/}")
    private String nameSpace;

    @Autowired
    private FileSystem fileSystem;


    /**
     * Upload the local file srcFile to HDFS.
     * @param srcFile local file path
     * @return result marker
     */
    @PostMapping("/upload")
    public String upload(String srcFile){
        // Hard-coded local path for testing; in real use rely on the srcFile parameter
        srcFile = "D:\\test.txt";
        uploadFile(srcFile);
        return "upload";
    }

    public void uploadFile(String srcFile){
        this.copyFileToHDFS(false, true, srcFile, nameSpace);
    }

    public void copyFileToHDFS(boolean delSrc, boolean overwrite, String srcFile, String destPath) {
        // The source path is a local path; on Windows use a form such as D://hadoop/djt/weibo.txt
        Path srcPath = new Path(srcFile);
        // Destination path on HDFS
        if(StringUtils.isNotBlank(nameNode)){
            destPath = nameNode + destPath;
        }
        Path dstPath = new Path(destPath);
        // Upload the file
        try {
            fileSystem.copyFromLocalFile(delSrc, overwrite, srcPath, dstPath);
            // Do not close the shared FileSystem bean here; other requests reuse it
//            fileSystem.close();
        } catch (IOException e) {
            log.error("Failed to upload " + srcFile + " to HDFS", e);
        }
    }

    @PostMapping("/delFile")
    public String del(String fileName){
        rmdir(nameSpace,"test.txt") ;
        return "delFile";
    }

    public void rmdir(String path, String fileName) {
        try {
            // Build the full HDFS path
            if(StringUtils.isNotBlank(nameNode)){
                path = nameNode + path;
            }
            if(StringUtils.isNotBlank(fileName)){
                path = path + "/" + fileName;
            }
            // Delete the file or directory; the single-argument delete(Path f) is deprecated,
            // so pass the recursive flag explicitly
            fileSystem.delete(new Path(path), true);
        } catch (IllegalArgumentException | IOException e) {
            log.error("Failed to delete " + path, e);
        }
    }

    @PostMapping("/download")
    public String download(String fileName,String savePath){
        getFile(nameSpace+"/"+"test.txt","D:\\work\\lxjTest\\hadoopmaster");
        return "download";
    }
    /*** 从 HDFS 下载文件
     ** @param hdfsFile
     * @param destPath 文件下载后,存放地址
     */
    public void getFile(String hdfsFile,String destPath) {
        // 源文件路径
        if(StringUtils.isNotBlank(nameNode)){
            hdfsFile = nameNode + hdfsFile;
        }
        Path hdfsPath = new Path(hdfsFile);
        Path dstPath = new Path(destPath);
        try {
            // 下载hdfs上的文件
            fileSystem.copyToLocalFile(hdfsPath, dstPath);
            // 释放资源//
            fileSystem.close();
        } catch (IOException e) {
            log.error("", e);
        }
    }
}
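
The same injected FileSystem bean can back additional endpoints in the same style. The controller below is only an illustration and not part of the original post (the HdfsListController name and the /hdfs/list endpoint are assumptions); it lists the entries directly under the configured namespace directory.

package com.zkaw.hadoop.controller;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

// Hypothetical extra controller, not part of the original article
@RequestMapping("/hdfs")
@RestController
public class HdfsListController {

    @Value("${hadoop.name-node}")
    private String nameNode;
    @Value("${hadoop.namespace:/}")
    private String nameSpace;

    @Autowired
    private FileSystem fileSystem;

    /** List the entries directly under the configured namespace directory. */
    @GetMapping("/list")
    public List<String> list() throws IOException {
        Path dir = new Path(nameNode + nameSpace);
        List<String> names = new ArrayList<>();
        for (FileStatus status : fileSystem.listStatus(dir)) {
            // Mark directories with a trailing slash so they are easy to spot
            names.add(status.getPath().getName() + (status.isDirectory() ? "/" : ""));
        }
        return names;
    }
}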

Problems encountered:

  • At this point you may test with Postman, only to find that the following error is reported:
 File /test/test.txt could only be replicated to 0 nodes instead of minReplication (=1).  There are 1 datanode(s) running and 1 node(s) are excluded in this operation.

But when you go back to the NameNode web UI on port 50070 that you visited earlier, you find that test.txt does appear in the directory, but its size is 0.


Cause Analysis:

We know that the rough flow of a client's access to the HDFS file system is:

1. The client talks to the NameNode over the public IP and port.
2. The NameNode returns the address of a DataNode. Note that this is the internal address of the HDFS cluster!
3. The client connects to the DataNode at that address.

However, we are accessing the virtual machine from our own computer through its public address, so we naturally cannot reach the internal address of HDFS, and no normal transfer connection to the DataNode can ever be established. This is why a directory entry exists on the NameNode while its size is 0, and why the file cannot be downloaded from the web interface.

Therefore, we have to make the NameNode return an address for the DataNode that is reachable from outside, rather than its internal address, so that we can access and connect.
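
To see for yourself which DataNode addresses the NameNode hands back, you can ask the FileSystem for the block locations of a file that was already created. The helper below is only a diagnostic sketch (the BlockLocationProbe name is an assumption, not something from the original post):

import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.util.Arrays;

public class BlockLocationProbe {

    /** Prints the hosts and "host:port" names the NameNode returns for each block of the file. */
    public static void printBlockLocations(FileSystem fs, String file) throws IOException {
        FileStatus status = fs.getFileStatus(new Path(file));
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
        for (BlockLocation block : blocks) {
            // If these resolve to cluster-internal IPs, a client outside the
            // cluster network will not be able to reach the DataNodes
            System.out.println("hosts=" + Arrays.toString(block.getHosts())
                    + " names=" + Arrays.toString(block.getNames()));
        }
    }
}

If the printed names are internal addresses, the client outside the cluster has no route to them, which matches the symptom described above.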

Solution:

Find a way to make the DataNode accessible locally.

1. Add one line of configuration so that the NameNode returns the DataNode's hostname instead of its IP:

conf.set("dfs.client.use.datanode.hostname", "true");

Alternatively, the same setting can be made in hdfs-site.xml:

<property>
	<name>dfs.client.use.datanode.hostname</name>
	<value>true</value>
</property>

2. The client now receives the DataNode's hostname; to resolve and reach it locally, you need to add a mapping to the local hosts file:

Location of the hosts file on Windows: C:\Windows\System32\drivers\etc\hosts
<your virtual machine's public IP>  master
192.168.4.xx master

3. After that, access works normally.


Origin: blog.csdn.net/askuld/article/details/130338770