彷徨 | Word-Frequency Counting of Files on HDFS

Copyright notice: all rights reserved. Please credit the source when reposting. Thanks. https://blog.csdn.net/weixin_35353187/article/details/81913766

Word-frequency counting over files stored in HDFS can be done in two ways: a simple single-machine version, or a distributed version that follows the MapReduce principle.

Approach 1: the simple single-machine version

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Single-machine word count over a file stored in HDFS.
 * @author Administrator
 *
 */

public class SingleWC_zhang {
	public static void main(String[] args) throws IOException, InterruptedException, URISyntaxException {
		Map<String, Integer> map = new HashMap<>();
		
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(new URI("hdfs://hadoop01:9000"), conf, "root");
		
		FSDataInputStream inputStream = fs.open(new Path("/wc.txt"));
		BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
		String line = null;
		while((line = reader.readLine())!=null){
			//System.out.println(line);
			String[] split = line.split(" ");
			for (String word : split) {
				Integer count = map.getOrDefault(word, 0);
				count++;
				map.put(word, count);
			}
			
		}
		FSDataOutputStream create = fs.create(new Path("/part-r-000001"));
		Set<Entry<String,Integer>> entrySet = map.entrySet();
		for (Entry<String, Integer> entry : entrySet) {
			create.write((entry.getKey() + "=" +entry.getValue()+"\r\n").getBytes());
			//System.out.println(entry);
		}
		create.close();
		reader.close();
		fs.close();
		System.out.println("Job finished");
	}
}
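
To check the result, the counts in /part-r-000001 can be printed straight to the console. The small sketch below is not part of the original post (the class name PrintResult_zhang is made up); it reuses the same cluster address hdfs://hadoop01:9000 and user root as the code above, and Hadoop's IOUtils to copy the file to standard output.

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class PrintResult_zhang {
	public static void main(String[] args) throws Exception {
		FileSystem fs = FileSystem.get(new URI("hdfs://hadoop01:9000"), new Configuration(), "root");
		FSDataInputStream in = fs.open(new Path("/part-r-000001"));
		// copy the whole file to stdout; "false" means the streams are not closed automatically
		IOUtils.copyBytes(in, System.out, 4096, false);
		in.close();
		fs.close();
	}
}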

Approach 2: following the MapReduce principle

Map

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MapTask_zhang {
	public static void main(String[] args) throws Exception {
		/**
		 * taskId       identifies which task (machine) this is
		 * file         the file whose words are counted
		 * startOffSet  the byte offset at which this task starts reading
		 * length       how many bytes this task is responsible for
		 */
		int taskId = Integer.parseInt(args[0]);
		String file = args[1];
		long startOffSet = Long.parseLong(args[2]);
		long length = Long.parseLong(args[3]);
		
		FileSystem fs = FileSystem.get(new URI("hdfs://hadoop01:9000"), new Configuration(), "root");
		FSDataInputStream inputStream = fs.open(new Path(file));
		
		
		// intermediate output files, one per reduce partition
		FSDataOutputStream out_tmp_1 = fs.create(new Path("/wordcount/tmp/part-m"+taskId+"-1"));
		FSDataOutputStream out_tmp_2 = fs.create(new Path("/wordcount/tmp/part-m"+taskId+"-2"));
		
		// seek to the start byte of this task's split
		inputStream.seek(startOffSet);
		BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
		
		// only the task with taskId 1 reads its first line; every other task skips one line,
		// since the previous task reads one extra line past its own boundary
		if(taskId != 1) {
			br.readLine();
		}
		
		long count = 0 ;
		String line = null;
		while((line = br.readLine()) != null) {
			String[] split = line.split(" ");
			for (String word : split) {
				// the same string always has the same hashCode, so every occurrence of a word
				// lands in the same file: even hash -> file 1, odd hash -> file 2
				if (word.hashCode()%2 == 0) {
					out_tmp_1.write((word+"\t"+1+"\n").getBytes());
				} else {
					out_tmp_2.write((word+"\t"+1+"\n").getBytes());
				}
			}
			// accumulate the length of each line (+1 for the newline); process one extra
			// line past the split boundary so the line that straddles it is not lost
			count += line.length()+1;
			if (count > length) {
				break;
			}
		}
		br.close();
		out_tmp_1.close();
		out_tmp_2.close();
		fs.close();

		
	}
}
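
The map task spreads words over two intermediate files by the parity of word.hashCode(), so every occurrence of a given word ends up in the same file and each reduce task only has to aggregate its own partition. A minimal sketch of the same idea generalized to N reduce tasks is shown below; it is an illustration rather than part of the original code (choosePartition is a made-up helper), and it masks off the sign bit so negative hash codes still map to a valid partition, the same trick Hadoop's HashPartitioner uses.

public class Partition_zhang {
	// map a word to a partition in [0, numReduceTasks); masking with Integer.MAX_VALUE
	// keeps the index non-negative even when hashCode() is negative
	static int choosePartition(String word, int numReduceTasks) {
		return (word.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
	}

	public static void main(String[] args) {
		// with numReduceTasks = 2 this matches the even/odd split in MapTask_zhang,
		// except that the partitions are numbered 0 and 1 instead of 1 and 2
		System.out.println(choosePartition("hello", 2));
		System.out.println(choosePartition("world", 2));
	}
}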

Reduce

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ReduceTask_zhang {
	public static void main(String[] args) throws Exception {
		int taskId = Integer.parseInt(args[0]);
		
		Map<String, Integer> map = new HashMap<>();
		
		FileSystem fs = FileSystem.get(new URI("hdfs://hadoop01:9000"), new Configuration(), "root");
		RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path("/wordcount/tmp/"), true);
		while(listFiles.hasNext()) {
			LocatedFileStatus file = listFiles.next();
			
			// only process the intermediate files that belong to this reduce task's partition
			if(file.getPath().getName().endsWith("-"+taskId)) {
				FSDataInputStream inputStream = fs.open(file.getPath());
				BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
				String line = null;
				while((line = br.readLine()) != null) {
					String[] split = line.split("\t");
					Integer count = map.getOrDefault(split[0], 0);
					count += Integer.parseInt(split[1]);
					map.put(split[0], count);
				}
				br.close();
				inputStream.close();
			}
		}
		
		// write the aggregated counts back to HDFS
		FSDataOutputStream outputStream = fs.create(new Path("/wordcount/ret/part-r-"+taskId));
		Set<Entry<String,Integer>> entrySet = map.entrySet();
		for (Entry<String, Integer> entry : entrySet) {
			outputStream.write((entry.getKey()+"="+entry.getValue()+"\n").getBytes());
		}
		outputStream.close();
		fs.close();
	}
}

The word frequencies are computed over the input file /wc.txt.



How to run:

1. Run the Map task first: right-click, Run As ---> Java Application

2. Right-click, Run As ---> Run Configurations, and pass the arguments (a driver sketch that computes these splits automatically follows step 3)

First run: 1 /wc.txt 0 50

Second run: 2 /wc.txt 50 50

3. Run the Reduce task: right-click, Run As ---> Java Application

Right-click, Run As ---> Run Configurations, and pass the arguments

First run: 1

Second run: 2
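
The arguments above cut /wc.txt into two 50-byte splits by hand. As an alternative to typing them into Run Configurations, the hedged sketch below (LocalDriver_zhang is a made-up name, not part of the original post) reads the real file length from HDFS, derives the two splits from it, and calls the map and reduce classes in-process with the same style of arguments.

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LocalDriver_zhang {
	public static void main(String[] args) throws Exception {
		String file = "/wc.txt";

		FileSystem fs = FileSystem.get(new URI("hdfs://hadoop01:9000"), new Configuration(), "root");
		long fileLen = fs.getFileStatus(new Path(file)).getLen();
		fs.close();

		// split the file in two, like the hand-written "0 50" / "50 50" arguments
		long splitLen = fileLen / 2;

		// map phase: each call plays the role of one map task
		MapTask_zhang.main(new String[]{"1", file, "0", String.valueOf(splitLen)});
		MapTask_zhang.main(new String[]{"2", file, String.valueOf(splitLen), String.valueOf(fileLen - splitLen)});

		// reduce phase: each call aggregates one partition of the intermediate files
		ReduceTask_zhang.main(new String[]{"1"});
		ReduceTask_zhang.main(new String[]{"2"});
	}
}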

Checking the results on the cluster:

Map output:

Reduce output:
