Learning Notes: Big Data from Scratch - 9. MapReduce Reading from and Writing to HBase

The WordCount MapReduce example in the previous section read its input file from HDFS and also wrote its results back to HDFS.

Depending on what is needed, a distributed MapReduce job can take its input from, and write its output to, either HDFS or HBase, for example:

A. Read from HDFS --> write to HDFS

B. Read from HDFS --> write to HBase

C. Read from HBase --> write to HDFS

D. Read from HBase --> write to HBase

This section demonstrates the third and fourth cases (C and D).

I. Case D: read from HBase and, after the MapReduce step, write back to HBase. The MapReduce step here does no real computation; it simply writes what it reads from the input table to the output table.

The program was found online, then modified and debugged until it ran; it is recorded here for later reference.

创建 "Map/Reduce  Project"项目, 需要注意的是把hbaseXXX/lib 下的jar引入到项目引用的库中来。

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

 
public class RwHbasetoHbase{
 
	
	// mapper class: reads each row from the input table and repackages selected cells into a Put
	public static class readMapper extends TableMapper<Text, Put> {

		@Override
		public void map(ImmutableBytesWritable row, Result columns,
				Context context) throws IOException, InterruptedException {
			Text mapoutputkey = new Text();
			String rowkey = Bytes.toString(row.get());  // row key becomes the output key
			mapoutputkey.set(rowkey);
			Put put = new Put(row.get());               // output value
			for (Cell cell : columns.rawCells()) {
				if ("info".equals(Bytes.toString(CellUtil.cloneFamily(cell)))) {          // column family info
					if ("name".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))) {   // column name
						put.add(cell);
					}
					if ("age".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))) {    // column age
						put.add(cell);
					}
					if ("class".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))) {  // column class
						put.add(cell);
					}
				}
			}
			context.write(mapoutputkey, put);
		}
	}
 
	// reducer class: writes every Put it receives straight into the output table
	public static class writereducer extends
			TableReducer<Text, Put, ImmutableBytesWritable> {

		@Override
		protected void reduce(Text key, Iterable<Put> value, Context context)
				throws IOException, InterruptedException {
			for (Put put : value) {
				context.write(null, put);  // the key is ignored by the table output format; only the Put is written
			}
		}
	}
 
	// driver
	public static void main(String[] args) throws Exception {
		Configuration cf = new Configuration();
		cf.set("hbase.zookeeper.quorum", "centos7");
		Configuration config = HBaseConfiguration.create(cf);
		Job job = Job.getInstance(config, "hbase_read2write");
		job.setJarByClass(RwHbasetoHbase.class); // class that contains mapper and reducer
		Scan scan = new Scan();
		scan.setCaching(500);       // 1 is the default in Scan, which will be bad for MapReduce jobs
		scan.setCacheBlocks(false); // don't set to true for MR jobs
		TableMapReduceUtil.initTableMapperJob( // set mapper
				"student",        // input table
				scan,             // Scan instance to control CF and attribute selection
				readMapper.class, // mapper class
				Text.class,       // mapper output key
				Put.class,        // mapper output value
				job);
		TableMapReduceUtil.initTableReducerJob( // set reducer
				"student_copy",     // output table
				writereducer.class, // reducer class
				job);
		job.setNumReduceTasks(1); // at least one, adjust as required
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

The creation of the input table student is described in my previous article:


https://blog.csdn.net/oLinBSoft/article/details/84337229   Learning Notes: Big Data from Scratch - 7. HBase Java Programming Hello World

After the job finishes, the output table student_copy contains a copy of the rows read from student.
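To check this without the HBase shell, the copied table can also be scanned from a small client program. The following is only a sketch of mine, not part of the original post; it assumes an HBase 1.x-style client API and reuses the centos7 ZooKeeper quorum from the driver above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class ScanStudentCopy {
	public static void main(String[] args) throws Exception {
		Configuration conf = HBaseConfiguration.create();
		conf.set("hbase.zookeeper.quorum", "centos7");
		try (Connection conn = ConnectionFactory.createConnection(conf);
				Table table = conn.getTable(TableName.valueOf("student_copy"));
				ResultScanner scanner = table.getScanner(new Scan())) {
			for (Result r : scanner) {
				for (Cell cell : r.rawCells()) {
					// print rowkey family:qualifier = value for every cell that was copied
					System.out.println(Bytes.toString(r.getRow()) + " "
							+ Bytes.toString(CellUtil.cloneFamily(cell)) + ":"
							+ Bytes.toString(CellUtil.cloneQualifier(cell)) + " = "
							+ Bytes.toString(CellUtil.cloneValue(cell)));
				}
			}
		}
	}
}

If the copy worked, each row should show the info:name, info:age and info:class cells that the mapper carried over.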

II. Case C: use HBase as the input source, read the data with MapReduce, and write the computed results to a file on HDFS.

This is also an example found online: it reads words from the hello table in HBase, counts them, sorts by frequency, and writes the three most frequent words to the HDFS file system.

1. Create the input source table 'hello'
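The original shows this step as HBase shell screenshots. Since the mapper below only reads the first column of each row, any single column family will do. Here is a minimal sketch of mine using the Java client; the column family cf, the qualifier line, and the sample rows are my own assumptions, not the original data.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class CreateHelloTable {
	public static void main(String[] args) throws Exception {
		Configuration conf = HBaseConfiguration.create();
		conf.set("hbase.zookeeper.quorum", "centos7");
		try (Connection conn = ConnectionFactory.createConnection(conf);
				Admin admin = conn.getAdmin()) {
			TableName name = TableName.valueOf("hello");
			if (!admin.tableExists(name)) {
				HTableDescriptor desc = new HTableDescriptor(name);
				desc.addFamily(new HColumnDescriptor("cf"));   // assumed column family name
				admin.createTable(desc);
			}
			try (Table table = conn.getTable(name)) {
				// a few sample rows of space-separated text to count (my own sample data)
				String[] lines = {"hello world", "hello hbase", "hello hadoop, hello mapreduce"};
				for (int i = 0; i < lines.length; i++) {
					Put put = new Put(Bytes.toBytes("row" + i));
					put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("line"), Bytes.toBytes(lines[i]));
					table.put(put);
				}
			}
		}
	}
}

Running it once is enough; the word-count job only needs the table to exist and hold some space-separated text.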

2. As before, add a class WordCountfromHbase.java to the project

import java.io.IOException;
import java.util.Comparator;
import java.util.TreeMap;

 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

 
public class WordCountfromHbase {  
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {  
	    String tablename = "hello";
	    Configuration conf = HBaseConfiguration.create();
	    conf.set("hbase.zookeeper.quorum", "centos7");
	    Job job = Job.getInstance(conf, "WordCountHbaseReader");
	    job.setJarByClass(WordCountfromHbase.class);
	    Scan scan = new Scan();
	    TableMapReduceUtil.initTableMapperJob(tablename,scan,doMapper.class, Text.class, IntWritable.class, job);
	    job.setReducerClass(WordCountHbaseReaderReduce.class);
	    FileOutputFormat.setOutputPath(job, new Path(args[0]));
	    MultipleOutputs.addNamedOutput(job, "hdfs", TextOutputFormat.class, WritableComparable.class, Writable.class);
	    System.exit(job.waitForCompletion(true) ? 0 : 1);
    }  
      
    public static class doMapper extends TableMapper<Text, IntWritable>{  
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text(); 
        @Override  
        protected void map(ImmutableBytesWritable key, Result value,  
                Context context) throws IOException, InterruptedException { 
        	/* To take the whole row value as a single token without splitting:
        	String rowValue = Bytes.toString(value.value());
        	context.write(new Text(rowValue), one);
        	*/
        	// Result.value() returns the value of the first column of the row
        	String[] rowValue = Bytes.toString(value.value()).split(" ");
     	    for (String str: rowValue){
    		   word.set(str);
    		   context.write(word,one);
    		  
    	    }   
 
        }  
    }  
    
    public static final int K = 3; 
    public static class WordCountHbaseReaderReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        // Use a TreeMap to keep the running results. A TreeMap orders its keys ascending,
        // so a custom Comparator is supplied here to get descending (reverse) order.
        private TreeMap<Integer, String> treeMap = new TreeMap<Integer, String>(new Comparator<Integer>() {
            @Override
            public int compare(Integer x, Integer y) {
                return y.compareTo(x);
            }
        });
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Instead of writing to the context here, put each reduced result into the TreeMap
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            if (treeMap.containsKey(sum)){
                String value = treeMap.get(sum) + "," + key.toString();
                treeMap.put(sum,value);
            }else {
                treeMap.put(sum, key.toString());
            }
            if (treeMap.size() > K) {
                treeMap.remove(treeMap.lastKey());  // drop the entry with the smallest count once more than K are kept
            }
        }
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Write the TreeMap contents to the context, in value-key order
            for (Integer key : treeMap.keySet()) {
                context.write(new Text(treeMap.get(key)), new IntWritable(key));
            }
        }
    }
}  

3. Create a new run configuration and enter the required argument, i.e. the HDFS output directory for the MapReduce results

4. After the job runs, check the files in the HDFS output directory to verify the results
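One way to look at the output without leaving Java is to read the reducer's output file through the HDFS FileSystem API. This is only a sketch of mine, assuming the Hadoop configuration files (core-site.xml etc.) are on the classpath and that the single reducer produced the default file name part-r-00000 under the output directory passed to the job.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CatReduceOutput {
	public static void main(String[] args) throws Exception {
		// args[0]: the same HDFS output directory that was passed to the word-count job
		Configuration conf = new Configuration();
		try (FileSystem fs = FileSystem.get(conf);
				BufferedReader in = new BufferedReader(new InputStreamReader(
						fs.open(new Path(args[0], "part-r-00000")), "UTF-8"))) {
			String line;
			while ((line = in.readLine()) != null) {
				System.out.println(line);  // each line is "word(s)<TAB>count"
			}
		}
	}
}

The same check can of course be done with the hdfs command-line client.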

Because the map function splits words only on spaces, commas appear attached to some words in the results.
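If that punctuation is unwanted, a small variation (my own, not in the original program) is to split on runs of non-alphanumeric characters instead of a single space inside doMapper.map():

            // replaces the split(" ") line and the loop in doMapper.map()
            String[] rowValue = Bytes.toString(value.value()).split("[^a-zA-Z0-9]+");
            for (String str : rowValue) {
                if (!str.isEmpty()) {       // a leading separator produces an empty first token
                    word.set(str);
                    context.write(word, one);
                }
            }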


Reposted from blog.csdn.net/oLinBSoft/article/details/84393380