HBase与MapReduce集成操作

1、目的：将HBase中stu_info表的info:name列（按相同的rowkey）复制到表user_info中。运行前需先在HBase中创建好user_info表，并包含info列族。

2、TestHbaseMapper：

package com.zzw.hbase.mapreduce;

import java.io.IOException;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper of the table-copy job: for every row read from the source table it
 * emits a {@link Put} that carries only the {@code info:name} cell(s) of that
 * row, keyed by the unchanged row key.
 */
public class TestHbaseMapper extends TableMapper<ImmutableBytesWritable, Put>
{
	/** Column family of the column that is copied. */
	private static final byte[] FAMILY = Bytes.toBytes("info");

	/** Qualifier of the column that is copied. */
	private static final byte[] QUALIFIER = Bytes.toBytes("name");

	/**
	 * Builds a {@link Put} for the current row from its {@code info:name}
	 * cells and writes it to the job output.
	 *
	 * @param key     row key of the scanned row
	 * @param value   cells returned by the scan for this row
	 * @param context job context used to emit the (row key, Put) pair
	 * @throws IOException          if a cell cannot be added to the Put or the write fails
	 * @throws InterruptedException if the write is interrupted
	 */
	@Override
	protected void map(ImmutableBytesWritable key, Result value,  Context context)
			throws IOException, InterruptedException
	{
		// Wrap the selected cells of this row into a single Put.
		Put put = new Put(key.get());
		for (Cell cell : value.rawCells())
		{
			// Compare family/qualifier bytes in place instead of cloning
			// both arrays and decoding them to Strings for every cell.
			if (CellUtil.matchingColumn(cell, FAMILY, QUALIFIER))
			{
				put.add(cell);
			}
		}
		// A row without info:name yields an empty Put; the HBase client
		// rejects empty Puts ("No columns to insert"), which would fail the
		// whole job, so such rows are skipped.
		if (!put.isEmpty())
		{
			context.write(key, put);
		}
	}
}

3、TestDriver

package com.zzw.hbase.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class TestDriver extends Configured implements Tool
{

	public int run(String[] args) throws Exception
	{
		Configuration conf = this.getConf();
		Job job = Job.getInstance(conf, "mr-hbase");
		job.setJarByClass(TestDriver.class);
		Scan scan = new Scan();
		TableMapReduceUtil.initTableMapperJob(
				"stu_info", // input HBase table name
				scan, // Scan instance to control CF and attribute selection
				TestHbaseMapper.class, // mapper
				ImmutableBytesWritable.class, // mapper output key
				Put.class, // mapper output value
				job);
		TableMapReduceUtil.initTableReducerJob(
				"user_info", // output table
				null, // reducer class
				job);
		job.setNumReduceTasks(1);   // at least one, adjust as required
		return job.waitForCompletion(true)?0:1;
	}

	public static void main(String[] args)
	{
		Configuration conf = HBaseConfiguration.create();
		try
		{
			int status = ToolRunner.run(conf, new TestDriver(), args);
			System.exit(status);
		} catch (Exception e)
		{
			e.printStackTrace();
		}
	}

}

4、生成jar包导入到Linux中

5、在hadoop-env.sh新增export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/opt/cdh5.14.2/hbase-1.2.0/lib/*

6、修改后的hadoop-env.sh完整内容如下：

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Set Hadoop-specific environment variables here.

# The only required environment variable is JAVA_HOME.  All others are
# optional.  When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.

# The java implementation to use.
# Set to an absolute JDK path (rather than inherited from the environment),
# as recommended above for distributed configurations.
export JAVA_HOME=/opt/java/jdk1.7.0_80

# The jsvc implementation to use. Jsvc is required to run secure datanodes
# that bind to privileged ports to provide authentication of data transfer
# protocol.  Jsvc is not required if SASL is configured for authentication of
# data transfer protocol using non-privileged ports.
#export JSVC_HOME=${JSVC_HOME}

# Configuration directory: keeps an already-set, non-empty HADOOP_CONF_DIR
# and falls back to /etc/hadoop otherwise.
export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}

# Extra Java CLASSPATH elements.  Automatically insert capacity-scheduler.
# Each capacity-scheduler jar is appended to HADOOP_CLASSPATH, or becomes its
# first entry when the variable is still empty.
for f in "$HADOOP_HOME"/contrib/capacity-scheduler/*.jar; do
  if [ "$HADOOP_CLASSPATH" ]; then
    export HADOOP_CLASSPATH="$HADOOP_CLASSPATH:$f"
  else
    export HADOOP_CLASSPATH="$f"
  fi
done
# Add the HBase client jars so MapReduce jobs that use HBase classes can be
# launched.  The value is quoted so the trailing "*" is passed on literally
# as a Java classpath wildcard and is never expanded by the shell, and the
# ":" separator is only emitted when HADOOP_CLASSPATH is non-empty (a leading
# ":" would put the current directory on the classpath).
export HADOOP_CLASSPATH="${HADOOP_CLASSPATH:+$HADOOP_CLASSPATH:}/opt/cdh5.14.2/hbase-1.2.0/lib/*"
# The maximum amount of heap to use, in MB. Default is 1000.
#export HADOOP_HEAPSIZE=
#export HADOOP_NAMENODE_INIT_HEAPSIZE=""

# Extra Java runtime options.  Empty by default.
# Appends -Djava.net.preferIPv4Stack=true to whatever HADOOP_OPTS already holds.
export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"

# Command specific options appended to HADOOP_OPTS when specified
# NameNode: security logger defaults to INFO,RFAS and audit logger to
# INFO,NullAppender unless HADOOP_SECURITY_LOGGER / HDFS_AUDIT_LOGGER are set;
# any pre-existing HADOOP_NAMENODE_OPTS are kept after these defaults.
export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
# DataNode: security logger fixed to ERROR,RFAS, followed by existing options.
export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"

# SecondaryNameNode: same logger defaults as the NameNode.
export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"

# NFS3 options are re-exported unchanged; portmap gets -Xmx512m put in front
# of any existing options.
export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"

# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
# -Xmx512m is put in front of any existing client options.
export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"

# On secure datanodes, user to run the datanode as after dropping privileges.
# This **MUST** be uncommented to enable secure HDFS if using privileged ports
# to provide authentication of data transfer protocol.  This **MUST NOT** be
# defined if SASL is configured for authentication of data transfer protocol
# using non-privileged ports.
# As written, this only re-exports whatever value HADOOP_SECURE_DN_USER
# already has in the calling environment (possibly empty).
export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}

# Where log files are stored.  $HADOOP_HOME/logs by default.
#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER

# Where log files are stored in the secure data environment.
# Built as <HADOOP_LOG_DIR>/<HADOOP_HDFS_USER>; HADOOP_LOG_DIR is not set in
# this file (its export above is commented out), so it has to come from the
# calling environment.
export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}

###
# HDFS Mover specific parameters
###
# Specify the JVM options to be used when starting the HDFS Mover.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HADOOP_MOVER_OPTS=""

###
# Advanced Users Only!
###

# The directory where pid files are stored. /tmp by default.
# NOTE: this should be set to a directory that can only be written to by 
#       the user that will run the hadoop daemons.  Otherwise there is the
#       potential for a symlink attack.
# Both variables are taken from the HADOOP_PID_DIR value of the calling
# environment; no directory is chosen in this file.
export HADOOP_PID_DIR=${HADOOP_PID_DIR}
export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}

# A string representing this instance of hadoop. $USER by default.
export HADOOP_IDENT_STRING=$USER

7、运行生成的jar

[root@master hadoop-2.6.0]# $HADOOP_HOME/bin/yarn jar /opt/cdh5.14.2/hadoop-2.6.0/jars/testhbase-mr.jar

8、查看user_info

hbase(main):021:0> scan 'user_info'
ROW                                        COLUMN+CELL                                                                                                                 
 2018_1801                                 column=info:name, timestamp=1535316184157, value=Tom                                                                        
 2018_1802                                 column=info:name, timestamp=1535316247908, value=Jim                                                                        
 2018_1803                                 column=info:name, timestamp=1535316281664, value=Lucy                                                                       
 2018_1804                                 column=info:name, timestamp=1535316313407, value=Lily                                                                       
4 row(s) in 0.0630 seconds

HBase与MapReduce集成操作

猜你喜欢