Fast Data Import into HBase: ImportTsv & Bulkload
This is the fastest way to import data: the WAL can be bypassed and the underlying HFiles generated directly.
(Environment: CentOS 6.5, Hadoop 2.6.0, HBase 0.98.9)
1. Shell approach
1.1 Direct import with ImportTsv
Command: bin/hbase org.apache.hadoop.hbase.mapreduce.ImportTsv
Usage: importtsv -Dimporttsv.columns=a,b,c <tablename> <inputdir>
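Besides the column mapping, ImportTsv accepts a number of -D options; running the class with no arguments prints the full usage for your version. The ones used below, plus one that is often handy (option names as in HBase 0.98; double-check against your own usage output):
-Dimporttsv.columns=HBASE_ROW_KEY,cf:col   # column spec; HBASE_ROW_KEY marks the rowkey field
-Dimporttsv.separator=,                    # field separator (the default is tab)
-Dimporttsv.bulk.output=/path/for/output   # write HFiles to this directory instead of writing Puts into the table
-Dimporttsv.skip.bad.lines=true            # skip malformed lines instead of failing the job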
Test:
1.1.1 Create the table in HBase
create 'testImport1','cf'
1.1.2 Prepare the data file sample1.csv and upload it to HDFS. Its content:
1,"tom"
2,"sam"
3,"jerry"
4,"marry"
5,"john
1.1.3使用导入命令导入
bin/hbase org.apache.hadoop.hbase.mapreduce.ImportTsv -Dimporttsv.separator="," -Dimporttsv.columns=HBASE_ROW_KEY,cf testImport1 /sample1.csv
1.1.4 Result
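A quick way to check the result is to scan the new table from the HBase shell; assuming the job succeeded, the five rows of sample1.csv should appear under column family cf:
hbase shell
scan 'testImport1'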
1.2 Generate HFiles with ImportTsv first, then load them into HBase with completebulkload
1.2.1 Use the same source data as before and create a new table
create 'testImport2','cf'
1.2.2 Generate the HFiles with the following command
bin/hbase org.apache.hadoop.hbase.mapreduce.ImportTsv -Dimporttsv.separator="," -Dimporttsv.bulk.output=hfile_tmp -Dimporttsv.columns=HBASE_ROW_KEY,cf testImport2 /sample1.csv
1.2.3 Intermediate output on HDFS
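The intermediate output can be inspected with a recursive HDFS listing. hfile_tmp is a relative path, so it resolves under the HDFS home directory of the user who ran the job; you would typically see a _SUCCESS marker plus one subdirectory per column family (here cf) containing the generated HFiles:
hadoop fs -ls -R hfile_tmp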
1.2.4 Load the HFiles into HBase with the following command
hadoop jar lib/hbase-server-0.98.9-hadoop2.jar completebulkload hfile_tmp testImport2
1.2.5 Result (can be checked with scan 'testImport2', as in 1.1.4)
Notes: 1. If you get missing-class errors, add the HBase jars to Hadoop's classpath (see the example below). 2. Running this command is essentially an HDFS mv of the HFiles into the table; it does not start a MapReduce job.
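For note 1, one common fix is to prepend HBase's classpath to Hadoop's before launching the jar, for example:
export HADOOP_CLASSPATH=$(bin/hbase classpath)
hadoop jar lib/hbase-server-0.98.9-hadoop2.jar completebulkload hfile_tmp testImport2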
2. API (code) approach
The API approach is a bit more flexible, since many things can be customized.
Here is the code:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FsShell;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

public class BulkLoadJob {
    static Logger logger = LoggerFactory.getLogger(BulkLoadJob.class);

    public static class BulkLoadMap extends
            Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // input line format: rowkey \t family:qualifier \t value
            String[] valueStrSplit = value.toString().split("\t");
            String hkey = valueStrSplit[0];
            String family = valueStrSplit[1].split(":")[0];
            String column = valueStrSplit[1].split(":")[1];
            String hvalue = valueStrSplit[2];
            final byte[] rowKey = Bytes.toBytes(hkey);
            final ImmutableBytesWritable HKey = new ImmutableBytesWritable(rowKey);
            // Put also works as the map output value:
            // Put HPut = new Put(rowKey);
            // byte[] cell = Bytes.toBytes(hvalue);
            // HPut.add(Bytes.toBytes(family), Bytes.toBytes(column), cell);
            KeyValue kv = new KeyValue(rowKey, Bytes.toBytes(family),
                    Bytes.toBytes(column), Bytes.toBytes(hvalue));
            context.write(HKey, kv);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.property.clientPort", "2182");
        conf.set("hbase.zookeeper.quorum", "msg801,msg802,msg803");
        conf.set("hbase.master", "msg801:60000");
        String[] dfsArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        String inputPath = dfsArgs[0];
        System.out.println("source: " + dfsArgs[0]);
        String outputPath = dfsArgs[1];
        System.out.println("dest: " + dfsArgs[1]);
        HTable hTable = null;
        try {
            Job job = Job.getInstance(conf, "Test Import HFile & Bulkload");
            job.setJarByClass(BulkLoadJob.class);
            job.setMapperClass(BulkLoadJob.BulkLoadMap.class);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(KeyValue.class);
            // speculation
            job.setSpeculativeExecution(false);
            job.setReduceSpeculativeExecution(false);
            // in/out format
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(HFileOutputFormat2.class);

            FileInputFormat.setInputPaths(job, inputPath);
            FileOutputFormat.setOutputPath(job, new Path(outputPath));

            hTable = new HTable(conf, dfsArgs[2]);
            // sets up partitioning and sorting so the HFiles match the table's regions
            HFileOutputFormat2.configureIncrementalLoad(job, hTable);

            if (job.waitForCompletion(true)) {
                FsShell shell = new FsShell(conf);
                try {
                    shell.run(new String[] { "-chmod", "-R", "777", dfsArgs[1] });
                } catch (Exception e) {
                    logger.error("Couldn't change the file permissions ", e);
                    throw new IOException(e);
                }
                // load the generated HFiles into the HBase table
                LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
                // either of the two ways below works
                // option 1
                String[] loadArgs = { outputPath, dfsArgs[2] };
                loader.run(loadArgs);
                // option 2
                // loader.doBulkLoad(new Path(outputPath), hTable);
            } else {
                logger.error("loading failed.");
                System.exit(1);
            }
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } finally {
            if (hTable != null) {
                hTable.close();
            }
        }
    }
}
2.1 Create a new table
create 'testImport3','fm1','fm2'
2.2 Create sample2.csv and upload it to HDFS. Its content (tab-separated, since the mapper splits on \t):
key1 fm1:col1 value1
key1 fm1:col2 value2
key1 fm2:col1 value3
key4 fm1:col1 value4
Run the job with:
hadoop jar BulkLoadJob.jar hdfs://msg/sample2.csv hdfs://msg/HFileOut testImport3
Notes: 1. In the mapper, either KeyValue or Put can be used as the output value (the commented-out Put lines in the code above show the alternative; with Put you would also set job.setMapOutputValueClass(Put.class)). 2. Mind the classpath of the jar. 3. If Hadoop runs in HA mode, the HA nameservice must be used in the paths: our active NameNode is msg801, but the nameservice is msg, so the HDFS paths must use hdfs://msg and not hdfs://msg801:9000. Presumably this is because the bulk-load code checks the paths' filesystem against fs.defaultFS, which on an HA cluster is the nameservice URI (see the check below). Otherwise it fails with:
IllegalArgumentException: Wrong FS: hdfs://msg801:9000/HFileOut/fm2/bbab9d883a574d518cdcb304d1e681e9, expected: hdfs://msg
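To check which filesystem URI the cluster expects (and therefore what to put on the command line), query the configured default filesystem; on an HA cluster it prints the nameservice URI:
hdfs getconf -confKey fs.defaultFS
# here it should print hdfs://msg, so use hdfs://msg/... paths (or paths without a scheme at all)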