LZO压缩支持

HDFS中支持压缩的话,能节省大量存储空间,下面就简单使用lzo格式实现存储压缩。
安装ant工具
# Install Ant together with its nodeps and junit add-on packages (Ant drives the hadoop-lzo build).
zypper install ant ant-nodeps ant-junit
下载安装lzo
download http://www.oberhumer.com/opensource/lzo/
# Configure LZO as a shared library installed under /usr/local/lzo-2.06, then build and install it.
./configure --prefix=/usr/local/lzo-2.06 --enable-shared
make && make install
下载安装hadoop-lzo(有好几个版本,我测试使用的是下面这个版本)
download https://github.com/toddlipcon/hadoop-lzo
cd /root/hadoop/hadoop-lzo-master
# Point the native build at the LZO headers and libraries. The variables must be
# exported so that the compiler started by ant inherits them, and the paths must
# match the --prefix LZO was installed to (/usr/local/lzo-2.06).
export C_INCLUDE_PATH=/usr/local/lzo-2.06/include
export LIBRARY_PATH=/usr/local/lzo-2.06/lib
# Additional settings for 64-bit machines
export JAVA_HOME=/path/to/64bit/jdk
export CFLAGS=-m64
export CXXFLAGS=-m64
# Compile the native library and package the distribution
ant compile-native tar
如果提示有hadoop-gpl-compression相关错误
# NOTE(review): Google Code has been shut down, so this URL may no longer resolve;
# if it fails, fetch the same tarball from the Google Code Archive instead.
wget http://Hadoop-gpl-compression.googlecode.com/files/hadoop-gpl-compression-0.1.0-rc0.tar.gz
# Unpack the archive; the mv below reads from the extracted directory.
tar -xzf hadoop-gpl-compression-0.1.0-rc0.tar.gz
# Make sure the target directory exists before moving the native libraries into it.
mkdir -p /root/hadoop/lib/native/Linux-amd64-64/
mv hadoop-gpl-compression-0.1.0/lib/native/Linux-amd64-64/* /root/hadoop/lib/native/Linux-amd64-64/
编译完成后,将build/目录下生成的hadoop-lzo*.jar文件复制到/root/hadoop/lib/目录下(后面生成索引时会用到)。

配置conf/core-site.xml文件
<!-- Compression codec classes made available to Hadoop: the Gzip, Default and
     BZip2 codecs plus the two hadoop-lzo classes LzopCodec and LzoCodec.
     Keep the value on one line with no spaces around the commas. -->
<property>
	<name>io.compression.codecs</name>
	<value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,com.hadoop.compression.lzo.LzopCodec,com.hadoop.compression.lzo.LzoCodec</value>
</property>
<!-- Class used for the LZO codec; set here to the hadoop-lzo LzopCodec. -->
<property>
	<name>io.compression.codec.lzo.class</name>
	<value>com.hadoop.compression.lzo.LzopCodec</value>
</property>
对hdfs中的lzo文件生成index索引
# Index inputTest/test.log.lzo with the LzoIndexer class.
hadoop jar /root/hadoop/lib/hadoop-lzo.jar com.hadoop.compression.lzo.LzoIndexer inputTest/test.log.lzo
# Alternative: index the same file with DistributedLzoIndexer
# (presumably runs the indexing as a MapReduce job - confirm against the hadoop-lzo README).
# NOTE(review): the two commands name different jar files (hadoop-lzo.jar vs
# hadoop-lzo-0.4.15.jar); use whichever name actually exists under /root/hadoop/lib/.
hadoop jar /root/hadoop/lib/hadoop-lzo-0.4.15.jar com.hadoop.compression.lzo.DistributedLzoIndexer inputTest/test.log.lzo

注:如果生成索引提示失败,请在hadoop-lzo源码目录下执行下面的命令:

# Copy the hadoop-lzo build/ output directory (recursively) into /root/hadoop/.
cp -r build /root/hadoop/
上面这一步是为了让JNI能够找到本地库文件:JNI需要通过JAVA_LIBRARY_PATH来定位库文件,而bin/hadoop命令会自动加载这个路径。
因为经常会报 "java.lang.RuntimeException: native-lzo library not available" 的错误,上面这一步就是最简单的解决方法。
使用hive做如下操作
-- Create the table. The `user` column is backtick-quoted because USER is a
-- reserved keyword in newer Hive releases.
CREATE TABLE log (
    day INT,
    bytes INT,
    tag STRING,
    `user` STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ';
-- Change the table's file format so that it reads LZO-compressed text.
ALTER TABLE log SET FILEFORMAT
    INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
    OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat";
-- Option 1 (disabled): LOAD DATA removes the data file from its original HDFS
-- path and stores it under /user/root/hive.../warehouse.
-- LOAD DATA INPATH '/user/root/inputTest/test.log.lzo' OVERWRITE INTO TABLE log;
-- Option 2: leave the data where it is and point the table at it.
-- NOTE(review): this path names a single file; Hive table locations are normally
-- directories - confirm that this works on the target Hive version.
ALTER TABLE log SET LOCATION 'hdfs:///user/root/inputTest/test.log.lzo';
-- Alternatively, declare the LZO input format when the table is created.
-- The `user` column is backtick-quoted because USER is a reserved keyword in
-- newer Hive releases.
CREATE TABLE log_lzo (
    day INT,
    bytes INT,
    tag STRING,
    `user` STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '
STORED AS
    INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
-- Optional: to read existing data in place, add the clause below to the
-- statement above, before its terminating semicolon.
-- LOCATION 'hdfs:///user/root/inputTest/test.log.lzo'

猜你喜欢

转载自blog.csdn.net/ciaos/article/details/8970111
今日推荐