1.lzo安装
#检查是否有lzop命令
[wangly@hadoop000 ~]$ which lzop
#若没有,则执行如下安装命令(yum安装需要root权限,普通用户请在命令前加sudo)
[wangly@hadoop000 ~]$ yum install -y svn ncurses-devel
[wangly@hadoop000 ~]$ yum install -y gcc gcc-c++ make cmake
[wangly@hadoop000 ~]$ yum install -y openssl openssl-devel svn ncurses-devel zlib-devel libtool
[wangly@hadoop000 ~]$ yum install -y lzo lzo-devel lzop autoconf automake cmake
2.准备测试数据并且使用lzo压缩
[wangly@hadoop000 lzo]$ vi user_lzo.txt
#lzo压缩: lzop -v file;lzo解压: lzop -dv file
[wangly@hadoop000 lzo]$ lzop -v user_lzo.txt
#user_lzo.txt的文件内容如下(压缩前用vi写入):
henren,18,girl
haungtian,20,man
qingdi,22,man
3.hadoop-lzo编译
hadoop-lzo的源码在GitHub上是开源的,源码地址:https://github.com/twitter/hadoop-lzo (twitter/hadoop-lzo: Refactored version of code.google.com/hadoop-gpl-compression for hadoop 0.20)
4.maven编译
4.2改hadoop版本
我这里hadoop是3.2.2版本
[root@hadoop000 hadoop-lzo-master]# vi pom.xml
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<hadoop.current.version>3.2.2</hadoop.current.version>
<hadoop.old.version>1.0.4</hadoop.old.version>
</properties>
4.3编译
[root@hadoop000 hadoop-lzo-master]# mvn clean package -Dmaven.test.skip=true
#进入编译好的目录
[root@hadoop000 hadoop-lzo-master]# cd target/
[root@hadoop000 target]# ll
total 436
...
-rw-r--r-- 1 root root 199669 Jan 15 22:11 hadoop-lzo-0.4.21-SNAPSHOT.jar
...
4.4拷贝编译好的lzo jar包到hadoop(目标目录为hadoop安装目录下的share/hadoop/common/,请按实际安装路径调整,例如$HADOOP_HOME/share/hadoop/common/)
[root@hadoop000 target]# cp hadoop-lzo-0.4.21-SNAPSHOT.jar hadoop/share/hadoop/common/
5.改hadoop配置文件
core-site.xml 添加
#配置压缩类
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.GzipCodec,
org.apache.hadoop.io.compress.DefaultCodec,
org.apache.hadoop.io.compress.BZip2Codec,
org.apache.hadoop.io.compress.SnappyCodec,
com.hadoop.compression.lzo.LzoCodec,
com.hadoop.compression.lzo.LzopCodec
</value>
</property>
<property>
<name>io.compression.codec.lzo.class</name>
<value>com.hadoop.compression.lzo.LzoCodec</value>
</property>
mapred-site.xml添加
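#map输出压缩(下面两个mapred.*属性名是旧名称,新版本中对应的名称为mapreduce.map.output.compress和mapreduce.map.output.compress.codec,旧名称仍可使用但已被标记为deprecated)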
<property>
<name>mapred.compress.map.output</name>
<value>true</value>
</property>
<property>
<name>mapred.map.output.compression.codec</name>
<value>com.hadoop.compression.lzo.LzoCodec</value>
</property>
#reduce输出压缩
<property>
<name>mapreduce.output.fileoutputformat.compress</name>
<value>true</value>
</property>
<property>
<name>mapreduce.output.fileoutputformat.compress.codec</name>
<value>org.apache.hadoop.io.compress.BZip2Codec</value>
</property>
集群模式下,需要把hadoop-lzo的jar包和修改后的配置文件同步到其他节点,然后重启hadoop使配置生效
6. 测试hive表
#创建表
hive (wangly)> create table user_lzo(
> name string,
> age int,
> gender string
> ) row format delimited fields terminated by ','
> STORED AS INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
> OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat";
#加载数据——即上面第2步用lzop压缩生成的user_lzo.txt.lzo文件
hive (wangly)> load data local inpath '/home/wangly/data/lzo/user_lzo.txt.lzo' into table user_lzo;
#简单查询测试
hive (wangly)> select * from user_lzo;
OK
user_lzo.name user_lzo.age user_lzo.gender
henren 18 girl
haungtian 20 man
qingdi 22 man
|