sqoop import \
load data inpath '/user/hive/warehouse/test/role7' into table test.role7;
问题记录 1 如果直接导入到hive,类型转换会失败 –map-column-java create_time=String,update_time=String 此选项不会生效
2 tinyint(1)会转换成boolean类型 ?tinyInt1isBit=false可以解决
3 分隔符无特殊要求,最好使用默认的 –fields-terminated-by ‘\001’ –lines-terminated-by ‘\n’ \
4 desc formatted 显示的是否压缩,可以通过查看parquet压缩文件确认,生成的类型也可以查看 java -jar /tmp/parquet-tools-1.6.0rc3-SNAPSHOT.jar schema -d /tmp/7713c78e-5a45-40f4-a78b-afc16389d782.parquet
[root@hadoop102 tmp]
message role {
optional int32 id;
optional binary role_no (UTF8);
optional binary name (UTF8);
optional binary company_no (UTF8);
optional int32 role_type;
optional binary notes (UTF8);
optional int32 enable;
optional binary create_user_no (UTF8);
optional binary update_user_no (UTF8);
optional binary create_time (UTF8);
optional binary update_time (UTF8);
}
creator: parquet-mr version 1.5.0-cdh5.16.1 (build ${buildNumber})
extra: parquet.avro.schema = {"type":"record","name":"role","doc":"Sqoop import of role","fields":[{"name":"id","type":["null","int"],"default":null,"columnName":"id","sqlType":"4"},{"name":"role_no","type":["null","string"],"default":null,"columnName":"role_no","sqlType":"12"},{"name":"name","type":["null","string"],"default":null,"columnName":"name","sqlType":"12"},{"name":"company_no","type":["null","string"],"default":null,"columnName":"company_no","sqlType":"12"},{"name":"role_type","type":["null","int"],"default":null,"columnName":"role_type","sqlType":"4"},{"name":"notes","type":["null","string"],"default":null,"columnName":"notes","sqlType":"12"},{"name":"enable","type":["null","int"],"default":null,"columnName":"enable","sqlType":"-6"},{"name":"create_user_no","type":["null","string"],"default":null,"columnName":"create_user_no","sqlType":"12"},{"name":"update_user_no","type":["null","string"],"default":null,"columnName":"update_user_no","sqlType":"12"},{"name":"create_time","type":["null","string"],"default":null,"columnName":"create_time","sqlType":"93"},{"name":"update_time","type":["null","string"],"default":null,"columnName":"update_time","sqlType":"93"}],"tableName":"role"}
file schema: role
--------------------------------------------------------------------------------
id: OPTIONAL INT32 R:0 D:1
role_no: OPTIONAL BINARY O:UTF8 R:0 D:1
name: OPTIONAL BINARY O:UTF8 R:0 D:1
company_no: OPTIONAL BINARY O:UTF8 R:0 D:1
role_type: OPTIONAL INT32 R:0 D:1
notes: OPTIONAL BINARY O:UTF8 R:0 D:1
enable: OPTIONAL INT32 R:0 D:1
create_user_no: OPTIONAL BINARY O:UTF8 R:0 D:1
update_user_no: OPTIONAL BINARY O:UTF8 R:0 D:1
create_time: OPTIONAL BINARY O:UTF8 R:0 D:1
update_time: OPTIONAL BINARY O:UTF8 R:0 D:1
row group 1: RC:70 TS:7683 OFFSET:4
--------------------------------------------------------------------------------
id: INT32 SNAPPY DO:0 FPO:4 SZ:327/323/0.99 VC:70 ENC:PLAIN,RLE,BIT_PACKED
role_no: BINARY SNAPPY DO:0 FPO:331 SZ:527/1406/2.67 VC:70 ENC:PLAIN,RLE,BIT_PACKED
name: BINARY SNAPPY DO:0 FPO:858 SZ:979/1379/1.41 VC:70 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
company_no: BINARY SNAPPY DO:0 FPO:1837 SZ:230/299/1.30 VC:70 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
role_type: INT32 SNAPPY DO:0 FPO:2067 SZ:65/61/0.94 VC:70 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
notes: BINARY SNAPPY DO:0 FPO:2132 SZ:594/717/1.21 VC:70 ENC:PLAIN,RLE,BIT_PACKED
enable: INT32 SNAPPY DO:0 FPO:2726 SZ:77/73/0.95 VC:70 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
create_user_no: BINARY SNAPPY DO:0 FPO:2803 SZ:203/252/1.24 VC:70 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
update_user_no: BINARY SNAPPY DO:0 FPO:3006 SZ:163/181/1.11 VC:70 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
create_time: BINARY SNAPPY DO:0 FPO:3169 SZ:958/1808/1.89 VC:70 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED
update_time: BINARY SNAPPY DO:0 FPO:4127 SZ:621/1184/1.91 VC:70 ENC:PLAIN,RLE,BIT_PACKED
java -jar /tmp/parquet-tools-1.6.0rc3-SNAPSHOT.jar head -n 10 /tmp/7713c78e-5a45-40f4-a78b-afc16389d782.parquet
[root@hadoop102 tmp]
id = 3
role_no = RO02020111600001
name = 系统管理员
company_no = GS2020111001
role_type = 0
notes = 江苏test
enable = true
create_user_no = 1
update_user_no = UR03
create_time = 2020-11-17 17:41:08.0
update_time = 2020-11-19 13:55:04.0
5 建立hive表时,参考parquet和hive的类型对应关系
BINARY -> STRING
BOOLEAN -> BOOLEAN
DOUBLE -> DOUBLE
FLOAT -> FLOAT
INT32 -> INT
INT64 -> BIGINT
INT96 -> TIMESTAMP
BINARY + OriginalType UTF8 -> STRING
BINARY + OriginalType DECIMAL -> DECIMAL
6 decimal类型转换,需要添加参数如下 参考官网 https://docs.cloudera.com/documentation/enterprise/6/6.3/topics/cdh_ig_parquet.html#sqoop_parquet_format
-D sqoop.parquet.logical_types.decimal.enable=true \
-D parquetjob.configurator.implementation=hadoop \
-D sqoop.avro.decimal_padding.enable=true \
-D sqoop.avro.logical_types.decimal.default.precision=38 \
-D sqoop.avro.logical_types.decimal.default.scale=8 \
|