Component Overview
Component | Version
---|---
JDK | 1.8.0_211
Scala | 2.12.14
Zookeeper | 3.5.9
Hadoop | 3.2.2
Hive | 3.1.2
HBase | 2.4.9
Kafka | 2.6.3
Spark | 2.4.8
Kudu | 1.14.0
Impala | 3.4.0
Zookeeper Deployment
Download apache-zookeeper-3.5.9-bin.tar.gz
https://mirrors.cnnic.cn/apache/zookeeper/zookeeper-3.5.9/apache-zookeeper-3.5.9-bin.tar.gz
Create directories
# switch to the hdfs user for the installation
su hdfs
# installation directory
sudo mkdir /opt/apps/
# data directory
sudo chmod 755 /data
sudo mkdir /data/zookeeper
sudo chown hdfs:hdfs /data/zookeeper
Extract
tar -zxvf apache-zookeeper-3.5.9-bin.tar.gz -C /opt/apps/
cd /opt/apps
mv apache-zookeeper-3.5.9-bin/ zookeeper-3.5.9
Environment variables
sudo vim /etc/profile.d/hdfs_env.sh
# zookeeper
export ZK_HOME=/opt/apps/zookeeper-3.5.9
export PATH=$PATH:$ZK_HOME/bin
source /etc/profile.d/hdfs_env.sh
Configure the server id
echo "1" > /data/zookeeper/myid
Configuration
cd /opt/apps/zookeeper-3.5.9/conf/
sudo mv zoo_sample.cfg zoo.cfg
sudo vim zoo.cfg
# change the data directory
dataDir=/data/zookeeper
# add the server list
server.1=hadoop-master:2888:3888
server.2=hadoop-slave01:2888:3888
server.3=hadoop-slave02:2888:3888
vim /opt/apps/zookeeper-3.5.9/bin/zkEnv.sh
# add
export JAVA_HOME=/opt/apps/jdk
Sync to the other nodes
# lsync is a custom cluster sync script; plain rsync/scp works as well
lsync /opt/apps/zookeeper-3.5.9
# set myid on the other nodes
[root@hadoop-slave01 /]$ echo "2" > /data/zookeeper/myid
[root@hadoop-slave02 /]$ echo "3" > /data/zookeeper/myid
Start
Startup script
vim zkCluster.sh
sudo chmod +x zkCluster.sh
Script contents
#!/bin/bash
hosts=(hadoop-master hadoop-slave01 hadoop-slave02)
path=/opt/apps/zookeeper-3.5.9
case $1 in
"start"){
for i in ${hosts[@]}
do
echo ---------- $i zookeeper starting ------------
ssh $i "$path/bin/zkServer.sh start"
done
};;
"stop"){
for i in ${hosts[@]}
do
echo ---------- $i zookeeper stopping ------------
ssh $i "$path/bin/zkServer.sh stop"
done
};;
"status"){
for i in ${hosts[@]}
do
echo ---------- $i zookeeper status ------------
ssh $i "$path/bin/zkServer.sh status"
done
};;
esac
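A quick way to exercise the script (a minimal sketch; it assumes passwordless SSH as the hdfs user to all three hosts and that JAVA_HOME is set in zkEnv.sh on every node):
# start the quorum and confirm each node's role
./zkCluster.sh start
./zkCluster.sh status
# expected: one node reports Mode: leader, the other two Mode: follower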
Hadoop Deployment
Download hadoop-3.2.2.tar.gz
https://archive.apache.org/dist/hadoop/common/hadoop-3.2.2/hadoop-3.2.2.tar.gz
Extract
tar -zxvf hadoop-3.2.2.tar.gz -C /opt/apps/
Environment variables
vim /etc/profile.d/hdfs_env.sh
# hadoop
export HADOOP_HOME=/opt/apps/hadoop-3.2.2
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:/usr/lib64
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
source /etc/profile.d/hdfs_env.sh
Create directories
sudo mkdir -p /data/hadoop/tmp
sudo mkdir -p /data/hadoop/nn
sudo mkdir -p /data/hadoop/dn
sudo mkdir -p /data/hadoop/jn
sudo chown hdfs:hdfs -R /data/hadoop
Cluster configuration
cd /opt/apps/hadoop-3.2.2/etc/hadoop/
core-site.xml
https://hadoop.apache.org/docs/r3.1.4/hadoop-project-dist/hadoop-common/core-default.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://nameservice1</value>
</property>
<!-- hadoop运行时产生的临时文件存储路径 -->
<property>
<name>hadoop.tmp.dir</name>
<value>/data/hadoop/tmp</value>
</property>
<!-- 开启垃圾回收功能,保留分钟数,3天 -->
<property>
<name>fs.trash.interval</name>
<value>4320</value>
</property>
<!-- zookeeper地址 -->
<property>
<name>ha.zookeeper.quorum</name>
<value>hadoop-master:2181,hadoop-slave01:2181,hadoop-slave02:2181</value>
</property>
<!-- 开启本地库支持 -->
<property>
<name>io.native.lib.available</name>
<value>true</value>
</property>
<!-- 支持的编码的类 -->
<property>
<name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec</value>
</property>
<!-- SequenceFiles在读写中可以使用的缓存大小 -->
<property>
<name>io.file.buffer.size</name>
<value>131072</value>
</property>
<!-- maximum client idle time before the connection is dropped, 60 seconds -->
<property>
<name>ipc.client.connection.maxidletime</name>
<value>60000</value>
</property>
<!-- hdfs(superUser)允许通过代理访问的主机节点 -->
<property>
<name>hadoop.proxyuser.hdfs.hosts</name>
<value>*</value>
</property>
<!-- hdfs(superUser)允许通过代理用户所属组 -->
<property>
<name>hadoop.proxyuser.hdfs.groups</name>
<value>*</value>
</property>
<!-- root允许通过代理访问的主机节点 -->
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<!-- root允许通过代理用户所属组 -->
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
</configuration>
hdfs-site.xml
https://hadoop.apache.org/docs/r3.1.4/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml
<configuration>
<!-- 集群命名空间 -->
<property>
<name>dfs.nameservices</name>
<value>nameservice1</value>
</property>
<!-- 高可用2个NameNode逻辑地址 -->
<property>
<name>dfs.ha.namenodes.nameservice1</name>
<value>nn1,nn2</value>
</property>
<!-- nn1的RPC通信地址 -->
<property>
<name>dfs.namenode.rpc-address.nameservice1.nn1</name>
<value>hadoop-master:8020</value>
</property>
<!-- nn1 HDFS服务RPC通讯地址 -->
<property>
<name>dfs.namenode.servicerpc-address.nameservice1.nn1</name>
<value>hadoop-master:8022</value>
</property>
<!-- nn1 webUI监听地址和端口 -->
<property>
<name>dfs.namenode.http-address.nameservice1.nn1</name>
<value>hadoop-master:9870</value>
</property>
<!-- nn1的安全HTTP通信地址 -->
<property>
<name>dfs.namenode.https-address.nameservice1.nn1</name>
<value>hadoop-master:9871</value>
</property>
<!-- nn2的RPC通信地址 -->
<property>
<name>dfs.namenode.rpc-address.nameservice1.nn2</name>
<value>hadoop-slave01:8020</value>
</property>
<!-- nn2 HDFS服务RPC通讯地址 -->
<property>
<name>dfs.namenode.servicerpc-address.nameservice1.nn2</name>
<value>hadoop-slave01:8022</value>
</property>
<!-- nn2 webUI监听地址和端口 -->
<property>
<name>dfs.namenode.http-address.nameservice1.nn2</name>
<value>hadoop-slave01:9870</value>
</property>
<!-- nn2的安全HTTP通信地址 -->
<property>
<name>dfs.namenode.https-address.nameservice1.nn2</name>
<value>hadoop-slave01:9871</value>
</property>
<!-- NameNode fsimage日志存放位置 -->
<property>
<name>dfs.namenode.name.dir</name>
<value>/data/hadoop/nn</value>
</property>
<!-- Journal日志存放位置 -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/data/hadoop/jn</value>
</property>
<!-- NameNode共享元数据edits在JournalNode上的存放位置 -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://hadoop-master:8485;hadoop-slave01:8485;hadoop-slave02:8485/nameservice1</value>
</property>
<!-- datanode数据块存储位置 -->
<property>
<name>dfs.datanode.data.dir</name>
<value>/data/hadoop/dn</value>
</property>
<!-- 存储卷选择策略,数据分配到各个节点的机制 -->
<property>
<name>dfs.datanode.fsdataset.volume.choosing.policy</name>
<value>org.apache.hadoop.hdfs.server.datanode.fsdataset.AvailableSpaceVolumeChoosingPolicy</value>
</property>
<!-- 容许的存储卷的空间差值,2G -->
<property>
<name>dfs.datanode.available-space-volume-choosing-policy.balanced-space-threshold</name>
<value>2147483648</value>
</property>
<!-- 数据块分配到到磁盘空间多的概率 -->
<property>
<name>dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction</name>
<value>0.85f</value>
</property>
<!-- 副本数,默认3 -->
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<!-- 开启故障自动转移 -->
<property>
<name>dfs.ha.automatic-failover.enabled.nameservice1</name>
<value>true</value>
</property>
<!-- 故障自动转移方式,实现类,zkfc -->
<!-- 客户端通过代理访问NameNode,访问文件系统,HDFS客户端与Active节点通信的Java类,使用其确定Active节点是否活跃 -->
<property>
<name>dfs.client.failover.proxy.provider.nameservice1</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- 配置隔离fence机制方法,多个机制用换行分割 -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>
sshfence
shell(/bin/true)
</value>
</property>
<!-- 使用sshfence隔离机制时需要ssh免登陆 -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/home/hdfs/.ssh/id_rsa</value>
</property>
<!-- 配置sshfence隔离机制超时时间 -->
<property>
<name>dfs.ha.fencing.ssh.connect-timeout</name>
<value>30000</value>
</property>
<!-- 故障检测zk会话时间,默认5秒检测一次 -->
<property>
<name>ha.zookeeper.session-timeout.ms</name>
<value>5000</value>
</property>
<!-- 权限检查 -->
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<!-- webhdfs的 LISTSTATUS、LISTFILESTATUS等需要列出文件、文件夹状态的命令 -->
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<!-- 给文件追加内容 -->
<property>
<name>dfs.support.append</name>
<value>true</value>
</property>
<!-- DataNode节点用于均衡数据的最大带宽 -->
<property>
<name>dfs.datanode.balance.bandwidthPerSec</name>
<value>50m</value>
</property>
<!-- NameNode监听DataNode RPC通讯的线程数 -->
<property>
<name>dfs.namenode.handler.count</name>
<value>50</value>
</property>
<!-- 文件块的大小 -->
<property>
<name>dfs.blocksize</name>
<value>134217728</value>
</property>
<!-- DataNode数据传输(in+out)的最大线程数 -->
<property>
<name>dfs.datanode.max.transfer.threads</name>
<value>8192</value>
</property>
<!-- 每个存储卷的保留磁盘空间 -->
<property>
<name>dfs.datanode.du.reserved</name>
<value>2147483648</value>
<description></description>
</property>
<!-- 客户端连接datanode时使用主机名,远程通过外网连接需要开启 -->
<property>
<name>dfs.client.use.datanode.hostname</name>
<value>true</value>
<description>only cofig in clients</description>
</property>
</configuration>
mapred-site.xml
https://hadoop.apache.org/docs/r3.1.4/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
<!-- 设置mapreduce运行框架,yarn -->
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- 历史服务器端地址 -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>hadoop-slave02:10020</value>
</property>
<!-- 历史服务器web端地址 -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>hadoop-slave02:19888</value>
</property>
<!-- 历史服务器web端地址 -->
<property>
<name>mapreduce.jobhistory.webapp.https.address</name>
<value>hadoop-slave02:19890</value>
</property>
<property>
<name>mapreduce.jobhistory.done-dir</name>
<value>/data/yarn/history/done</value>
</property>
<property>
<name>mapreduce.jobhistory.intermediate-done-dir</name>
<value>/data/yarn/history/done_intermediate</value>
</property>
<!-- MR运行相关配置 -->
<!-- MRAppMaster需要的总内存大小,默认是1536 -->
<property>
<name>yarn.app.mapreduce.am.resource.mb</name>
<value>1536</value>
</property>
<!-- MRAppMaster需要的堆内存大小,默认是:-Xmx1024m -->
<property>
<name>yarn.app.mapreduce.am.command-opts</name>
<value>-Xmx1024m</value>
</property>
<!-- MRAppMaster需要的的虚拟cpu数量,默认是:1 -->
<property>
<name>yarn.app.mapreduce.am.resource.cpu-vcores</name>
<value>1</value>
</property>
<!-- MapTask需要的总内存大小,默认是-1,不限制 -->
<property>
<name>mapreduce.map.memory.mb</name>
<value>-1</value>
</property>
<!-- MapTask需要的堆内存大小,默认是-Xmx200m -->
<property>
<name>mapreduce.map.java.opts</name>
<value>-Xmx300m</value>
</property>
<!-- MapTask需要的虚拟cpu大小,默认是1 -->
<property>
<name>mapreduce.map.cpu.vcores</name>
<value>1</value>
</property>
<!-- ReduceTask需要的总内存大小,默认是-1,不限制 -->
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>-1</value>
</property>
<!-- ReduceTask需要的堆内存大小,默认是-Xmx200m -->
<property>
<name>mapreduce.reduce.java.opts</name>
<value>-Xmx300m</value>
</property>
<!-- ReduceTask需要的虚拟cpu大小,默认是1 -->
<property>
<name>mapreduce.reduce.cpu.vcores</name>
<value>1</value>
</property>
</configuration>
yarn-site.xml
https://hadoop.apache.org/docs/r3.1.4/hadoop-yarn/hadoop-yarn-common/yarn-default.xml
<configuration>
<!--开启RM ha,默认是开启的-->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- RM的cluster id -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>yarnRM</value>
</property>
<!-- 指定RM的名字 -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<!-- 分别指定RM的地址 -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>hadoop-master</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>hadoop-slave01</value>
</property>
<!-- RM对外暴露的地址,客户端通过该地址向RM提交应用程序,杀死应用程序等 -->
<property>
<name>yarn.resourcemanager.address.rm1</name>
<value>hadoop-master:8032</value>
</property>
<!-- RM对ApplicationMaster暴露的地址,AM通过该地址向RM申请资源、释放资源等 -->
<property>
<name>yarn.resourcemanager.scheduler.address.rm1</name>
<value>hadoop-master:8030</value>
</property>
<!-- 资源追踪接口地址 -->
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm1</name>
<value>hadoop-master:8031</value>
</property>
<!-- RM管理接口,管理员通过该地址向RM发送管理命令 -->
<property>
<name>yarn.resourcemanager.admin.address.rm1</name>
<value>hadoop-master:8033</value>
</property>
<!-- WebUI-->
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>hadoop-master:8088</value>
</property>
<!-- HTTPS WebUI -->
<property>
<name>yarn.resourcemanager.webapp.https.address.rm1</name>
<value>hadoop-master:8090</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm2</name>
<value>hadoop-slave01:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm2</name>
<value>hadoop-slave01:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm2</name>
<value>hadoop-slave01:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address.rm2</name>
<value>hadoop-slave01:8033</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>hadoop-slave01:8088</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.https.address.rm2</name>
<value>hadoop-slave01:8090</value>
</property>
<property>
<name>yarn.nodemanager.address</name>
<value>0.0.0.0:9103</value>
</property>
<!-- 指定MR shuffle -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.webapp.address</name>
<value>0.0.0.0:8042</value>
</property>
<property>
<name>yarn.nodemanager.localizer.address</name>
<value>0.0.0.0:8040</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>mapreduce.shuffle.port</name>
<value>23080</value>
</property>
<property>
<name>yarn.app.mapreduce.am.staging-dir</name>
<value>/user</value>
</property>
<property>
<name>yarn.web-proxy.address</name>
<value>hadoop-master:8041</value>
</property>
<!-- 日志聚集服务器地址 -->
<property>
<name>yarn.log.server.url</name>
<value>http://hadoop-slave02:19888/jobhistory/logs</value>
</property>
<!-- 中间结果存放位置,存放执行Container所需的数据如可执行程序或jar包,配置文件等和运行过程中产生的临时数据 -->
<property>
<name>yarn.nodemanager.local-dirs</name>
<value>/data/yarn/local</value>
</property>
<!-- Container运行日志存放地址(可配置多个目录) -->
<property>
<name>yarn.nodemanager.log-dirs</name>
<value>/data/yarn/logs</value>
</property>
<!-- 是否启用日志聚集功能 -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- 当应用程序运行结束后,日志被转移到的HDFS目录(启用日志聚集功能时有效) -->
<property>
<name>yarn.nodemanager.remote-app-log-dir</name>
<value>/tmp/app-logs</value>
</property>
<!-- nodemanager上所有Container的运行日志在HDFS中的保存时间,保留半个月 -->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>1209600</value>
</property>
<property>
<name>yarn.app.mapreduce.am.scheduler.connection.wait.interval-ms</name>
<value>5000</value>
</property>
<!-- RM故障自动转移 -->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- 内嵌式故障转移,依赖RM状态来处理隔离 -->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.embedded</name>
<value>true</value>
</property>
<!-- 启用RM状态恢复 -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- 将RM的状态信息存储在ZK,默认是存放在FileSystem里面。-->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
<!-- 指定zk集群地址 -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>hadoop-master:2181,hadoop-slave01:2181,hadoop-slave02:2181</value>
</property>
<!-- 环境变量 -->
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
<!-- 单个任务申请container的最小内存,MB -->
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1024</value>
</property>
<!-- 单个任务申请container的最大内存,MB -->
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>8192</value>
</property>
<!-- 单个任务申请container的最小CPU cores -->
<property>
<name>yarn.scheduler.minimum-allocation-vcores</name>
<value>1</value>
</property>
<!-- 单个任务申请container的最大CPU cores -->
<property>
<name>yarn.scheduler.maximum-allocation-vcores</name>
<value>16</value>
</property>
<!-- 该节点上可分配的物理内存总量 -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>3072</value>
</property>
<!-- 该节点上YARN可使用的虚拟CPU个数,一个物理CPU对应3个虚拟CPU -->
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>6</value>
</property>
<!-- 调度器 -->
<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler</value>
</property>
<!-- 虚拟内存检测,默认是True -->
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<!-- 物理内存检测,默认是True -->
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
</configuration>
workers
hadoop-master
hadoop-slave01
hadoop-slave02
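The same configuration files have to be present on every node before the cluster is started. A minimal sketch for pushing them out with rsync (it assumes passwordless SSH as the hdfs user; a custom sync script such as lsync works just as well):
# push the Hadoop config directory to the other nodes
for host in hadoop-slave01 hadoop-slave02; do
rsync -av /opt/apps/hadoop-3.2.2/etc/hadoop/ ${host}:/opt/apps/hadoop-3.2.2/etc/hadoop/
done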
Start
Initialize ZKFC
hdfs zkfc -formatZK
# start zkfc on both NameNode hosts
hdfs --daemon start zkfc
For the first start the NameNode must be formatted; start the components manually first
# on every node, start the JournalNode
hdfs --daemon start journalnode
# format the NameNode
# run on one NameNode host only
hdfs namenode -format
# start the NameNode
hdfs --daemon start namenode
# sync the other NameNode
# run on the second NameNode host
hdfs namenode -bootstrapStandby
hdfs --daemon start namenode
# webUI
10.0.11.110:9870
# switch one NameNode to active
# after startup both are standby; force one of them to become active
hdfs haadmin -transitionToActive nn1 --forcemanual
# check the NameNode state
hdfs haadmin -getServiceState nn1
hdfs haadmin -getServiceState nn2
# from one node, start the DataNodes on all workers
vim $HADOOP_CONF_DIR/hadoop-env.sh
export JAVA_HOME=/opt/apps/jdk
hdfs --workers --daemon start datanode
# on both master nodes, start the ResourceManager
yarn --daemon start resourcemanager
# ResourceManager webUI
http://10.0.11.110:8088
# on every node, start the NodeManager
yarn --daemon start nodemanager
# on the third node, start the JobHistory Server
mapred --daemon start historyserver
# Historyserver webUI
http://10.0.11.112:19888
Start the cluster
start-dfs.sh
start-yarn.sh
Verify HDFS and YARN
# HDFS
hdfs dfs -ls /
hdfs dfs -put core-site.xml /tmp
# YARN
hadoop jar /opt/apps/hadoop-3.2.2/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.2.2.jar \
wordcount \
/tmp/core-site.xml /tmp/out
# view the result
hdfs dfs -text /tmp/out/*
Verify HA
# HDFS: kill the active NameNode, the standby takes over automatically
jps
15065 NameNode
kill -9 15065
# after a restart it becomes standby again
hdfs --daemon start namenode
# YARN: the WebUI of either RM redirects to hadoop-master:8088; after the kill that node's WebUI is no longer reachable
jps
13743 ResourceManager
kill 13743
# after a restart it can be reached again
yarn --daemon start resourcemanager
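To watch the failover as it happens, a small loop can poll both NameNodes in a second terminal while the active one is killed (a sketch using only the hdfs haadmin command configured above):
# print the state of both NameNodes once per second
while true; do
printf 'nn1=%s nn2=%s\n' "$(hdfs haadmin -getServiceState nn1 2>/dev/null)" "$(hdfs haadmin -getServiceState nn2 2>/dev/null)"
sleep 1
done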
PostgreSQL Deployment
Official site: PostgreSQL: Linux downloads (Red Hat family)
yum repo: RepoView: PostgreSQL PGDG 13 Updates RPMs
Offline download: PostgreSQL JDBC Download
Dependencies
yum install -y python-devel perl-ExtUtils-Embed python-devel gcc-c++ openssl-devel readline readline-devel bzip2 zlib zlib-devel openssl openssl-devel pam pam-devel libxml2 libxml2-devel libxslt libxslt-devel openldap openldap-devel libgeos-dev libproj-dev libgdal-dev xsltproc docbook-xsl docbook-xml imagemagick libmagickcore-dev dblatex tcl tcl-devel unixODBC unixODBC-devel libpng12 libpng12-devel libtiff libtiff-devel curl-devel
Install via yum
sudo yum install -y https://download.postgresql.org/pub/repos/yum/reporpms/EL-7-x86_64/pgdg-redhat-repo-latest.noarch.rpm
sudo yum install -y postgresql13-server
rpm -qa|grep postgres
Manage the PostgreSQL service with systemd
cp /lib/systemd/system/postgresql-13.service /etc/systemd/system/postgresql-13.service
Custom data directory
# two approaches
# 1. single instance: edit the unit file directly
vim /etc/systemd/system/postgresql-13.service
Environment=PGDATA=/data/pgsql/13/data/
# initialize
sudo /usr/pgsql-13/bin/postgresql-13-setup initdb postgresql-13
ll /data
# 2. export environment variables
export PGHOME=/usr/pgsql-13
export PGDATA=/data/pgsql/13/data
export PGUSER=postgres
export LD_LIBRARY_PATH=$PGHOME/lib:$LD_LIBRARY_PATH
export PATH=$PGHOME/bin:$PATH
# initialize
sudo /usr/pgsql-13/bin/postgresql-13-setup initdb
Edit postgresql.conf
vim $PGDATA/postgresql.conf
# modify
listen_addresses = '*' # what IP address(es) to listen on;
port = 5432 # (change requires restart)
max_connections = 1000 # (change requires restart)
superuser_reserved_connections = 5 # (change requires restart)
shared_buffers = 8192MB # min 128kB
work_mem = 16MB # min 64kB
maintenance_work_mem = 512MB # min 1MB
vacuum_cost_delay = 0 # 0-100 milliseconds (0 disables)
max_worker_processes = 128 # (change requires restart)
max_parallel_maintenance_workers = 8 # taken from max_parallel_workers
max_parallel_workers_per_gather = 16 # taken from max_parallel_workers
fsync = on # flush data to disk for crash safety
commit_delay = 1000 # range 0-100000, in microseconds
commit_siblings = 100 # range 1-1000
checkpoint_completion_target = 0.8 # checkpoint target duration, 0.0 - 1.0
effective_cache_size = 4GB
datestyle = 'iso, mdy'
lc_messages = 'en_US.UTF-8' # locale for system error message
lc_monetary = 'en_US.UTF-8' # locale for monetary formatting
lc_numeric = 'en_US.UTF-8' # locale for number formatting
lc_time = 'en_US.UTF-8' # locale for time formatting
default_text_search_config = 'pg_catalog.english'
Edit pg_hba.conf
vim $PGDATA/pg_hba.conf
# to allow access from anywhere, add:
# host all all 0.0.0.0/0 md5
# to allow only the internal 10.0.11.x network, add:
host all all 10.0.11.0/24 trust
Start
sudo systemctl enable postgresql-13
sudo systemctl start postgresql-13
sudo systemctl status postgresql-13
Log in
psql -h 127.0.0.1 -p 5432 -U postgres
# as the postgres user, running psql directly opens the psql prompt
su - postgres
psql -U postgres
alter user postgres with password '123456';
# create the hive role
select rolname from pg_roles;
create user hive with password '123456';
create database metastore owner hive;
grant all privileges on database metastore to hive;
alter role hive with createdb;
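Before moving on to Hive it is worth confirming that the hive role can reach the metastore database over the network (a sketch; hadoop-master is the host PostgreSQL runs on, and depending on pg_hba.conf it may prompt for the password 123456):
psql -h hadoop-master -p 5432 -U hive -d metastore -c 'select version();'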
Uninstall
yum remove postgresql13-server
Hive Deployment
Compatibility: the download page lists the compatible Hadoop versions
https://hive.apache.org/downloads.html
Download hive-3.1.2
https://dlcdn.apache.org/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
Extract
tar -zxvf apache-hive-3.1.2-bin.tar.gz -C /opt/apps
mv /opt/apps/apache-hive-3.1.2-bin /opt/apps/hive-3.1.2
Environment variables
vim /etc/profile.d/hdfs_env.sh
# hive
export HIVE_HOME=/opt/apps/hive-3.1.2
export HIVE_CONF_DIR=$HIVE_HOME/conf
export PATH=$PATH:$HIVE_HOME/bin
source /etc/profile.d/hdfs_env.sh
Resolve dependency conflicts
# upload the PostgreSQL JDBC connector
cp postgresql-42.3.1.jar /opt/apps/hive-3.1.2/lib/
mv $HIVE_HOME/lib/postgresql-9.4.1208.jre7.jar $HIVE_HOME/lib/postgresql-9.4.1208.jre7.jar.bak
# Hadoop 3.2.2 ships a newer guava than Hive; copy it into Hive's lib
cp $HADOOP_HOME/share/hadoop/common/lib/guava-27.0-jre.jar $HIVE_HOME/lib/
mv $HIVE_HOME/lib/guava-19.0.jar $HIVE_HOME/lib/guava-19.0.jar.bak
for f in $HIVE_HOME/lib/log4j-slf4j-impl-*.jar; do mv "$f" "$f.bak"; done
Configuration
cd /opt/apps/hive-3.1.2/conf
cp hive-default.xml.template hive-site.xml
mv hive-env.sh.template hive-env.sh
mv hive-log4j2.properties.template hive-log4j2.properties
hive-log4j2.properties
cat > hive-log4j2.properties <<EOL
log4j.rootLogger=WARN, CA
log4j.appender.CA=org.apache.log4j.ConsoleAppender
log4j.appender.CA.layout=org.apache.log4j.PatternLayout
log4j.appender.CA.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
EOL
hive-env.sh
vim hive-env.sh
# modify
HADOOP_HOME=/opt/apps/hadoop-3.2.2
export HIVE_CONF_DIR=/opt/apps/hive-3.1.2/conf
export HIVE_AUX_JARS_PATH=/opt/apps/hive-3.1.2/lib
hive-site.xml
Configuration Properties - Apache Hive - Apache Software Foundation
vim hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<!-- 元数据库类型 derby, oracle, mysql, mssql, postgres -->
<property>
<name>hive.metastore.db.type</name>
<value>postgres</value>
</property>
<!-- 元数据库 -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:postgresql://hadoop-master:5432/metastore</value>
</property>
<!-- 数据库驱动 -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.postgresql.Driver</value>
</property>
<!-- 数据库用户名 -->
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
</property>
<!-- 数据库密码 -->
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
<!-- Hive使用的HDFS目录 -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>
<property>
<name>datanucleus.schema.autoCreateAll</name>
<value>true</value>
</property>
<!-- 强制元数据一致 -->
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<!-- metastore服务器URL -->
<property>
<name>hive.metastore.uris</name>
<value>thrift://hadoop-master:9083,thrift://hadoop-slave01:9083</value>
</property>
<!-- HiveServer2高可用,使HiveServer2服务可被动态发现 -->
<property>
<name>hive.server2.support.dynamic.service.discovery</name>
<value>true</value>
</property>
<!-- hive在zk上的命名空间 -->
<property>
<name>hive.server2.zookeeper.namespace</name>
<value>hiveserver2</value>
</property>
<!-- Active/Passive 高可用模式,但是只有Leader处理请求 -->
<!--
<property>
<name>hive.server2.active.passive.ha.enable</name>
<value>true</value>
</property>
-->
<!-- hiveserver2实例及leader在zk中注册的znode名,开启后授权会异常 -->
<!--
<property>
<name>hive.server2.active.passive.ha.registry.namespace</name>
<value>hiveserver2</value>
</property>
-->
<!-- zk集群服务器 -->
<property>
<name>hive.zookeeper.quorum</name>
<value>hadoop-master,hadoop-slave01,hadoop-slave02</value>
</property>
<!-- zk端口 -->
<property>
<name>hive.zookeeper.client.port</name>
<value>2181</value>
</property>
<!-- thriftServer绑定的主机 -->
<property>
<name>hive.server2.thrift.bind.host</name>
<value>0.0.0.0</value>
</property>
<!-- thriftServer端口,两个HiveServer2实例的端口号要一致 -->
<property>
<name>hive.server2.thrift.port</name>
<value>10000</value>
</property>
<!-- 身份验证模式,默认为NONE。NONE(使用普通SASL),NOSASL,KERBEROS,LDAP,PAM和CUSTOM -->
<property>
<name>hive.server2.authentication</name>
<value>NONE</value>
</property>
<!-- HiveServer2以提交查询的用户执行查询,false则以Hiveserver2进程运行的用户执行 -->
<property>
<name>hive.server2.enable.doAs</name>
<value>true</value>
</property>
<!-- 启动时自动创建必须的Schema,创建完成后要修改为false -->
<property>
<name>datanucleus.autoCreateSchema</name>
<value>true</value>
</property>
<!-- 是否允许更新Schema -->
<property>
<name>datanucleus.fixedDatastore</name>
<value>true</value>
</property>
<!-- 检查Schema -->
<property>
<name>datanucleus.autoStartMechanismMode</name>
<value>checked</value>
</property>
<!-- 命令行显示当前操作的库名,默认不显示 -->
<property>
<name>hive.cli.print.current.db</name>
<value>true</value>
</property>
<!-- 查询结构显示列名,默认不显示 -->
<property>
<name>hive.cli.print.header</name>
<value>true</value>
</property>
<!-- 执行引擎 -->
<property>
<name>hive.execution.engine</name>
<value>mr</value>
</property>
<!-- 临时文件HDFS位置 -->
<property>
<name>hive.exec.scratchdir</name>
<value>/tmp/hive</value>
</property>
<!-- Hive的DDL/DML作业计算结果本地存储目录 -->
<property>
<name>hive.exec.local.scratchdir</name>
<value>/data/hive/scratchdir</value>
</property>
<!-- Hive查询日志本地存放目录 -->
<property>
<name>hive.querylog.location</name>
<value>/data/hive/querylog</value>
</property>
<!-- 用于向远程文件系统添加资源的本地临时目录 -->
<property>
<name>hive.downloaded.resources.dir</name>
<value>/data/hive/resources/${hive.session.id}_resources</value>
</property>
<!-- hive WebUI的地址 -->
<property>
<name>hive.server2.webui.host</name>
<value>0.0.0.0</value>
</property>
<!-- hive WebUI的端口 -->
<property>
<name>hive.server2.webui.port</name>
<value>10002</value>
</property>
</configuration>
Initialize the metastore database
# PG
schematool -initSchema -dbType postgres
# inspect the database
psql -h localhost -Uhive -p 5432 -W -d metastore
\d
# MySQL
cp /usr/share/java/mysql-connector-java.jar /opt/apps/hive/lib/
CREATE DATABASE metastore DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_general_ci;
create user 'hive'@'%' identified by '123456';
grant all on metastore.* to 'hive'@'%';
show grants for 'hive'@'%';
schematool -initSchema -dbType mysql
Create directories
# HDFS
hdfs dfs -mkdir -p /user/hive/warehouse
hdfs dfs -ls /user/hive
# local
sudo mkdir /data/hive
sudo chown hdfs:hdfs /data/hive
Start
# start the metastore and hiveserver2 on both nodes
# once the metastore is up, the hive CLI can be used directly
hive --service metastore
# once hiveserver2 is up, connect with beeline
# !connect jdbc:hive2://hadoop-master:10000 hdfs 123456
hiveserver2
nohup hive --service metastore >> $HIVE_HOME/logs/metastore_$(date +%Y_%m_%d).log 2>&1 &
nohup hiveserver2 >> $HIVE_HOME/logs/hiveserver2_$(date +%Y_%m_%d).log 2>&1 &
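The nohup commands above write to $HIVE_HOME/logs, which does not exist in a fresh unpack; create it first on every node that runs the services:
mkdir -p $HIVE_HOME/logs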
Verify
su hdfs
hive
zkCli.sh
ls /hiveserver2
# after both instances are up they register in ZooKeeper
# [serverUri=0.0.0.0:10000;version=3.1.2;sequence=0000000004, serverUri=0.0.0.0:10000;version=3.1.2;sequence=0000000008]
beeline
!connect jdbc:hive2://hadoop-master,hadoop-slave01/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2 hdfs 123456
# kill one of the metastores
jps
# 27090 RunJar
kill 27090
# query again from beeline; the remaining metastore keeps serving
Hadoop proxy-user permissions
vim core-site.xml
<property>
<name>hadoop.proxyuser.hdfs.groups</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hdfs.hosts</name>
<value>*</value>
</property>
Refresh the proxy-user settings
yarn rmadmin -refreshSuperUserGroupsConfiguration
hdfs dfsadmin -refreshSuperUserGroupsConfiguration
HBase Deployment
Compatibility
Apache HBase Reference Guide
Search for "Java support" to check JDK compatibility
Hadoop compatibility
Apache HBase Reference Guide
Apache HBase Reference Guide
ZooKeeper compatibility
What version of ZooKeeper should I use?
The newer version, the better. ZooKeeper 3.4.x is required as of HBase 1.0.0
Search for "ZooKeeper Requirements":
An Apache ZooKeeper quorum is required. The exact version depends on your version of HBase, though the minimum ZooKeeper version is 3.4.x due to the useMulti feature made default in 1.0.0 (see HBASE-16598).
Download HBase 2.4.9
https://dlcdn.apache.org/hbase/2.4.9/hbase-2.4.9-bin.tar.gz
Extract
tar -zxvf hbase-2.4.9-bin.tar.gz -C /opt/apps
mv hbase-2.4.9-bin hbase-2.4.9
Environment variables
vim /etc/profile.d/hdfs_env.sh
# hbase
export HBASE_HOME=/opt/apps/hbase-2.4.9
export PATH=$PATH:$HBASE_HOME/bin
source /etc/profile.d/hdfs_env.sh
Configuration
cd /opt/apps/hbase-2.4.9/conf
# symlink the hadoop config files
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/core-site.xml core-site.xml
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/hdfs-site.xml hdfs-site.xml
Apache HBase Reference Guide
regionservers
hadoop-master
hadoop-slave01
hadoop-slave02
hbase-env.sh
cat > hbase-env.sh << 'EOF'
export JAVA_HOME=/opt/apps/jdk1.8.0_211
export HBASE_MANAGES_ZK=false
EOF
hbase-site.xml
<configuration>
<!-- 运行模式,true分布式模式,false单机模式 -->
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<!--- regionServer的共享目录,用来持久化HBase -->
<property>
<name>hbase.rootdir</name>
<value>hdfs://nameservice1/hbase</value>
</property>
<!-- 本地文件系统的临时文件夹 -->
<property>
<name>hbase.tmp.dir</name>
<value>/data/hbase</value>
</property>
<!-- Master -->
<!-- Master RPC port -->
<property>
<name>hbase.master.port</name>
<value>16000</value>
</property>
<!-- Master webUI绑定主机 -->
<property>
<name>hbase.master.info.bindAddress</name>
<value>0.0.0.0</value>
</property>
<!-- Master webUI端口。-1 表示不运行 -->
<property>
<name>hbase.master.info.port</name>
<value>16010</value>
</property>
<!-- regionserver -->
<!-- RegionServer绑定的端口 -->
<property>
<name>hbase.regionserver.port</name>
<value>16020</value>
</property>
<!-- RegionServer webUI绑定的主机 -->
<property>
<name>hbase.regionserver.info.bindAddress</name>
<value>0.0.0.0</value>
</property>
<!-- RegionServer webUI端口, -1表示不运行RegionServer webUI -->
<property>
<name>hbase.regionserver.info.port</name>
<value>16030</value>
</property>
<!-- RegionServers处理远程请求的线程数,默认30,设置为CPU的倍数,值设得越大,意味着内存开销变大 -->
<property>
<name>hbase.regionserver.handler.count</name>
<value>100</value>
</property>
<!-- 压缩编码类型 -->
<property>
<name>hbase.regionserver.codecs</name>
<value>snappy,gz</value>
</property>
<!-- memstore大小超过了128M会flush到磁盘,内存足够时可设置大些 -->
<property>
<name>hbase.hregion.memstore.flush.size</name>
<value>134217728</value>
</property>
<!-- regionserver在写入时会检查每个region的memstore的总大小是否超过了memstore大小乘这个倍数,如果超过了则锁住memstore写操作并触发flush -->
<property>
<name>hbase.hregion.memstore.block.multiplier</name>
<value>4</value>
</property>
<!-- 多少个storefile进行一次合并,默认10 -->
<property>
<name>hbase.hstore.compaction.max</name>
<value>10</value>
</property>
<!-- Region的所有HFile总合的上限,如果超过这个Region会被一分为二,可能导致Region短暂下线,默认10G -->
<property>
<name>hbase.hregion.max.filesize</name>
<value>10737418240</value>
</property>
<!-- Client参数 -->
<!-- 一次scan从服务端抓取的数据条数 -->
<property>
<name>hbase.client.scanner.caching</name>
<value>1000</value>
</property>
<!-- scan过期时间,单位毫秒,默认60秒 -->
<property>
<name>hbase.client.scanner.timeout.period</name>
<value>300000</value>
</property>
<!-- StoreFile占用缓存Heap的最大百分比,默认0.4。如果写比读多可用默认,如果读比写多,越大越好 -->
<property>
<name>hfile.block.cache.size</name>
<value>0.4</value>
</property>
<!-- client的rpc请求超时时间,默认60秒 -->
<property>
<name>hbase.rpc.timeout</name>
<value>300000</value>
</property>
<!-- Zookeeper -->
<property>
<name>hbase.zookeeper.quorum</name>
<value>hadoop-master,hadoop-slave01,hadoop-slave02</value>
</property>
<!-- Zookeeper port-->
<property>
<name>hbase.zookeeper.property.clientPort</name>
<value>2181</value>
</property>
<!--HBase在zookeeper上数据的目录-->
<property>
<name>zookeeper.znode.parent</name>
<value>/hbase</value>
</property>
<!-- RegionServer与Zookeeper的连接超时时间。当超时时间到后,ReigonServer会被Zookeeper从RS集群清单中移除,HMaster收到移除通知后,会对这台server负责的regions重新balance,让其他存活的RegionServer接管 -->
<property>
<name>zookeeper.session.timeout</name>
<value>120000</value>
</property>
<!-- ZooKeeper的zoo.conf中的配置,快照的存储位置 -->
<property>
<name>hbase.zookeeper.property.dataDir</name>
<value>/data/zookeeper</value>
</property>
</configuration>
backup-masters
vim backup-masters
# hostnames of the backup masters, one per line
hadoop-slave02
Start
start-hbase.sh
stop-hbase.sh
hbase-daemon.sh start master
hbase-daemon.sh start regionserver
# Master WebUI
http://hadoop-master:16010
# backup Master
http://hadoop-slave02:16010
# RegionServer WebUI
http://hadoop-master:16030
Verify
zkCli.sh
ls /hbase
ls /hbase/backup-masters
# [hadoop-slave02,16000,1642670777382]
# stop the HMaster on the master node
hbase-daemon.sh stop master
Once jps shows the HMaster is gone, hadoop-slave02 disappears from backup-masters in ZooKeeper.
The hadoop-master WebUI is no longer reachable and hadoop-slave02 automatically takes over as the active HMaster. Whichever node starts an HMaster again registers itself under backup-masters.
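Besides the failover test, a quick read/write smoke test through the HBase shell confirms that the RegionServers and the HDFS storage are wired up correctly (a sketch; the table and column family names are arbitrary):
hbase shell -n <<'EOF'
create 'smoke_test','cf'
put 'smoke_test','r1','cf:c1','v1'
scan 'smoke_test'
disable 'smoke_test'
drop 'smoke_test'
EOF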
Kafka Deployment
Download
Apache Kafka
Extract
tar -zxvf kafka_2.12-2.6.3.tgz -C /opt/apps/
Environment variables
vim /etc/profile.d/hdfs_env.sh
# kafka
export KAFKA_HOME=/opt/apps/kafka_2.12-2.6.3
export PATH=$PATH:$KAFKA_HOME/bin
source /etc/profile.d/hdfs_env.sh
Create directories
# log directory
mkdir $KAFKA_HOME/logs
# data directory
mkdir /data/kafka
Configuration
Apache Kafka
server.properties
broker.id=1
port=9092
delete.topic.enable=true
log.dirs=/data/kafka
zookeeper.connect=hadoop-master:2181,hadoop-slave01:2181,hadoop-slave02:2181
producer.properties
bootstrap.servers=hadoop-master:9092,hadoop-slave01:9092,hadoop-slave02:9092
consumer.properties
bootstrap.servers=hadoop-master:9092,hadoop-slave01:9092,hadoop-slave02:9092
Distribute kafka to hadoop-slave01 and hadoop-slave02, then change broker.id on each node
broker.id=2
broker.id=3
Start
# run on each of the three nodes (-daemon already backgrounds the broker)
kafka-server-start.sh -daemon /opt/apps/kafka_2.12-2.6.3/config/server.properties
Script
for i in hadoop-master hadoop-slave01 hadoop-slave02
do
echo "========== $i =========="
ssh $i 'kafka-server-start.sh -daemon /opt/apps/kafka_2.12-2.6.3/config/server.properties 2>&1'
echo $?
done
Verify
# list topics
kafka-topics.sh --zookeeper localhost:2181 --list
# create a topic
kafka-topics.sh --zookeeper localhost:2181 --create --topic test --partitions 3 --replication-factor 3
# describe the topic
kafka-topics.sh --zookeeper localhost:2181 --describe --topic test
# producer
kafka-console-producer.sh --topic test --broker-list localhost:9092
# consumer
kafka-console-consumer.sh --topic test --bootstrap-server localhost:9092
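To confirm that all three brokers registered themselves, the zookeeper-shell tool bundled with Kafka can list the broker ids (a sketch; any ZooKeeper node of the quorum will do):
# should print [1, 2, 3]
zookeeper-shell.sh hadoop-master:2181 ls /brokers/ids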
Spark Deployment
Download
Downloads | Apache Spark
Extract
tar -zxvf spark-2.4.8-bin-hadoop2.7.tgz -C /opt/apps/
mv spark-2.4.8-bin-hadoop2.7/ spark-2.4.8
Environment variables
sudo vim /etc/profile.d/hdfs_env.sh
# spark
export SPARK_HOME=/opt/apps/spark-2.4.8
export PATH=$PATH:$SPARK_HOME/bin
# both Hadoop and Spark ship start-all/stop-all scripts; use aliases to tell them apart
alias spark-start-all='/opt/apps/spark-2.4.8/sbin/start-all.sh'
alias spark-stop-all='/opt/apps/spark-2.4.8/sbin/stop-all.sh'
source /etc/profile.d/hdfs_env.sh
Configuration
cd $SPARK_HOME/conf
cp spark-defaults.conf.template spark-defaults.conf
mv spark-env.sh.template spark-env.sh
mv slaves.template slaves
cp log4j.properties.template log4j.properties
spark-env.sh
export JAVA_HOME=/opt/apps/jdk1.8.0_211
export SCALA_HOME=/opt/apps/scala-2.12.14
export HADOOP_HOME=/opt/apps/hadoop-3.2.2
export HADOOP_CONF_DIR=/opt/apps/hadoop-3.2.2/etc/hadoop
export SPARK_MASTER_WEBUI_PORT=8078
export SPARK_WORKER_WEBUI_PORT=8079
# Master RPC port 7077
export SPARK_MASTER_PORT=7077
export SPARK_DAEMON_JAVA_OPTS="-Dspark.deploy.recoveryMode=ZOOKEEPER -Dspark.deploy.zookeeper.url=hadoop-master:2181,hadoop-slave01:2181,hadoop-slave02:2181 -Dspark.deploy.zookeeper.dir=/spark2"
slaves
hadoop-master
hadoop-slave01
hadoop-slave02
spark-defaults.conf
Spark on YARN uploads the local Spark jars to HDFS and distributes them to the NodeManagers for every application; uploading the jars to HDFS once avoids that repeated upload.
hdfs dfs -mkdir -p /spark/jars
hdfs dfs -put /opt/apps/spark-2.4.8/jars/* /spark/jars/
vim spark-defaults.conf
spark.yarn.jars=hdfs://nameservice1/spark/jars/*
spark.yarn.historyServer.address=hadoop-slave02:18080
spark.history.ui.port=18080
spark.master.rest.port 16066
spark.eventLog.enabled true
spark.eventLog.dir hdfs://nameservice1/spark/directory
Start
# standalone
spark-start-all
# check with jps
# Master
# Worker
# Master WebUI
http://hadoop-master:8078
# History
mkdir /tmp/spark-events
$SPARK_HOME/sbin/start-history-server.sh
# History WebUI
http://hadoop-slave02:18080
HA
# start a Master on another node
sh $SPARK_HOME/sbin/start-master.sh
# check
zkCli.sh
ls /spark2/leader_election
# [_c_7c35ab9e-e333-4e18-aea4-25501fca6a22-latch-0000000002, _c_c853231d-86b2-45ae-8734-fbfa1b25fe40-latch-0000000001]
# the WebUI shows Status: STANDBY
http://hadoop-slave01:8078
# kill the active Master; the standby becomes ACTIVE
jps
# 13436 Master
kill -9 13436
Verify
Check that the deployment works
run-example SparkPi
# Pi is roughly 3.1407357036785184
Local
spark-shell
val textFile = sc.textFile("file:///opt/apps/spark-2.4.8/README.md")
textFile.first()
textFile.count()
textFile.map(line=>line.split(" ").size).reduce((a,b)=>if(a>b) a else b)
res1.collect
val textFile = sc.textFile("hdfs://nameservice1/tmp/README.md")
textFile.first
textFile.collect
:quit
Standalone
spark-submit \
--master spark://10.0.11.111:7077 \
--executor-memory 1G \
--total-executor-cores 1 \
--class org.apache.spark.examples.SparkPi \
/opt/apps/spark-2.4.8/examples/jars/spark-examples_2.11-2.4.8.jar
Check the job in the Spark History UI:
http://hadoop-slave02:18080
YARN
spark-submit \
--master yarn \
--deploy-mode client \
--class org.apache.spark.examples.SparkPi \
/opt/apps/spark-2.4.8/examples/jars/spark-examples_2.11-2.4.8.jar
Check the job in the YARN JobHistory UI:
http://hadoop-slave02:19888
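For jobs that should not depend on the submitting machine, the same example can also be submitted in cluster deploy mode, where the driver runs inside YARN as well (a sketch reusing the jars already uploaded to HDFS above):
spark-submit \
--master yarn \
--deploy-mode cluster \
--class org.apache.spark.examples.SparkPi \
/opt/apps/spark-2.4.8/examples/jars/spark-examples_2.11-2.4.8.jar
# the Pi result appears in the driver container log, reachable from the YARN WebUI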
Flume Deployment
Download Flume 1.9.0
Apache Downloads
Extract
tar -zxvf apache-flume-1.9.0-bin.tar.gz -C /opt/apps/
mv apache-flume-1.9.0-bin/ flume-1.9.0
Environment variables
vim /etc/profile.d/hdfs_env.sh
# flume
export FLUME_HOME=/opt/apps/flume-1.9.0
export PATH=$PATH:$FLUME_HOME/bin
source /etc/profile.d/hdfs_env.sh
Configuration
cd /opt/apps/flume-1.9.0/conf/
cp flume-env.sh.template flume-env.sh
vim flume-env.sh
export JAVA_HOME=/opt/apps/jdk1.8.0_211
# guava conflicts with the Hadoop version
cd $FLUME_HOME/lib
mv guava-11.0.2.jar guava-11.0.2.jar.bak
cp $HADOOP_HOME/share/hadoop/common/lib/guava-27.0-jre.jar $FLUME_HOME/lib
Verify
flume-ng version
Flume 1.9.0
Source code repository: https://git-wip-us.apache.org/repos/asf/flume.git
Revision: d4fcab4f501d41597bc616921329a4339f73585e
Compiled by fszabo on Mon Dec 17 20:45:25 CET 2018
From source with checksum 35db629a3bda49d23e9b3690c80737f9
Flume 1.9.0 User Guide — Apache Flume
flume-conf.properties example
# 1.Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# 2.Describe/configure the source
a1.sources.r1.type = netcat
a1.sources.r1.bind = hadoop-master
a1.sources.r1.port = 44444
# 3.Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# 4.Describe the sink
a1.sinks.k1.type = logger
# 5.Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start the agent
flume-ng agent \
--name a1 \
--conf $FLUME_HOME/conf \
--conf-file $FLUME_HOME/conf/flume-conf.properties \
-Dflume.root.logger=INFO,console
Test
telnet hadoop-master 44444
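If telnet is not installed, nc can send a test event to the netcat source instead (note the port is 44444, matching the source configuration above):
echo "hello flume" | nc hadoop-master 44444
# the agent console should log an event with body "hello flume"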
Maven Deployment
Download Maven 3.6.3
Index of /maven/maven-3/3.6.3/binaries
Extract
tar -zxvf apache-maven-3.6.3-bin.tar.gz -C /opt/apps/
mv /opt/apps/apache-maven-3.6.3/ /opt/apps/maven-3.6.3
Environment variables
vim /etc/profile.d/hdfs_env.sh
# maven
export M2_HOME=/opt/apps/maven-3.6.3
export PATH=$PATH:$M2_HOME/bin
source /etc/profile.d/hdfs_env.sh
Configuration
vim /opt/apps/maven-3.6.3/conf/settings.xml
<localRepository>/data/repo</localRepository>
<mirrors>
<mirror>
<id>ali-public</id>
<url>https://maven.aliyun.com/repository/public</url>
<mirrorOf>public</mirrorOf>
</mirror>
<mirror>
<id>ali-central</id>
<url>https://maven.aliyun.com/repository/central</url>
<mirrorOf>central</mirrorOf>
</mirror>
<mirror>
<id>ali-apache-snapshots</id>
<url>https://maven.aliyun.com/repository/apache-snapshots</url>
<mirrorOf>apache snapshots</mirrorOf>
</mirror>
<mirror>
<id>ali-snapshots</id>
<url>https://maven.aliyun.com/repository/snapshots</url>
<mirrorOf>snapshots</mirrorOf>
</mirror>
<mirror>
<id>ali-releases</id>
<url>https://maven.aliyun.com/repository/releases</url>
<mirrorOf>releases</mirrorOf>
</mirror>
<mirror>
<id>ali-mapr-public</id>
<url>https://maven.aliyun.com/repository/mapr-public</url>
<mirrorOf>mapr-public</mirrorOf>
</mirror>
<mirror>
<id>ali-google</id>
<url>https://maven.aliyun.com/repository/google</url>
<mirrorOf>google</mirrorOf>
</mirror>
<mirror>
<id>ali-gradle-plugin</id>
<url>https://maven.aliyun.com/repository/gradle-plugin</url>
<mirrorOf>gradle-plugin</mirrorOf>
</mirror>
<mirror>
<id>ali-spring</id>
<url>https://maven.aliyun.com/repository/spring</url>
<mirrorOf>spring</mirrorOf>
</mirror>
<mirror>
<id>ali-spring-plugin</id>
<url>https://maven.aliyun.com/repository/spring-plugin</url>
<mirrorOf>spring-plugin</mirrorOf>
</mirror>
<mirror>
<id>ali-grails-core</id>
<url>https://maven.aliyun.com/repository/grails-core</url>
<mirrorOf>grails-core</mirrorOf>
</mirror>
<mirror>
<id>nexus-hortonworks</id>
<name>Nexus hortonworks</name>
<url>https://repo.hortonworks.com/content/groups/public/</url>
<mirrorOf>central</mirrorOf>
</mirror>
<mirror>
<id>cloudera</id>
<name>cloudera</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
<mirrorOf>central</mirrorOf>
</mirror>
<mirror>
<id>mavenl</id>
<name>Maven Repository Switchboard</name>
<url>http://repo1.maven.org/maven2/</url>
<mirrorOf>central</mirrorOf>
</mirror>
<mirror>
<id>maven2</id>
<name>Maven Repository Switchboard</name>
<url>http://repo1.maven.apache.org/maven2/</url>
<mirrorOf>central</mirrorOf>
</mirror>
</mirrors>
Verify
mvn -v
Kudu Deployment
Download Kudu 1.14.0
https://dlcdn.apache.org/kudu/1.14.0/apache-kudu-1.14.0.tar.gz
Git
git clone https://github.com/apache/kudu
Extract
tar -zxvf apache-kudu-1.14.0.tar.gz -C /opt/apps
mv apache-kudu-1.14.0/ kudu-1.14.0
Build
Apache Kudu - Installing Apache Kudu
Dependencies
sudo yum install -y autoconf automake cyrus-sasl-devel cyrus-sasl-gssapi \
cyrus-sasl-plain flex gcc gcc-c++ gdb git java-1.8.0-openjdk-devel \
krb5-server krb5-workstation libtool make openssl-devel patch \
pkgconfig redhat-lsb-core rsync unzip vim-common which
# CentOS 7: extra dependencies
sudo yum install -y centos-release-scl-rh devtoolset-8
# install memkind for Kudu NVM (non-volatile memory) support
sudo yum remove memkind
sudo yum install -y numactl-libs numactl-devel
git clone https://github.com/memkind/memkind.git
cd memkind
./build.sh --prefix=/usr
sudo make install
sudo ldconfig
# to build the documentation, install these packages; on RHEL/CentOS 7 and later use rubygems instead of gem
sudo yum install -y gem graphviz zlib-devel rh-ruby23
build-support/enable_devtoolset.sh thirdparty/build-if-necessary.sh
Build
mkdir -p build/release
cd build/release
../../build-support/enable_devtoolset.sh \
../../thirdparty/installed/common/bin/cmake \
-DCMAKE_BUILD_TYPE=release ../.. \
-DNO_TESTS=1
make -j4
Copy files
Copy the three binaries kudu, kudu-master and kudu-tserver plus the www directory out of build/release/, and create a conf directory:
├── bin
│   ├── kudu
│   ├── kudu-master
│   └── kudu-tserver
├── conf
│   ├── master.gflagfile
│   └── tserver.gflagfile
└── www
    ├── bootstrap
    │   ├── css
    │   │   ├── bootstrap.min.css
    │   │   ├── bootstrap.min.css.map
    │   │   ├── bootstrap-table.min.css
    │   │   ├── bootstrap-theme.min.css
    │   │   └── bootstrap-theme.min.css.map
    │   ├── fonts
    │   │   └── glyphicons-halflings-regular.woff
    │   └── js
    │       ├── bootstrap.min.js
    │       └── bootstrap-table.min.js
    ├── config.mustache
    ├── d3.v2.js
    ├── dashboards.mustache
    ├── epoch.0.5.2.min.css
    ├── epoch.0.5.2.min.js
    ├── favicon.ico
    ├── home.mustache
    ├── index.html
    ├── jquery-3.5.1.min.js
    ├── key.png
    ├── kudu.css
    ├── kudu.js
    ├── log-anchors.mustache
    ├── logo.png
    ├── logs.mustache
    ├── maintenance-manager.mustache
    ├── masters.mustache
    ├── metrics-epoch.js
    ├── metrics.html
    ├── scans.mustache
    ├── table.mustache
    ├── tables.mustache
    ├── tablet-consensus-status.mustache
    ├── tablet.mustache
    ├── tablet-rowsetlayout-svg.mustache
    ├── tablet-servers.mustache
    ├── tablets.mustache
    ├── threadz.mustache
    ├── tracing.html
    └── tracing.js
Environment variables
sudo vim /etc/profile.d/hdfs_env.sh
# kudu
export KUDU_HOME=/opt/apps/kudu-1.14.0
export PATH=$PATH:$KUDU_HOME/bin
source /etc/profile.d/hdfs_env.sh
Install
If the binaries were copied and the environment variables set as above, this step is not needed
# installs to /usr/local/bin and /usr/local/sbin by default
cd /opt/apps/kudu-1.14.0-src/build/release
make DESTDIR=/opt/apps/kudu-1.14.0 install
Create directories
mkdir -p /opt/apps/kudu-1.14.0/logs /data/kudu/master /data/kudu/tserver
Configuration
Apache Kudu - Apache Kudu Configuration Reference
mkdir /opt/apps/kudu-1.14.0/conf
cd /opt/apps/kudu-1.14.0/conf
touch master.gflagfile tserver.gflagfile
Minimal configuration
master.gflagfile
--rpc_bind_addresses=0.0.0.0:7051
--master_addresses=hadoop-master:7051
--fs_data_dirs=/data/kudu/master
--fs_metadata_dir=/data/kudu/master
--fs_wal_dir=/data/kudu/master
--log_dir=/opt/apps/kudu-1.14.0/logs
--webserver_doc_root=/opt/apps/kudu-1.14.0/www
tserver.gflagfile
--rpc_bind_addresses=0.0.0.0:7050
--tserver_master_addrs=hadoop-master:7051
--fs_wal_dir=/data/kudu/tserver
--fs_metadata_dir=/data/kudu/tserver
--fs_data_dirs=/data/kudu/tserver
--log_dir=/opt/apps/kudu-1.14.0/logs
--webserver_doc_root=/opt/apps/kudu-1.14.0/www
Cluster configuration
master.gflagfile
--rpc_bind_addresses=hadoop-master:7051
--master_addresses=hadoop-master:7051,hadoop-slave01:7051,hadoop-slave02:7051
--webserver_enabled=true
--webserver_port=8051
--metrics_log_interval_ms=60000
--webserver_doc_root=/opt/apps/kudu-1.14.0/www
--fs_wal_dir=/data/kudu/master
--fs_metadata_dir=/data/kudu/master
--fs_data_dirs=/data/kudu/master
--log_dir=/opt/apps/kudu-1.14.0/logs
--colorlogtostderr=true
--enable_process_lifetime_heap_profiling=true
--heap_profile_path=/data/kudu/master/heap
--rpc_authentication=disabled
--unlock_unsafe_flags=true
--unlock_experimental_flags=true
--max_log_size=2048
--flush_threshold_secs=86400
--budgeted_compaction_target_rowset_size=67100000
--tablet_delta_store_minor_compact_max=100
--tablet_delta_store_major_compact_min_ratio=0.01
--memory_limit_hard_bytes=1073741824
--block_cache_capacity_mb=256
--default_num_replicas=3
--max_clock_sync_error_usec=10000000
--consensus_rpc_timeout_ms=30000
--follower_unavailable_considered_failed_sec=300
--leader_failure_max_missed_heartbeat_periods=3
--tserver_unresponsive_timeout_ms=60000
--rpc_num_service_threads=10
--max_negotiation_threads=50
--min_negotiation_threads=0
--rpc_negotiation_timeout_ms=3000
--rpc_default_keepalive_time_ms=65000
--rpc_num_acceptors_per_address=1
--master_ts_rpc_timeout_ms=60000
--remember_clients_ttl_ms=3600000
--remember_responses_ttl_ms=600000
--rpc_service_queue_length=500
--raft_heartbeat_interval_ms=500
--heartbeat_interval_ms=1000
--heartbeat_max_failures_before_backoff=3
tserver.gflagfile
--rpc_bind_addresses=0.0.0.0:7050
--tserver_master_addrs=hadoop-master:7051,hadoop-slave01:7051,hadoop-slave02:7051
--webserver_enabled=true
--webserver_port=8050
--metrics_log_interval_ms=60000
--webserver_doc_root=/opt/apps/kudu-1.14.0/www
--fs_wal_dir=/data/kudu/tserver
--fs_metadata_dir=/data/kudu/tserver
--fs_data_dirs=/data/kudu/tserver
--log_dir=/opt/apps/kudu-1.14.0/logs
--colorlogtostderr=true
--enable_process_lifetime_heap_profiling=true
--heap_profile_path=/data/kudu/tserver/heap
--rpc_authentication=disabled
--unlock_unsafe_flags=true
--unlock_experimental_flags=true
--max_log_size=1800
--flush_threshold_secs=86400
--budgeted_compaction_target_rowset_size=67100000
--tablet_delta_store_minor_compact_max=100
--tablet_delta_store_major_compact_min_ratio=0.01
--memory_limit_hard_bytes=1073741824
--block_cache_capacity_mb=512
--default_num_replicas=3
--consensus_rpc_timeout_ms=30000
--follower_unavailable_considered_failed_sec=300
--leader_failure_max_missed_heartbeat_periods=3
--tserver_unresponsive_timeout_ms=60000
--rpc_num_service_threads=10
--max_negotiation_threads=50
--min_negotiation_threads=0
--rpc_negotiation_timeout_ms=3000
--rpc_default_keepalive_time_ms=65000
--rpc_num_acceptors_per_address=1
--master_ts_rpc_timeout_ms=60000
--remember_clients_ttl_ms=3600000
--remember_responses_ttl_ms=600000
--rpc_service_queue_length=500
--raft_heartbeat_interval_ms=500
--heartbeat_interval_ms=1000
--heartbeat_max_failures_before_backoff=3
Start
# master
nohup kudu-master --flagfile=/opt/apps/kudu-1.14.0/conf/master.gflagfile &
# tserver
nohup kudu-tserver --flagfile=/opt/apps/kudu-1.14.0/conf/tserver.gflagfile &
Verify
kudu cluster ksck localhost:7051
kudu master list localhost:7051
kudu tserver list localhost:7051
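With the multi-master cluster configuration, the health check should be given the full master list rather than localhost alone (a sketch assuming the three masters configured above):
kudu cluster ksck hadoop-master:7051,hadoop-slave01:7051,hadoop-slave02:7051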
WebUI
master
http://hadoop-master:8051
tserver
http://hadoop-master:8050
Stop
jobs -l
# [1]+ 21795 Running kudu-master --flagfile=/opt/apps/kudu-1.14.0/conf/master.gflagfile &
kill 21795
Impala Deployment
Note: build on a clean machine if possible; the build downloads many dependencies and modifies the local environment.
Download Impala 3.4.0
https://archive.apache.org/dist/impala/3.4.0/apache-impala-3.4.0.tar.gz
Extract
tar -zxvf apache-impala-3.4.0.tar.gz -C /opt/apps/
mv /opt/apps/apache-impala-3.4.0 /opt/apps/impala-3.4.0
Build
Building Impala - Impala - Apache Software Foundation
Preparation
Change the Ant version
cd /opt/apps/impala-3.4.0
vim bin/bootstrap_system.sh
# around line 243: change the Ant version to 1.9.16 and comment out the sha512 check
redhat sudo wget -nv \
https://downloads.apache.org/ant/binaries/apache-ant-1.9.16-bin.tar.gz
#redhat sha512sum -c - <<< 'b9324cffeb5b113fa289126db1408b9a0125757b598d763f076fc5deec97fb43f27979974cadcac79b6573d8
#4dcb2d1d5bf59b7972fb2abe5ed3d9fed445b04e apache-ant-1.9.16-bin.tar.gz'
redhat sudo tar -C /usr/local -xzf apache-ant-1.9.16-bin.tar.gz
redhat sudo ln -s /usr/local/apache-ant-1.9.16/bin/ant /usr/local/bin
If the symlinks were already created during a previous failed build, comment these lines out to avoid downloading again
Pre-download m2_archive.tar.gz
The m2_archive.tar.gz download keeps getting interrupted; download it manually and upload it to /tmp
https://jenkins.impala.io/job/all-build-options-ub1604/7919//artifact/Impala/logs/m2_archive.tar.gz
vim /opt/jars/impala-4.0.0/bin/jenkins/populate_m2_directory.py
# change and comment out the following
tmp_tarball_location = "/tmp/tarball_name"
#subprocess.check_call(["wget", "-q", url, "-O", tmp_tarball_location])
Impala 3.4 needs changes to the pom file
vim impala-parent/pom.xml
# modify
<repository>
<id>cdh.rcs.releases.repo</id>
<!-- <url>https://repository.cloudera.com/content/groups/cdh-releases-rcs</url> -->
<url>https://repository.cloudera.com/artifactory/cdh-releases-rcs</url>
<name>CDH Releases Repository</name>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
# remove
<repository>
<id>cloudera.thirdparty.repo</id>
<url>https://repository.cloudera.com/content/repositories/third-party</url>
<name>Cloudera Third Party Repository</name>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
# modify
<pluginRepositories>
<pluginRepository>
<id>cloudera.thirdparty.repo</id>
<!-- <url>https://repository.cloudera.com/content/repositories/third-party</url> -->
<url>https://repository.cloudera.com/artifactory/cdh-releases-rcs</url>
<name>Cloudera Third Party Repository</name>
<snapshots>
<enabled>false</enabled>
</snapshots>
</pluginRepository>
</pluginRepositories>
Run the build
export IMPALA_HOME=`pwd`
bin/bootstrap_system.sh
source $IMPALA_HOME/bin/impala-config.sh
# dependencies are downloaded into toolchain; this takes a long time
./buildall.sh -noclean -notests -skiptests
Copy the build artifacts
#!/bin/bash
IMPALA_SRC_HOME=/opt/jars/impala-4.0.0-src
IMPALA_HOME=/opt/apps/impala-4.0.0
# if exist the dest directory then clear it.
if [ -d "${IMPALA_HOME}" ]; then
rm -rf ${IMPALA_HOME}/*
else
mkdir -p ${IMPALA_HOME}
fi
mkdir ${IMPALA_HOME}/be
mkdir ${IMPALA_HOME}/lib
mkdir ${IMPALA_HOME}/dependency
mkdir ${IMPALA_HOME}/sbin
mkdir ${IMPALA_HOME}/shell
cp -rf ${IMPALA_SRC_HOME}/be/build/debug/* ${IMPALA_HOME}/be/
cp -rf ${IMPALA_SRC_HOME}/toolchain/toolchain-packages-gcc7.5.0/gcc-7.5.0/lib64/* ${IMPALA_HOME}/lib/
cp -rf ${IMPALA_SRC_HOME}/fe/target/impala-frontend-0.1-SNAPSHOT.jar ${IMPALA_HOME}/lib/
cp -rf ${IMPALA_SRC_HOME}/fe/target/dependency/* ${IMPALA_HOME}/dependency/
cp -rf ${IMPALA_SRC_HOME}/shell/build/impala-shell-4.0.0-RELEASE/* ${IMPALA_HOME}/shell
cp -r ${IMPALA_SRC_HOME}/www ${IMPALA_HOME}/
echo "Finished"
Environment variables
sudo vim /etc/profile.d/hdfs_env.sh
# impala
export IMPALA_HOME=/opt/apps/impala-3.4.0
export PATH=$PATH:$IMPALA_HOME/bin
source /etc/profile.d/hdfs_env.sh
Configuration
The default configuration file is /etc/default/impala; create it if it does not exist
IMPALA_CATALOG_SERVICE_HOST=hadoop-slave02
IMPALA_STATE_STORE_HOST=hadoop-slave02
IMPALA_CATALOG_SERVICE_PORT=26000
IMPALA_STATE_STORE_PORT=24000
IMPALA_BACKEND_PORT=22000
IMPALA_LOG_DIR=/opt/apps/impala-4.0.0/logs
export IMPALA_CATALOG_ARGS=" -log_dir=${IMPALA_LOG_DIR} -catalog_service_port=${IMPALA_CATALOG_SERVICE_PORT}"
export IMPALA_STATE_STORE_ARGS=" -log_dir=${IMPALA_LOG_DIR} -state_store_port=${IMPALA_STATE_STORE_PORT}"
export IMPALA_SERVER_ARGS=" \
-log_dir=${IMPALA_LOG_DIR} \
-catalog_service_host=${IMPALA_CATALOG_SERVICE_HOST} \
-state_store_port=${IMPALA_STATE_STORE_PORT} \
-state_store_host=${IMPALA_STATE_STORE_HOST} \
-use_statestore \
-be_port=${IMPALA_BACKEND_PORT} -mem_limit=60%"
# -kudu_master_hosts=hadoop-master:7051
export ENABLE_CORE_DUMPS=${ENABLE_COREDUMPS:-false}
export JAVA_HOME=/opt/apps/jdk1.8.0_211
export IMPALA_HOME=/opt/jars/impala
export IMPALA_CONF_DIR=/etc/impala/conf
export HADOOP_CONF_DIR=/etc/hadoop/conf
export HIVE_CONF_DIR=/etc/hive/conf
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$IMPALA_HOME/lib:$HADOOP_HOME/lib/native
for f in /opt/jars/impala/lib/*.jar; do
export CLASSPATH=$CLASSPATH:$f
done
export CLASSPATH=$CLASSPATH:/etc/impala/conf
MYSQL_CONNECTOR_JAR=/usr/share/java/mysql-connector-java.jar
export IMPALAD_START="nohup ${IMPALA_HOME}/bin/impalad ${IMPALA_SERVER_ARGS} &"
export CATALOG_START="nohup ${IMPALA_HOME}/bin/catalogd ${IMPALA_CATALOG_ARGS} &"
export STATESTORE_START="nohup ${IMPALA_HOME}/bin/statestored ${IMPALA_STATE_STORE_ARGS} &"
Create config file symlinks
mkdir -p /etc/impala
ln -s /opt/apps/impala-4.0.0/conf /etc/impala/conf
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/core-site.xml /etc/impala/conf/core-site.xml
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/hdfs-site.xml /etc/impala/conf/hdfs-site.xml
ln -s /opt/apps/hive-3.1.2/conf/hive-site.xml /etc/impala/conf/hive-site.xml
ln -s /opt/apps/hbase-2.3.7/conf/hbase-site.xml /etc/impala/conf/hbase-site.xml
mkdir -p /etc/hadoop/conf
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/core-site.xml /etc/hadoop/conf/core-site.xml
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/hdfs-site.xml /etc/hadoop/conf/hdfs-site.xml
ln -s /opt/apps/hadoop-3.2.2/etc/hadoop/yarn-site.xml /etc/hadoop/conf/yarn-site.xml
mkdir -p /etc/hive/conf
ln -s /opt/apps/hive-3.1.2/conf/hive-site.xml /etc/hive/conf/hive-site.xml
mkdir -p /etc/hbase/conf
ln -s /opt/apps/hbase-2.3.7/conf/hbase-site.xml /etc/hbase/conf/hbase-site.xml
ln -s /opt/apps/impala-4.0.0/shell/impala-shell /bin/impala-shell
Create the short-circuit read socket directory on every node
mkdir -p /var/run/hdfs-sockets
Configure hdfs-site.xml
<!-- 短路读取,client与DataNode在同一主机时,不经过TCP socket传输直接本地读取数据-->
<property>
<name>dfs.client.read.shortcircuit</name>
<value>true</value>
</property>
<!-- UNIX域套接字的路径,将用于DataNode和本地HDFS客户机之间的通信 -->
<property>
<name>dfs.domain.socket.path</name>
<value>/var/run/hdfs-sockets/dn</value>
</property>
<!-- block存储元数据信息开发开关 -->
<property>
<name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
<value>true</value>
</property>
<property>
<name>dfs.client.file-block-storage-locations.timeout</name>
<value>30000</value>
</property>
Change the ownership of /var/run/hdfs-sockets to user hdfs, group root
sudo chown hdfs:root -R /var/run/hdfs-sockets
Start
source /etc/default/impala
nohup $STATESTORE_START &
nohup $CATALOG_START &
nohup $IMPALAD_START &
nohup $IMPALAD_START -kudu_master_hosts=hadoop-master:7051 &
nohup ${STATESTORE_START} > $IMPALA_HOME/logs/statestore.log 2>&1 &
nohup ${CATALOG_START} > $IMPALA_HOME/logs/catalog.log 2>&1 &
nohup ${IMPALAD_START} -kudu_master_hosts=hadoop-master:7051 > $IMPALA_HOME/logs/impalad.log 2>&1 &
Verify
ps -ef | grep impala
#root 29741 26654 0 10:20 pts/0 00:00:00 /opt/jars/impala/bin/statestored -log_dir=/opt/jars/impala/logs -state_store_port=24000
#root 29799 27425 16 10:20 pts/1 00:00:07 /opt/jars/impala/bin/catalogd -log_dir=/opt/jars/impala/logs -catalog_service_port=26000
#root 29932 29834 24 10:21 pts/2 00:00:09 /opt/jars/impala/bin/impalad -log_dir=/opt/jars/impala/logs -catalog_service_host=hadoop-slave02 -state_store_port=24000 -state_store_host=hadoop-slave02 -use_statestore -be_port=22000 -mem_limit=60%
jobs -l
Statestored WebUI
http://hadoop-slave02:25010/
Catalog WebUI
http://hadoop-slave02:25020
impalad WebUI
http://hadoop-slave02:25000/
Build issues
sudo service postgresql initdb Hint: the preferred way to do this is now "postgresql-setup initdb" Data directory is not empty!
cd /var/lib/pgsql
rm -rf data/
Startup errors
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.21' not found (required by impalad)
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.22' not found (required by impalad)
impalad: /usr/lib64/libstdc++.so.6: version `CXXABI_1.3.8' not found (required by impalad)
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.20' not found (required by impalad)
impalad: /usr/lib64/libstdc++.so.6: version `CXXABI_1.3.11' not found (required by impalad)
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.21' not found (required by /opt/apps/impala-4.0.0/lib/libkudu_client.so.0)
impalad: /usr/lib64/libstdc++.so.6: version `CXXABI_1.3.8' not found (required by /opt/apps/impala-4.0.0/lib/libkudu_client.so.0)
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.20' not found (required by /opt/apps/impala-4.0.0/lib/libkudu_client.so.0)
impalad: /usr/lib64/libstdc++.so.6: version `CXXABI_1.3.9' not found (required by /opt/apps/impala-4.0.0/lib/libkudu_client.so.0)
impalad: /usr/lib64/libstdc++.so.6: version `GLIBCXX_3.4.22' not found (required by /opt/apps/impala-4.0.0/lib/libkudu_client.so.0)
cp /opt/apps/impala-4.0.0/lib/libstdc++.so.6.0.24 /usr/lib64
cd /usr/lib64
ln -snf libstdc++.so.6.0.24 libstdc++.so.6
Error
Environment variable CLASSPATH not set!
getJNIEnv: getGlobalJNIEnv failed
# this usually means /etc/default/impala was not sourced in the shell that started the daemons, so CLASSPATH was never exported
Impala load balancing
Install haproxy
yum install -y haproxy
Configure haproxy.cfg
vim /etc/haproxy/haproxy.cfg
listen impalashell # listener for impala-shell
bind 0.0.0.0:21000 # IP and port the proxy binds to
mode tcp # proxy at layer 4 (TCP), important
option tcplog
balance roundrobin # scheduling algorithm: 'leastconn' for fewest connections, or 'roundrobin'
server impalashell_1 hadoop-master:21000 check # one entry per impalad node: alias, host, port; check forwards only to healthy impalads
server impalashell_2 hadoop-slave01:21000 check
server impalashell_3 hadoop-slave02:21000 check
listen impalajdbc # listener for jdbc requests; GUI clients connect over jdbc
bind 0.0.0.0:21050
mode tcp
option tcplog
balance roundrobin
server impalajdbc_1 hadoop-master:21050 check
server impalajdbc_2 hadoop-slave01:21050 check
server impalajdbc_3 hadoop-slave02:21050 check
Connect
impala-shell -i hadoop-proxy:21000
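JDBC clients such as beeline can also go through the proxy on port 21050; a minimal sketch (it assumes no authentication is configured on the cluster, hence auth=noSasl):
beeline -u "jdbc:hive2://hadoop-proxy:21050/default;auth=noSasl"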
Hue integration with Impala
vim /opt/apps/hue/desktop/conf/pseudo-distributed.ini
[impala] # around line 1140
server_host=hadoop-proxy # any impala-server host; with haproxy load balancing, use the host haproxy binds to
server_port=21050 # hue connects to impalad over jdbc
impala_conf_dir=/etc/impala/conf # impala configuration directory
Tez Deployment
Apache Tez – Install and Deployment Instructions
Download the Tez 0.10.1 source
Apache Downloads
Extract
tar -zxvf apache-tez-0.10.1-src.tar.gz
Edit the pom
vim pom.xml
<hadoop.version>3.2.2</hadoop.version>
# building tez-ui needs hard-to-reach external resources and is rarely used, so skip the tez-ui module
<modules>
...
<!--<module>tez-ui</module>-->
Build
Build tools
yum -y install autoconf automake libtool cmake ncurses-devel openssl-devel lzo-devel zlib-devel gcc gcc-c++
Build protobuf 2.5.0
https://codeload.github.com/protocolbuffers/protobuf/tar.gz/refs/tags/v2.5.0
tar -zxvf protobuf-2.5.0.tar.gz
cd protobuf-2.5.0
./configure
make install
Build Tez
cd apache-tez-0.10.1-src
mvn clean package -DskipTests=true -Dmaven.javadoc.skip=true
After the build finishes, the artifacts are under apache-tez-0.10.1-src/tez-dist/target/
Extract
Extract tez-0.10.1-minimal.tar.gz
mkdir /opt/apps/tez-0.10.1
tar -zxvf /opt/jars/tez-0.10.1-minimal.tar.gz -C /opt/apps/tez-0.10.1/
Upload
Upload tez-0.10.1.tar.gz to HDFS
su hdfs
hdfs dfs -mkdir /tez
hdfs dfs -put /opt/apps/tez-0.10.1.tar.gz /tez/
Configuration
tez-site.xml
Create a new tez-site.xml under $HADOOP_HOME/etc/hadoop
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>tez.lib.uris</name>
<value>${fs.defaultFS}/tez/tez-0.10.1.tar.gz</value>
</property>
<property>
<name>tez.use.cluster.hadoop-libs</name>
<value>true</value>
</property>
<property>
<name>tez.am.resource.memory.mb</name>
<value>1024</value>
</property>
<property>
<name>tez.am.resource.cpu.vcores</name>
<value>1</value>
</property>
<property>
<name>tez.container.max.java.heap.fraction</name>
<value>0.4</value>
</property>
<property>
<name>tez.task.resource.memory.mb</name>
<value>1024</value>
</property>
<property>
<name>tez.task.resource.cpu.vcores</name>
<value>1</value>
</property>
<property>
<name>tez.history.logging.service.class</name>
<value>org.apache.tez.dag.history.logging.ats.ATSHistoryLoggingService</value>
</property>
</configuration>
hadoop-env.sh
Modify the Hadoop startup environment
TEZ_CONF_DIR=/opt/apps/hadoop-3.2.2/etc/hadoop/
TEZ_JARS=/opt/apps/tez-0.10.1
export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:${TEZ_CONF_DIR}:${TEZ_JARS}/*:${TEZ_JARS}/lib/*
hive-site.xml
Switch the Hive execution engine
vim /opt/apps/hive-3.1.2/conf/hive-site.xml
<property>
<name>hive.execution.engine</name>
<value>tez</value>
</property>
<property>
<name>hive.tez.container.size</name>
<value>2048</value>
</property>
hive-env.sh
Modify the Hive startup environment
export TEZ_HOME=/opt/apps/tez-0.10.1
export TEZ_JARS=""
for jar in `ls $TEZ_HOME |grep jar`; do
export TEZ_JARS=$TEZ_JARS:$TEZ_HOME/$jar
done
for jar in `ls $TEZ_HOME/lib`; do
export TEZ_JARS=$TEZ_JARS:$TEZ_HOME/lib/$jar
done
export HIVE_AUX_JARS_PATH=${HIVE_HOME}/lib$TEZ_JARS
mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn-tez</value>
</property>
Verify
cd /opt/apps/tez-0.10.1
hdfs dfs -put LICENSE /tez
yarn jar /opt/apps/tez-0.10.1/tez-examples-0.10.1.jar orderedwordcount /tez/LICENSE /tez/output
hive
show databases;
create database test_db;
use test_db;
create table test_tb(id int, name string);
insert into test_tb values(1,"aaa");
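To confirm the statements above actually ran on Tez rather than MapReduce, check the engine setting and look for an application of type TEZ in YARN (a sketch):
# should print hive.execution.engine=tez
hive -e "set hive.execution.engine;"
# the insert above should show up as a finished TEZ application
yarn application -list -appStates FINISHED | grep -i tez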
Build and install LZO
Download
wget http://www.oberhumer.com/opensource/lzo/download/lzo-2.10.tar.gz
tar -zxvf lzo-2.10.tar.gz
cd lzo-2.10
./configure --prefix=/usr/local/hadoop/lzo/
make
make install
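A quick check that the library landed under the chosen prefix (the path follows the --prefix used above; add --enable-shared to configure if a shared library is needed):
ls /usr/local/hadoop/lzo/lib /usr/local/hadoop/lzo/include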