Data Warehouse Setup
I. Tier-1 collection channel
Startup script f1
#!/bin/bash
#Pass start to launch the collection agents, stop to shut them down
if(($#!=1))
then
echo "Please enter start or stop!"
exit;
fi
#cmd holds the command to be executed
cmd=""
if [ $1 = start ]
then
cmd="nohup flume-ng agent -c $FLUME_HOME/conf/ -n a1 -f $FLUME_HOME/myagents/f1.conf -Dflume.root.logger=DEBUG,console > /home/f1.log 2>&1 &"
elif [ $1 = stop ]
then
cmd="ps -ef | grep f1.conf | grep -v grep | awk '{print \$2}' | xargs kill -9"
else
echo "Please enter start or stop!"
exit
fi
#Start collection on hadoop102 and hadoop103
for i in hadoop102 hadoop103
do
ssh $i $cmd
done
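A minimal usage sketch, assuming the script is saved as f1 in a directory on the PATH of the node you run it from (e.g. ~/bin, a hypothetical location) and that f1.conf already exists under $FLUME_HOME/myagents on hadoop102 and hadoop103:
chmod +x ~/bin/f1
f1 start    #launches the tier-1 Flume agents on hadoop102 and hadoop103
f1 stop     #kills them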
II. Tier-2 collection channel
Create f2.conf in /opt/module/flume/myagents on hadoop104:
a1.channels.c2.type=file
a1.channels.c2.checkpointDir=/opt/module/flume/c2/checkpoint
a1.channels.c2.useDualCheckpoints=true
a1.channels.c2.backupCheckpointDir=/opt/module/flume/c2/backupcheckpoint
a1.channels.c2.dataDirs=/opt/module/flume/c2/datas
#sink
a1.sinks.k1.type = hdfs
#If the path contains time-based escape sequences, every event's header must carry a timestamp; otherwise set hdfs.useLocalTimeStamp = true
a1.sinks.k1.hdfs.path = hdfs://hadoop102:9000/origin_data/gmall/log/topic_start/%Y-%m-%d
a1.sinks.k1.hdfs.filePrefix = logstart-
a1.sinks.k1.hdfs.batchSize = 1000
#File rolling
#Roll over to a new file every 30 seconds
a1.sinks.k1.hdfs.rollInterval = 30
#Roll when a file reaches 128 MB
a1.sinks.k1.hdfs.rollSize = 134217700
#Disable rolling based on the number of events
a1.sinks.k1.hdfs.rollCount = 0
#Write files as LZO-compressed streams
a1.sinks.k1.hdfs.fileType = CompressedStream
a1.sinks.k1.hdfs.codeC = lzop
#a1.sinks.k1.hdfs.round = true
#a1.sinks.k1.hdfs.roundValue = 10
#a1.sinks.k1.hdfs.roundUnit = second
a1.sinks.k2.type = hdfs
a1.sinks.k2.hdfs.path = hdfs://hadoop102:9000/origin_data/gmall/log/topic_event/%Y-%m-%d
a1.sinks.k2.hdfs.filePrefix = logevent-
a1.sinks.k2.hdfs.batchSize = 1000
a1.sinks.k2.hdfs.rollInterval = 30
a1.sinks.k2.hdfs.rollSize = 134217700
a1.sinks.k2.hdfs.rollCount = 0
a1.sinks.k2.hdfs.fileType = CompressedStream
a1.sinks.k2.hdfs.codeC = lzop
#a1.sinks.k2.hdfs.round = true
#a1.sinks.k2.hdfs.roundValue = 10
#a1.sinks.k2.hdfs.roundUnit = second
#Wire the components together
a1.sources.r1.channels=c1
a1.sources.r2.channels=c2
a1.sinks.k1.channel=c1
a1.sinks.k2.channel=c2
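The snippet above shows only channel c2, the two HDFS sinks, and the wiring; the agent's component declarations, the two sources, and channel c1 are omitted. A possible sketch of those missing pieces, assuming Kafka sources reading topic_start and topic_event (as the HDFS paths suggest) and a second file channel c1 whose directories are placeholders:
a1.sources=r1 r2
a1.channels=c1 c2
a1.sinks=k1 k2
#Kafka sources (assumed; adjust bootstrap servers and topics to your cluster)
a1.sources.r1.type=org.apache.flume.source.kafka.KafkaSource
a1.sources.r1.kafka.bootstrap.servers=hadoop102:9092,hadoop103:9092,hadoop104:9092
a1.sources.r1.kafka.topics=topic_start
a1.sources.r2.type=org.apache.flume.source.kafka.KafkaSource
a1.sources.r2.kafka.bootstrap.servers=hadoop102:9092,hadoop103:9092,hadoop104:9092
a1.sources.r2.kafka.topics=topic_event
#Second file channel, mirroring c2 (directory names are placeholders)
a1.channels.c1.type=file
a1.channels.c1.checkpointDir=/opt/module/flume/c1/checkpoint
a1.channels.c1.useDualCheckpoints=true
a1.channels.c1.backupCheckpointDir=/opt/module/flume/c1/backupcheckpoint
a1.channels.c1.dataDirs=/opt/module/flume/c1/datas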
Startup script f2
#!/bin/bash
#Pass start to launch the agent, stop to shut it down
if(($#!=1))
then
echo "Please enter start or stop!"
exit;
fi
if [ $1 = start ]
then
ssh hadoop104 "nohup flume-ng agent -c $FLUME_HOME/conf/ -n a1 -f $FLUME_HOME/myagents/f2.conf -Dflume.root.logger=INFO,console > /home/f2.log 2>&1 &"
elif [ $1 = stop ]
then
ssh hadoop104 "ps -ef | grep f2.conf | grep -v grep | awk '{print \$2}' | xargs kill -9"
else
echo "Please enter start or stop!"
fi
One-key startup script onekeyboot:
#!/bin/bash
#Pass start or stop to bring up or shut down the hadoop, zk and kafka clusters along with the f1 and f2 collection channels in one shot
if(($#!=1))
then
echo "Please enter start or stop!"
exit;
fi
#Helper function: returns the number of Kafka brokers currently running in the cluster
function countKafkaBrokers()
{
count=0
for((i=102;i<=104;i++))
do
result=$(ssh hadoop$i "jps | grep Kafka | wc -l")
count=$[$result+$count]
done
#A function may declare a return value; if it does not, it returns the exit status of its last command (0 means success, non-zero means failure)
return $count
}
#Start up; mind the dependencies between components, e.g. zk must start before kafka and stop after kafka
if [ $1 = start ]
then
zk.sh start
myhadoop.sh start
kf.sh start
#f1 and f2 may only be started once the kafka cluster is up; check how many broker instances are currently running
while [ 1 ]
do
countKafkaBrokers
#If the return value is not 3, some machines may not have run the broker start command yet, so keep checking
if(($?==3))
then
break
fi
sleep 2s
done
f1 start
f2 start
#Show which processes have started
xcall jps
elif [ $1 = stop ]
then
f1 stop
f2 stop
kf.sh stop
#The zk cluster must not be stopped until kafka has fully shut down
while [ 1 ]
do
countKafkaBrokers
#If the return value is not 0, the kafka cluster has not finished shutting down
if(($?==0))
then
break
fi
sleep 2s
done
zk.sh stop
myhadoop.sh stop
#Show which processes are still running
xcall jps
else
echo "Please enter start or stop!"
fi
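countKafkaBrokers passes the count back through the function's numeric exit status, which works here because the value stays in the 0-255 range and is read immediately via $?. A sketch of an alternative that avoids those constraints by echoing the count and capturing it with command substitution (same ssh/jps logic as above):
function countKafkaBrokers()
{
count=0
for((i=102;i<=104;i++))
do
result=$(ssh hadoop$i "jps | grep Kafka | wc -l")
count=$((result+count))
done
echo $count
}
#wait until all 3 brokers report in
while [ "$(countKafkaBrokers)" -ne 3 ]
do
sleep 2s
done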
III. Installing MySQL
1. Check whether any MySQL packages are already installed, to avoid conflicts
rpm -qa | grep mysql
rpm -qa | grep MySQL
If any are found, uninstall them:
rpm -e --nodeps <package name found above>
2. Install
rpm -ivh MySQL-client
rpm -ivh MySQL-server
3. Start the service
sudo service mysql start
4. Configure the root password
View the generated random password:
sudo cat /root/.mysql_secret
Log in with the random password and set a new one:
set password=password('123456');
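Because the Hive metastore configuration later in this document connects to this MySQL instance from other hosts as root, root usually also needs permission to log in remotely. A sketch of one way to allow that (adjust to your own security requirements):
mysql -uroot -p123456
mysql> use mysql;
mysql> update user set host='%' where user='root' and host='localhost';
mysql> flush privileges;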
IV. Setting up bidirectional master-slave (master-master) MySQL
1. Find the MySQL server configuration template under /usr/share/mysql and copy it:
sudo cp my-default.cnf /etc/my.cnf
2. Edit /etc/my.cnf and add:
server_id = 102
log-bin=mysql-bin
binlog_format=mixed
relay_log=mysql-relay
Distribute the file to hadoop103 and change server_id there.
3. Restart
sudo service mysql restart
4. On the master, log in as root@localhost and create a replication user
GRANT replication slave ON *.* TO 'slave'@'%' IDENTIFIED BY '123456';
5. Check the master's latest binlog file and position
show master status;
6. On the slave, run the following statement, filling in the file and position from step 5
change master to master_user='slave',master_password='123456',master_host='192.168.1.103',master_log_file='mysql-bin.000001',master_log_pos=311;
7. On the slave, start the replication threads
start slave;
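To confirm that replication is working, you can inspect the slave status on each node (in a bidirectional setup both sides act as a slave); the Slave_IO_Running and Slave_SQL_Running fields should both read Yes:
show slave status\G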
V. Installing Hive
Configuration
Store Hive's metadata in MySQL:
1. Copy the MySQL JDBC driver into /opt/module/hive/lib/
2. Edit hive-site.xml in /opt/module/hive/conf/ and add the following:
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://192.168.1.100:3306/metastore?createDatabaseIfNotExist=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
<description>password to use against metastore database</description>
</property>
<property>
<name>hive.cli.print.header</name>
<value>true</value>
</property>
<property>
<name>hive.cli.print.current.db</name>
<value>true</value>
</property>
</configuration>
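With createDatabaseIfNotExist=true, the metastore database is created automatically the first time Hive connects. A quick sanity check, assuming hive is on the PATH and the MySQL root password is the 123456 set above:
hive
hive (default)> show databases;
#back in MySQL, the metastore database should now exist
mysql -uroot -p123456 -e "show databases;"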
VI. Installing Tez
1. Extract
Extract the Tez tarball and rename the directory under /opt/module to tez-0.9.1
Upload the Tez tarball to the /tez directory on HDFS:
hadoop fs -mkdir /tez
hadoop fs -put /opt/soft/apache-tez-0.9.1-bin.tar.gz /tez
2. Create a tez-site.xml file under /opt/module/hive/conf:
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>tez.lib.uris</name>
<value>${fs.defaultFS}/tez/apache-tez-0.9.1-bin.tar.gz</value>
</property>
<property>
<name>tez.use.cluster.hadoop-libs</name>
<value>true</value>
</property>
<property>
<name>tez.history.logging.service.class</name>
<value>org.apache.tez.dag.history.logging.ats.ATSHistoryLoggingService</value>
</property>
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
</configuration>
3. Add the following to hive-site.xml to switch Hive's execution engine to Tez:
<property>
<name>hive.execution.engine</name>
<value>tez</value>
</property>
4. Add the Tez environment variables and dependency jar paths to hive-env.sh
[atguigu@hadoop102 conf]$ vim hive-env.sh
Add the following:
export TEZ_HOME=/opt/module/tez-0.9.1 #your Tez extraction directory
export TEZ_JARS=""
for jar in `ls $TEZ_HOME |grep jar`; do
export TEZ_JARS=$TEZ_JARS:$TEZ_HOME/$jar
done
for jar in `ls $TEZ_HOME/lib`; do
export TEZ_JARS=$TEZ_JARS:$TEZ_HOME/lib/$jar
done
export HIVE_AUX_JARS_PATH=/opt/module/hadoop-2.7.2/share/hadoop/common/hadoop-lzo-0.4.20.jar$TEZ_JARS
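To verify that Hive now runs on Tez, start Hive and run a query that triggers a job; a minimal check using a throwaway test table (the table name is arbitrary):
hive (default)> create table student(id int, name string);
hive (default)> insert into student values(1, "abc");
hive (default)> select * from student;
If the insert launches a Tez session and DAG in the console instead of a MapReduce job, the engine switch has taken effect.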