Preparation
wget https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz -O /opt/spark-3.1.2-bin-hadoop3.2.tgz
mkdir -p /opt/bigdata/spark
tar xf /opt/spark-3.1.2-bin-hadoop3.2.tgz -C /opt/bigdata/spark
cd /opt/bigdata/spark/
ln -s spark-3.1.2-bin-hadoop3.2 current
Append the following to /etc/profile and source it:
export SPARK_HOME=/opt/bigdata/spark/current
export PATH=$PATH:$SPARK_HOME/bin
for i in {2..5};do scp -p /etc/profile node0${i}:/etc/profile;done
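A quick sanity check on node01, assuming the profile has been sourced (not part of the original steps, just a verification sketch):
source /etc/profile
spark-submit --version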
Cluster configuration
cd $SPARK_HOME/conf
mv spark-env.sh.template spark-env.sh
mv spark-defaults.conf.template spark-defaults.conf
mv workers.template workers
1. Add the following to yarn-site.xml (under $HADOOP_HOME/etc/hadoop)
<property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle,spark_shuffle</value>
</property>
<property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
    <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
    <value>org.apache.spark.network.yarn.YarnShuffleService</value>
</property>
for i in {2..5};do scp -p $HADOOP_HOME/etc/hadoop/yarn-site.xml node0${i}:$HADOOP_HOME/etc/hadoop/;done
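Once YARN is restarted in step 6, you can confirm the shuffle service actually loaded by grepping the NodeManager log (the log path below assumes the default $HADOOP_HOME/logs location; the exact filename varies by host):
grep -i spark_shuffle $HADOOP_HOME/logs/*nodemanager*.log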
2. Copy the matching Spark shuffle jar into Hadoop's library directory
If this jar is missing, the NodeManager service will fail to start once spark_shuffle is configured as an aux-service.
cp $SPARK_HOME/yarn/spark-3.1.2-yarn-shuffle.jar $HADOOP_HOME/share/hadoop/yarn/lib/
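The shuffle jar ships with the Spark distribution itself; if the cp fails, list $SPARK_HOME/yarn/ to confirm the exact filename:
ls $SPARK_HOME/yarn/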
3. Enable Spark event logging and the history/shuffle services
cat > spark-env.sh <<-EOF
export JAVA_HOME=/usr/java/default
export HADOOP_HOME=/opt/bigdata/hadoop/current
export HADOOP_CONF_DIR=\$HADOOP_HOME/etc/hadoop/
export YARN_CONF_DIR=\$HADOOP_HOME/etc/hadoop/
EOF
cat > spark-defaults.conf <<-EOF
spark.shuffle.service.enabled true
# Spark history server settings
spark.eventLog.enabled true
spark.yarn.historyServer.address node03:18080
spark.history.ui.port 18080
spark.eventLog.dir hdfs://mycluster/spark_log
spark.history.fs.logDirectory hdfs://mycluster/spark_log
spark.history.retainedApplications 30
# Keep the Spark jars on HDFS to shorten job startup; enable only one of the two options below
#spark.yarn.jars hdfs://mycluster/libs/spark_lib/jars/*
spark.yarn.archive hdfs://mycluster/libs/spark_lib/lib_archive/spark-3.1.2-bin-hadoop3.2.zip
EOF
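The event-log directory referenced above must already exist on HDFS, otherwise jobs and the history server will fail to write or read it; create it once (path taken from the config above):
hdfs dfs -mkdir -p /spark_log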
for i in {2..5};do scp spark-env.sh spark-defaults.conf node0${i}:`pwd`;done
(The full-directory sync in step 5 also distributes these files; this scp only succeeds once the remote directories exist.)
4. Upload the jars to HDFS to speed up job startup
cd $SPARK_HOME
Option 1: upload the unpacked jars directory (matches the commented-out spark.yarn.jars entry):
hdfs dfs -mkdir -p /libs/spark_lib/
hdfs dfs -put jars /libs/spark_lib/
Option 2: upload a zip archive (matches the spark.yarn.archive entry used above):
cd $SPARK_HOME/jars
zip spark-3.1.2-bin-hadoop3.2.zip ./*
hdfs dfs -mkdir -p /libs/spark_lib/lib_archive/
hdfs dfs -put spark-3.1.2-bin-hadoop3.2.zip /libs/spark_lib/lib_archive/
rm -f spark-3.1.2-bin-hadoop3.2.zip
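Either layout can be confirmed against the paths spark-defaults.conf expects (a quick listing, truncated with head):
hdfs dfs -ls -R /libs/spark_lib/ | head -n 20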
5. Sync the Spark installation to the other nodes
cd /opt/bigdata/
[god@node01 bigdata]$ tar czf spark.tgz spark
for i in {2..5};do scp -p spark.tgz node0${i}:`pwd` ;done
for i in {2..5};do ssh node0${i} "cd /opt/bigdata/ && tar xf spark.tgz && rm -f spark.tgz" ;done
Every NodeManager node must also have the Spark shuffle jar copied into Hadoop's library directory:
for i in {2..5};do ssh node0${i} "cp $SPARK_HOME/yarn/spark-3.1.2-yarn-shuffle.jar $HADOOP_HOME/share/hadoop/yarn/lib/ "; done
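A quick cluster-wide check that the shuffle jar is in place on every NodeManager (reuses the node numbering above):
for i in {2..5};do ssh node0${i} "ls $HADOOP_HOME/share/hadoop/yarn/lib/ | grep yarn-shuffle";done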
6. Restart YARN
[god@node01 hadoop]$ stop-yarn.sh
[god@node01 hadoop]$ start-yarn.sh
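After the restart, confirm every NodeManager re-registered; a node that stays down usually means the shuffle jar from step 2 is missing there:
yarn node -list -all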
7. Start the Spark history server; node03 is used here, but any node works as long as it matches the spark.yarn.historyServer.address configured above.
[god@node03 current]$ $SPARK_HOME/sbin/start-history-server.sh
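A minimal check that it came up (the Spark history server shows as HistoryServer in jps, and serves its UI on port 18080 per the config above):
jps | grep HistoryServer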
8. Test with spark-shell
$SPARK_HOME/bin/spark-shell --master yarn
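For a non-interactive smoke test, the bundled SparkPi example exercises the full YARN submit path; the jar name below assumes the stock Spark 3.1.2 (Scala 2.12) distribution:
$SPARK_HOME/bin/spark-submit --master yarn --deploy-mode cluster \
  --class org.apache.spark.examples.SparkPi \
  $SPARK_HOME/examples/jars/spark-examples_2.12-3.1.2.jar 100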