From 5cc0ede4aba1d60aa6f66f831722f0d08ca8efad Mon Sep 17 00:00:00 2001 From: colben Date: Sun, 29 Mar 2026 00:26:11 +0800 Subject: [PATCH] update --- content/post/hdp2.md | 9 +++- content/post/spark.md | 100 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 content/post/spark.md diff --git a/content/post/hdp2.md b/content/post/hdp2.md index 2c1fe47..32fc329 100644 --- a/content/post/hdp2.md +++ b/content/post/hdp2.md @@ -76,7 +76,7 @@ Rocky9 | hdp-slave11 | 192.168.8.11/24 | /data/hdp-dn | Datanode, NodeManager ```bash echo 'export HADOOP_HOME=/opt/hdp' > /etc/profile.d/hdp.sh echo 'export PATH=$HADOOP_HOME/bin:$PATH' >> /etc/profile.d/hdp.sh - # 不推荐把 $HADOOP_HOME/sbin 加入环境变量 PATH. + # 不推荐把 $HADOOP_HOME/sbin 加入环境变量 PATH,避免与 spark 冲突 source /etc/profile.d/hdp.sh ``` @@ -194,10 +194,15 @@ Rocky9 | hdp-slave11 | 192.168.8.11/24 | /data/hdp-dn | Datanode, NodeManager 604800 - + yarn.nodemanager.vmem-check-enabled false + + + yarn.nodemanager.pmem-check-enabled + false + ``` diff --git a/content/post/spark.md b/content/post/spark.md new file mode 100644 index 0000000..2aa4068 --- /dev/null +++ b/content/post/spark.md @@ -0,0 +1,100 @@ +--- +title: "spark on yarn 部署" +date: 2023-05-23T12:00:00+08:00 +lastmod: 2024-07-19T17:00:00+08:00 +keywords: [] +tags: ["hadoop", "spark"] +categories: ["hadoop"] +--- + +## 环境 +操作系统 | 主机名 | 地址 | 运行组件 +---- | ---- | ---- | ---- +Rocky9 | hdp-nn | 192.168.8.1/24 | Namenode +Rocky9 | hdp-snn | 192.168.8.2/24 | SecondaryNamenode +Rocky9 | hdp-rm | 192.168.8.3/24 | ResourceManager +Rocky9 | hdp-slave10 | 192.168.8.10/24 | Datanode, NodeManager +Rocky9 | hdp-slave11 | 192.168.8.11/24 | Datanode, NodeManager + +## 前提 +- [已部署好 hadoop 2.10](/post/hdp2) + +## 部署 spark 环境 +- 在**全部主机**上执行如下操作 +- 下载 spark-3.3.4-bin-hadoop2.tgz,解压 + ```bash + curl -LO https://archive.apache.org/dist/spark/spark-3.3.4/spark-3.3.4-bin-hadoop2.tgz + tar zxf spark-3.3.4-bin-hadoop2.tgz + mv 
spark-3.3.4-bin-hadoop2 /opt/spark + ``` + +- 配置 spark 环境变量 + ```bash + echo 'export SPARK_HOME=/opt/spark' > /etc/profile.d/spark.sh + echo 'export PATH=$SPARK_HOME/bin:$PATH' >> /etc/profile.d/spark.sh + # 不推荐把 $SPARK_HOME/sbin 加入环境变量 PATH,避免与 hadoop 冲突 + source /etc/profile.d/spark.sh + ``` + +### 修改 yarn-site.xml +- 在**全部主机**上[关闭 yarn 虚拟内存检查](/post/hdp2/#修改-yarn-sitexml) + +### 修改 capacity-scheduler.xml +- 在**全部主机**上执行如下操作 +- 编辑 $HADOOP_HOME/etc/hadoop/capacity-scheduler.xml,修改内容如下 + ```xml + + + yarn.scheduler.capacity.resource-calculator + + org.apache.hadoop.yarn.util.resource.DominantResourceCalculator + + + + ``` + +### 创建 spark-defaults.conf +- 在**全部主机**上执行如下操作 +- 创建 $SPARK_HOME/conf/spark-defaults.conf,参考内容如下 + ``` + spark.master yarn + spark.eventLog.enabled true + spark.eventLog.dir hdfs://hdp-nn:8020/spark-logs + ``` + +### 修改 spark-env.sh +- 在**全部主机**上执行如下操作 +- 创建 $SPARK_HOME/conf/spark-env.sh,参考内容如下 + ```bash + export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop + export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.fs.logDirectory=hdfs://hdp-nn:8020/spark-logs -Dspark.history.retainedApplications=30" + ``` + +### 创建 spark 日志目录 +- 在 **hdp-nn** 上创建 spark 日志目录 + ```bash + hdfs dfs -mkdir /spark-logs + ``` + +## 启动 spark 日志服务 +- 在**任一主机**上启动日志服务 + ```bash + /opt/spark/sbin/start-history-server.sh + ``` + +## 发布任务 +- 客户端模式 + ```bash + spark-shell + ``` + +- 集群模式 + ```bash + spark-submit \ + --class org.apache.spark.examples.SparkPi \ + --deploy-mode cluster \ + $SPARK_HOME/examples/jars/spark-examples_2.12-3.3.4.jar + ``` + +- 浏览器访问 http://{spark 日志服务器}:18080 查看任务进度 + 