From 75cef9d8c6c2ca23331fa542048bdce8e8ffc756 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 May 2023 07:07:38 +0000 Subject: [PATCH 01/14] enable yarn ci Signed-off-by: minmingzhu --- .github/workflows/ci-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 96018b1fa..a3597bce7 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -50,7 +50,6 @@ jobs: run: | ${{github.workspace}}/dev/ci/ci-local-test.sh yarn-test: - if: ${{ false }} # disable for now name: Yarn Test for Examples (CPU) runs-on: ubuntu-20.04 steps: From 4caa6cc90e5b0e442c3553212ccc51858cc2c7b2 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 23 May 2023 08:41:22 +0000 Subject: [PATCH 02/14] update Signed-off-by: minmingzhu --- dev/test-cluster/yarn/setup-cluster.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/test-cluster/yarn/setup-cluster.sh b/dev/test-cluster/yarn/setup-cluster.sh index d57edf90c..4b3281d3e 100755 --- a/dev/test-cluster/yarn/setup-cluster.sh +++ b/dev/test-cluster/yarn/setup-cluster.sh @@ -40,6 +40,7 @@ cp ./yarn-site.xml ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/ cp ./hadoop-env.sh ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/ cp ../log4j.properties ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/conf cp ./spark-defaults.conf ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/conf +cp ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/yarn/spark-*-yarn-shuffle.jar ~/opt/hadoop-$HADOOP_VERSION/share/hadoop/yarn/lib/ echo $HOST_IP > $HADOOP_HOME/etc/hadoop/slaves echo $HOST_IP > $SPARK_HOME/conf/slaves From 14f749fa79b117bff20db91b73792fb59d6ed01c Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 24 May 2023 07:35:17 +0000 Subject: [PATCH 03/14] debugging Signed-off-by: minmingzhu --- dev/ci/ci-yarn-test.sh | 7 +++---- dev/test-cluster/yarn/setup-cluster.sh | 8 ++++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dev/ci/ci-yarn-test.sh b/dev/ci/ci-yarn-test.sh index e9a93aee3..b66710171 100755 --- a/dev/ci/ci-yarn-test.sh +++ b/dev/ci/ci-yarn-test.sh @@ -34,7 +34,6 @@ echo "=========================================" echo "Cluster Testing with Spark Version: $SPARK_VERSION" echo "=========================================" -# Build and run all examples -./build-all-scala.sh -./run-all-scala.sh -./run-all-pyspark.sh +# Build and run all examplesdebug#./build-all-scala.sh +#./run-all-scala.sh +#./run-all-pyspark.sh diff --git a/dev/test-cluster/yarn/setup-cluster.sh b/dev/test-cluster/yarn/setup-cluster.sh index 4b3281d3e..7606f5732 100755 --- a/dev/test-cluster/yarn/setup-cluster.sh +++ b/dev/test-cluster/yarn/setup-cluster.sh @@ -58,6 +58,14 @@ $HADOOP_HOME/bin/hdfs namenode -format $HADOOP_HOME/sbin/start-dfs.sh $HADOOP_HOME/sbin/start-yarn.sh +jps +free -g +df -h +yarn application -list +ls -ls $HADOOP_HOME/logs/ +cat $HADOOP_HOME/logs/hadoop-*-resourcemanager-*.log +cat $HADOOP_HOME/logs/hadoop-*-nodemanager-*.log + hadoop fs -ls / yarn node -list From c78c8d3d9fca13208a9ec9e9af2cead12b264964 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 24 May 2023 08:10:16 +0000 Subject: [PATCH 04/14] debugging Signed-off-by: minmingzhu --- dev/test-cluster/yarn/setup-cluster.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/test-cluster/yarn/setup-cluster.sh b/dev/test-cluster/yarn/setup-cluster.sh index 7606f5732..18e9a6e15 100755 --- a/dev/test-cluster/yarn/setup-cluster.sh +++ b/dev/test-cluster/yarn/setup-cluster.sh @@ -54,6 +54,8 @@ mkdir -p /tmp/run/hdfs/datanode # hdfs format $HADOOP_HOME/bin/hdfs namenode -format +wget -P $HADOOP_HOME/share/hadoop/yarn/lib/ https://repo1.maven.org/maven2/javax/activation/activation/1.1.1/activation-1.1.1.jar + # start hdfs and yarn $HADOOP_HOME/sbin/start-dfs.sh $HADOOP_HOME/sbin/start-yarn.sh From 78ef6b2b0b71b7ec80e8a774df1b3224fcf9e5a3 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 24 May 2023 08:40:07 +0000 Subject: [PATCH 05/14] debugging Signed-off-by: minmingzhu --- dev/ci/ci-yarn-test.sh | 2 +- examples/run-all-scala.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/ci/ci-yarn-test.sh b/dev/ci/ci-yarn-test.sh index b66710171..55bb36b30 100755 --- a/dev/ci/ci-yarn-test.sh +++ b/dev/ci/ci-yarn-test.sh @@ -35,5 +35,5 @@ echo "Cluster Testing with Spark Version: $SPARK_VERSION" echo "=========================================" # Build and run all examplesdebug#./build-all-scala.sh -#./run-all-scala.sh +./run-all-scala.sh #./run-all-pyspark.sh diff --git a/examples/run-all-scala.sh b/examples/run-all-scala.sh index 04bab7f8a..241a861ca 100755 --- a/examples/run-all-scala.sh +++ b/examples/run-all-scala.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -exampleDirs=(kmeans pca als naive-bayes linear-regression correlation summarizer) +exampleDirs=(kmeans) for dir in ${exampleDirs[*]} do From 8586be6340fd3f8e265b9ca59a5ae343c364b4a0 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 25 May 2023 03:08:23 +0000 Subject: [PATCH 06/14] debugging Signed-off-by: minmingzhu --- dev/install-build-deps-ubuntu.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index 8ae9a4e2c..e4d417418 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -8,6 +8,7 @@ if [ ! -d /opt/intel/oneapi ]; then | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list sudo apt update + sudo -E apt-cache pkgnames intel | grep intel-oneapi-runtime sudo apt-get install -y intel-oneapi-ccl-devel-2021.8.0 \ intel-oneapi-tbb-common-devel-2021.8.0 intel-oneapi-tbb-devel-2021.8.0 \ intel-oneapi-mpi-devel-2021.8.0 \ From 787bc1770c730ba07042bc46f32defeb42a7d085 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 25 May 2023 03:18:47 +0000 Subject: [PATCH 07/14] debugging Signed-off-by: minmingzhu --- dev/ci/ci-build-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/ci/ci-build-test.sh b/dev/ci/ci-build-test.sh index 104820d7b..0cd37f6dc 100755 --- a/dev/ci/ci-build-test.sh +++ b/dev/ci/ci-build-test.sh @@ -8,6 +8,7 @@ trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG # echo an error message before exiting trap 'echo "\"${last_command}\" command filed with exit code $?."' EXIT +rm -rf /opt/intel/oneapi # Install dependencies for building $GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh From ad2a2759ec09f17e2fb26df33b4d78927033a1df Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 25 May 2023 06:23:33 +0000 Subject: [PATCH 08/14] debugging Signed-off-by: minmingzhu --- dev/ci/ci-yarn-test.sh | 1 + examples/build-all-scala.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/ci/ci-yarn-test.sh b/dev/ci/ci-yarn-test.sh index 55bb36b30..8b1a2f5c9 100755 --- a/dev/ci/ci-yarn-test.sh +++ b/dev/ci/ci-yarn-test.sh @@ -35,5 +35,6 @@ echo "Cluster Testing with Spark Version: $SPARK_VERSION" echo "=========================================" # Build and run all examplesdebug#./build-all-scala.sh +./build-all-scala.sh ./run-all-scala.sh #./run-all-pyspark.sh diff --git a/examples/build-all-scala.sh b/examples/build-all-scala.sh index 8babb9d97..ad507e031 100755 --- a/examples/build-all-scala.sh +++ b/examples/build-all-scala.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -exampleDirs=(kmeans pca als naive-bayes linear-regression correlation summarizer) +exampleDirs=(kmeans) for dir in ${exampleDirs[*]} do From 35507b7f2f0a3ffc952d2ad0640db0a2328a6841 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 25 May 2023 06:54:35 +0000 Subject: [PATCH 09/14] debugging Signed-off-by: minmingzhu --- dev/test-cluster/yarn/hadoop-env.sh | 2 ++ dev/test-cluster/yarn/yarn-site.xml | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/dev/test-cluster/yarn/hadoop-env.sh b/dev/test-cluster/yarn/hadoop-env.sh index f60b65a0b..f6e93a3ce 100755 --- a/dev/test-cluster/yarn/hadoop-env.sh +++ b/dev/test-cluster/yarn/hadoop-env.sh @@ -96,3 +96,5 @@ export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR} # A string representing this instance of hadoop. $USER by default. export HADOOP_IDENT_STRING=$USER +source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu +export FI_TCP_IFACE=eth0 diff --git a/dev/test-cluster/yarn/yarn-site.xml b/dev/test-cluster/yarn/yarn-site.xml index ff74d23a7..f973bbdef 100644 --- a/dev/test-cluster/yarn/yarn-site.xml +++ b/dev/test-cluster/yarn/yarn-site.xml @@ -63,5 +63,8 @@ yarn.scheduler.maximum-allocation-vcores 2 - + + yarn.nodemanager.env-whitelist + JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,HADOOP_YARN_HOME,LD_LIBRARY_PATH + From 0344734d8a33c8a0c59a25bd02e6e4395e2ae14d Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 25 May 2023 07:38:59 +0000 Subject: [PATCH 10/14] debugging Signed-off-by: minmingzhu --- dev/ci/ci-build-test.sh | 1 - dev/test-cluster/yarn/env.sh | 2 +- dev/test-cluster/yarn/yarn-site.xml | 8 ++++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/dev/ci/ci-build-test.sh b/dev/ci/ci-build-test.sh index 0cd37f6dc..104820d7b 100755 --- a/dev/ci/ci-build-test.sh +++ b/dev/ci/ci-build-test.sh @@ -8,7 +8,6 @@ trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG # echo an error message before exiting trap 'echo "\"${last_command}\" command filed with exit code $?."' EXIT -rm -rf /opt/intel/oneapi # Install dependencies for building $GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh diff --git a/dev/test-cluster/yarn/env.sh b/dev/test-cluster/yarn/env.sh index 3430d4c3e..13d5cb6bf 100755 --- a/dev/test-cluster/yarn/env.sh +++ b/dev/test-cluster/yarn/env.sh @@ -38,7 +38,7 @@ else fi # Set Spark resources, can be overwritten in example -SPARK_DRIVER_MEMORY=1G +SPARK_DRIVER_MEMORY=512M SPARK_NUM_EXECUTORS=2 SPARK_EXECUTOR_CORES=1 SPARK_EXECUTOR_MEMORY=1G diff --git a/dev/test-cluster/yarn/yarn-site.xml b/dev/test-cluster/yarn/yarn-site.xml index f973bbdef..b843e7630 100644 --- a/dev/test-cluster/yarn/yarn-site.xml +++ b/dev/test-cluster/yarn/yarn-site.xml @@ -33,7 +33,7 @@ yarn.nodemanager.resource.memory-mb - 7168 + 3072 yarn.nodemanager.resource.cpu-vcores @@ -49,11 +49,11 @@ yarn.scheduler.minimum-allocation-mb - 1024 + 256 yarn.scheduler.maximum-allocation-mb - 7168 + 3072 yarn.scheduler.minimum-allocation-vcores @@ -65,6 +65,6 @@ yarn.nodemanager.env-whitelist - JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,HADOOP_YARN_HOME,LD_LIBRARY_PATH + JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,HADOOP_YARN_HOME,LD_LIBRARY_PATH,FI_TCP_IFACE,CMPLR_ROOT,DAALROOT,CCL_ROOT From 39c32eb8920527f145c9cae8da3849df6d290e0d Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 25 May 2023 08:20:16 +0000 Subject: [PATCH 11/14] debugging Signed-off-by: minmingzhu --- dev/test-cluster/yarn/load-spark-envs.sh | 1 + dev/test-cluster/yarn/spark-defaults.conf | 4 ++-- dev/test-cluster/yarn/yarn-site.xml | 6 +++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dev/test-cluster/yarn/load-spark-envs.sh b/dev/test-cluster/yarn/load-spark-envs.sh index 7ee0cb452..19c7ad6b7 100755 --- a/dev/test-cluster/yarn/load-spark-envs.sh +++ b/dev/test-cluster/yarn/load-spark-envs.sh @@ -15,5 +15,6 @@ export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH export PYSPARK_PYTHON=python3 export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH +export FI_TCP_IFACE=eth0 set +x diff --git a/dev/test-cluster/yarn/spark-defaults.conf b/dev/test-cluster/yarn/spark-defaults.conf index 04ed10b97..0bbc7b17c 100644 --- a/dev/test-cluster/yarn/spark-defaults.conf +++ b/dev/test-cluster/yarn/spark-defaults.conf @@ -28,7 +28,7 @@ spark.master yarn spark.serializer org.apache.spark.serializer.KryoSerializer -spark.driver.memory 1g +spark.driver.memory 512m spark.executor.num 2 spark.executor.cores 1 -spark.executor.memory 2g +spark.executor.memory 1g diff --git a/dev/test-cluster/yarn/yarn-site.xml b/dev/test-cluster/yarn/yarn-site.xml index b843e7630..9f6627bef 100644 --- a/dev/test-cluster/yarn/yarn-site.xml +++ b/dev/test-cluster/yarn/yarn-site.xml @@ -43,9 +43,13 @@ yarn.nodemanager.vmem-check-enabled false + + yarn.nodemanager.pmem-check-enabled + false + yarn.nodemanager.vmem-pmem-ratio - 2 + 1 yarn.scheduler.minimum-allocation-mb From 663c045c0b8a3cc8bbd41f98ed472d31c240f7dd Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Thu, 25 May 2023 09:16:08 +0000 Subject: [PATCH 12/14] debugging Signed-off-by: minmingzhu --- dev/test-cluster/yarn/env.sh | 4 ++-- dev/test-cluster/yarn/spark-defaults.conf | 7 ------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/dev/test-cluster/yarn/env.sh b/dev/test-cluster/yarn/env.sh index 13d5cb6bf..8a386b078 100755 --- a/dev/test-cluster/yarn/env.sh +++ b/dev/test-cluster/yarn/env.sh @@ -38,10 +38,10 @@ else fi # Set Spark resources, can be overwritten in example -SPARK_DRIVER_MEMORY=512M +SPARK_DRIVER_MEMORY=256M SPARK_NUM_EXECUTORS=2 SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=1G +SPARK_EXECUTOR_MEMORY=256M SPARK_TOTAL_CORES=$((SPARK_NUM_EXECUTORS * SPARK_EXECUTOR_CORES)) SPARK_DEFAULT_PARALLELISM=$((SPARK_TOTAL_CORES * 2)) diff --git a/dev/test-cluster/yarn/spark-defaults.conf b/dev/test-cluster/yarn/spark-defaults.conf index 0bbc7b17c..19cba6e71 100644 --- a/dev/test-cluster/yarn/spark-defaults.conf +++ b/dev/test-cluster/yarn/spark-defaults.conf @@ -25,10 +25,3 @@ # spark.serializer org.apache.spark.serializer.KryoSerializer # spark.driver.memory 5g # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" - -spark.master yarn -spark.serializer org.apache.spark.serializer.KryoSerializer -spark.driver.memory 512m -spark.executor.num 2 -spark.executor.cores 1 -spark.executor.memory 1g From 29b6acb77bf3d2876229371acd745c33951366ce Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 26 May 2023 07:36:54 +0000 Subject: [PATCH 13/14] debugging Signed-off-by: minmingzhu --- dev/test-cluster/yarn/env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/test-cluster/yarn/env.sh b/dev/test-cluster/yarn/env.sh index 8a386b078..1ca3d6e0e 100755 --- a/dev/test-cluster/yarn/env.sh +++ b/dev/test-cluster/yarn/env.sh @@ -38,7 +38,7 @@ else fi # Set Spark resources, can be overwritten in example -SPARK_DRIVER_MEMORY=256M +SPARK_DRIVER_MEMORY=512M SPARK_NUM_EXECUTORS=2 SPARK_EXECUTOR_CORES=1 SPARK_EXECUTOR_MEMORY=256M From 544f09d765292eebedef09f158152e264050d248 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 26 May 2023 07:53:34 +0000 Subject: [PATCH 14/14] debugging Signed-off-by: minmingzhu --- dev/test-cluster/yarn/env.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/test-cluster/yarn/env.sh b/dev/test-cluster/yarn/env.sh index 1ca3d6e0e..94d468bc7 100755 --- a/dev/test-cluster/yarn/env.sh +++ b/dev/test-cluster/yarn/env.sh @@ -41,7 +41,7 @@ fi SPARK_DRIVER_MEMORY=512M SPARK_NUM_EXECUTORS=2 SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=256M +SPARK_EXECUTOR_MEMORY=512M SPARK_TOTAL_CORES=$((SPARK_NUM_EXECUTORS * SPARK_EXECUTOR_CORES)) SPARK_DEFAULT_PARALLELISM=$((SPARK_TOTAL_CORES * 2))