feat(spark): add spark dockerbit
Souheil-Yazji committed Nov 14, 2023
1 parent 14c2201 commit 7c69f9c
Showing 6 changed files with 549 additions and 3 deletions.
5 changes: 4 additions & 1 deletion Makefile
@@ -16,6 +16,8 @@ DOCKER-STACKS-UPSTREAM-TAG := ed2908bbb62e
tensorflow-CUDA := 11.8.0
pytorch-CUDA := 11.8.0

+SPARK := main

# https://stackoverflow.com/questions/5917413/concatenate-multiple-files-but-include-filename-as-section-headers
CAT := awk '(FNR==1){print "\n\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n\#\#\# " FILENAME "\n\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"}1'
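
The CAT helper prints a commented banner naming each input file before its contents, which is how the concatenated Dockerfiles keep visible section markers. A minimal sketch of the behaviour (banner shortened, file names illustrative):

    $ awk '(FNR==1){print "\n###\n### " FILENAME "\n###"}1' a.Dockerfile b.Dockerfile

    ###
    ### a.Dockerfile
    ###
    FROM scratch
    ...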

@@ -64,7 +66,7 @@ generate-CUDA:
bash scripts/get-nvidia-stuff.sh $(pytorch-CUDA) > $(SRC)/1_CUDA-$(pytorch-CUDA).Dockerfile

generate-Spark:
-bash scripts/get-spark-stuff.sh --commit $(COMMIT) > $(SRC)/2_Spark.Dockerfile
+bash scripts/get-spark-stuff.sh --commit $(SPARK) > $(SRC)/2_Spark.Dockerfile
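
With SPARK pinned to main above, the Spark docker-bit can be regenerated with a plain make call, or pinned to another upstream ref via a standard make variable override (the ref below is illustrative):

    make generate-Spark                   # uses SPARK := main
    make generate-Spark SPARK=v0.1.0      # hypothetical one-off ref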

###################################
###### Dockerfile Management ######
@@ -148,6 +150,7 @@ jupyterlab: pytorch tensorflow cpu
cp -r resources/common/. $(OUT)/$@-$${type}/; \
$(CAT) \
$(TMP)/$${type}.Dockerfile \
+$(SRC)/2_Spark.Dockerfile \
$(SRC)/3_Kubeflow.Dockerfile \
$(SRC)/4_CLI.Dockerfile \
$(SRC)/5_DB-Drivers.Dockerfile \
132 changes: 132 additions & 0 deletions docker-bits/2_Spark.Dockerfile
@@ -0,0 +1,132 @@
# Spark stuff

###########################
### pyspark-notebook
###########################
# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/pyspark-notebook/Dockerfile

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
ARG REGISTRY=quay.io
ARG OWNER=jupyter


# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lower case to distinguish them from ENV)
ARG spark_version="3.5.0"
ARG hadoop_version="3"
ARG scala_version
ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
ARG openjdk_version="17"

ENV APACHE_SPARK_VERSION="${spark_version}" \
HADOOP_VERSION="${hadoop_version}"

RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
"openjdk-${openjdk_version}-jre-headless" \
ca-certificates-java && \
apt-get clean && rm -rf /var/lib/apt/lists/*

# Spark installation
WORKDIR /tmp

# Old Spark versions have to be downloaded from https://archive.apache.org/dist/,
# but that archive is slower, so we use the recommended download site instead
RUN if [ -z "${scala_version}" ]; then \
curl --progress-bar --location --output "spark.tgz" \
"https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"; \
else \
curl --progress-bar --location --output "spark.tgz" \
"https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"; \
fi && \
echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
rm "spark.tgz"

# Configure Spark
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
PATH="${PATH}:${SPARK_HOME}/bin"

RUN if [ -z "${scala_version}" ]; then \
ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}"; \
else \
ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" "${SPARK_HOME}"; \
fi && \
# Link spark-config.sh into the before-notebook hook so PYTHONPATH is sourced automatically && \
ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh

# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
RUN fix-permissions "/etc/ipython/"

USER ${NB_UID}

# Install pyarrow
# NOTE: It's important to ensure compatibility between Pandas versions.
# The pandas version in this Dockerfile should match the version
# on which the Pandas API for Spark is built.
# To find the right version:
# 1. Check out the Spark branch you are on.
# 2. Find the pandas version in the file spark/dev/infra/Dockerfile.
RUN mamba install --yes \
'grpcio-status' \
'grpcio' \
'pandas=2.0.3' \
'pyarrow' && \
mamba clean --all -f -y && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"

WORKDIR "${HOME}"
EXPOSE 4040
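
# 4040 is the Spark application UI, live while a SparkContext is running; to
# reach it from a local test of the image, publish it alongside the notebook
# port (image tag illustrative):
#   docker run -p 8888:8888 -p 4040:4040 jupyterlab-cpu:latest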

###########################
### all-spark-notebook
###########################
# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/all-spark-notebook/Dockerfile

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
ARG REGISTRY=quay.io
ARG OWNER=jupyter


# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# RSpark config
ENV R_LIBS_USER "${SPARK_HOME}/R/lib"
RUN fix-permissions "${R_LIBS_USER}"

# R pre-requisites
RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
fonts-dejavu \
gfortran \
gcc && \
apt-get clean && rm -rf /var/lib/apt/lists/*

USER ${NB_UID}

# R packages, including IRkernel, which is installed globally.
RUN mamba install --yes \
'r-base' \
'r-ggplot2' \
'r-irkernel' \
'r-rcurl' \
'r-sparklyr' && \
mamba clean --all -f -y && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"
137 changes: 137 additions & 0 deletions output/jupyterlab-cpu/Dockerfile
@@ -28,6 +28,143 @@ RUN apt-get update --yes \
&& rm -rf /var/lib/apt/lists/* \
&& chmod +x /usr/bin/clean-layer.sh

###############################
### docker-bits/2_Spark.Dockerfile
###############################

# The generated output splices in docker-bits/2_Spark.Dockerfile verbatim;
# the 132 lines are identical to the listing above and are not repeated here.

###############################
### docker-bits/3_Kubeflow.Dockerfile
###############################
(diff for the remaining changed files not shown)
