feat(spark): add spark dockerbit
Souheil-Yazji committed Nov 14, 2023
1 parent 14c2201 commit 7c69f9c
Showing 6 changed files with 549 additions and 3 deletions.
5 changes: 4 additions & 1 deletion Makefile
@@ -16,6 +16,8 @@ DOCKER-STACKS-UPSTREAM-TAG := ed2908bbb62e
tensorflow-CUDA := 11.8.0
pytorch-CUDA := 11.8.0

+SPARK := main

# https://stackoverflow.com/questions/5917413/concatenate-multiple-files-but-include-filename-as-section-headers
CAT := awk '(FNR==1){print "\n\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n\#\#\# " FILENAME "\n\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\#\n"}1'
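
The CAT helper prints a commented banner naming each input file before its contents, which is how the concatenated Dockerfiles keep visible section markers. A minimal sketch of the behaviour (banner shortened, file names illustrative):

    $ awk '(FNR==1){print "\n###\n### " FILENAME "\n###"}1' a.Dockerfile b.Dockerfile

    ###
    ### a.Dockerfile
    ###
    FROM scratch
    ...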

@@ -64,7 +66,7 @@ generate-CUDA:
bash scripts/get-nvidia-stuff.sh $(pytorch-CUDA) > $(SRC)/1_CUDA-$(pytorch-CUDA).Dockerfile

generate-Spark:
-bash scripts/get-spark-stuff.sh --commit $(COMMIT) > $(SRC)/2_Spark.Dockerfile
+bash scripts/get-spark-stuff.sh --commit $(SPARK) > $(SRC)/2_Spark.Dockerfile
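
With SPARK pinned to main above, the Spark docker-bit can be regenerated with a plain make call, or pinned to another upstream ref via a standard make variable override (the ref below is illustrative):

    make generate-Spark                   # uses SPARK := main
    make generate-Spark SPARK=v0.1.0      # hypothetical one-off ref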

###################################
###### Dockerfile Management ######
@@ -148,6 +150,7 @@ jupyterlab: pytorch tensorflow cpu
cp -r resources/common/. $(OUT)/$@-$${type}/; \
$(CAT) \
$(TMP)/$${type}.Dockerfile \
+$(SRC)/2_Spark.Dockerfile \
$(SRC)/3_Kubeflow.Dockerfile \
$(SRC)/4_CLI.Dockerfile \
$(SRC)/5_DB-Drivers.Dockerfile \
132 changes: 132 additions & 0 deletions docker-bits/2_Spark.Dockerfile
@@ -0,0 +1,132 @@
# Spark stuff

###########################
### pyspark-notebook
###########################
# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/pyspark-notebook/Dockerfile

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
ARG REGISTRY=quay.io
ARG OWNER=jupyter


# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lower case to distinguish them from ENV)
ARG spark_version="3.5.0"
ARG hadoop_version="3"
ARG scala_version
ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
ARG openjdk_version="17"

ENV APACHE_SPARK_VERSION="${spark_version}" \
HADOOP_VERSION="${hadoop_version}"

RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
"openjdk-${openjdk_version}-jre-headless" \
ca-certificates-java && \
apt-get clean && rm -rf /var/lib/apt/lists/*

# Spark installation
WORKDIR /tmp

# Old Spark versions have to be downloaded from https://archive.apache.org/dist/,
# but that archive is slower, so we use the recommended download site instead
RUN if [ -z "${scala_version}" ]; then \
curl --progress-bar --location --output "spark.tgz" \
"https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"; \
else \
curl --progress-bar --location --output "spark.tgz" \
"https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"; \
fi && \
echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
rm "spark.tgz"

# Configure Spark
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
PATH="${PATH}:${SPARK_HOME}/bin"

RUN if [ -z "${scala_version}" ]; then \
ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}"; \
else \
ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" "${SPARK_HOME}"; \
fi && \
# Link spark-config.sh into the before-notebook hook so PYTHONPATH is sourced automatically && \
ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh

# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
RUN fix-permissions "/etc/ipython/"

USER ${NB_UID}

# Install pyarrow
# NOTE: It's important to ensure compatibility between Pandas versions.
# The pandas version in this Dockerfile should match the version
# on which the Pandas API for Spark is built.
# To find the right version:
# 1. Check out the Spark branch you are on.
# 2. Find the pandas version in the file spark/dev/infra/Dockerfile.
RUN mamba install --yes \
'grpcio-status' \
'grpcio' \
'pandas=2.0.3' \
'pyarrow' && \
mamba clean --all -f -y && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"

WORKDIR "${HOME}"
EXPOSE 4040
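
# 4040 is the Spark application UI, live while a SparkContext is running; to
# reach it from a local test of the image, publish it alongside the notebook
# port (image tag illustrative):
#   docker run -p 8888:8888 -p 4040:4040 jupyterlab-cpu:latest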

###########################
### all-spark-notebook
###########################
# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/all-spark-notebook/Dockerfile

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
ARG REGISTRY=quay.io
ARG OWNER=jupyter


# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# RSpark config
ENV R_LIBS_USER "${SPARK_HOME}/R/lib"
RUN fix-permissions "${R_LIBS_USER}"

# R pre-requisites
RUN apt-get update --yes && \
apt-get install --yes --no-install-recommends \
fonts-dejavu \
gfortran \
gcc && \
apt-get clean && rm -rf /var/lib/apt/lists/*

USER ${NB_UID}

# R packages, including IRkernel, which is installed globally.
RUN mamba install --yes \
'r-base' \
'r-ggplot2' \
'r-irkernel' \
'r-rcurl' \
'r-sparklyr' && \
mamba clean --all -f -y && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"
137 changes: 137 additions & 0 deletions output/jupyterlab-cpu/Dockerfile
@@ -28,6 +28,143 @@ RUN apt-get update --yes \
&& rm -rf /var/lib/apt/lists/* \
&& chmod +x /usr/bin/clean-layer.sh

###############################
### docker-bits/2_Spark.Dockerfile
###############################

# The generated output splices in docker-bits/2_Spark.Dockerfile verbatim;
# the 132 lines are identical to the listing above and are not repeated here.

###############################
### docker-bits/3_Kubeflow.Dockerfile
###############################
(diff for the remaining changed files not shown)
