Commit 7c69f9c (1 parent: 14c2201). Showing 6 changed files with 549 additions and 3 deletions.
@@ -0,0 +1,132 @@
# Spark stuff

###########################
### pyspark-notebook
###########################
# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/pyspark-notebook/Dockerfile

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
ARG REGISTRY=quay.io
ARG OWNER=jupyter


# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# Spark dependencies
# Default values can be overridden at build time
# (ARGS are in lower case to distinguish them from ENV)
ARG spark_version="3.5.0"
ARG hadoop_version="3"
ARG scala_version
ARG spark_checksum="8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319"
ARG openjdk_version="17"
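
# These defaults can be overridden at build time. An illustrative invocation
# (the image tag and values are examples, not part of the original file); note
# that spark_checksum must be updated to match any non-default spark_version
# or scala_version:
#   docker build \
#     --build-arg spark_version=3.5.0 \
#     --build-arg scala_version=2.13 \
#     --build-arg openjdk_version=17 \
#     -t my-pyspark-notebook .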

ENV APACHE_SPARK_VERSION="${spark_version}" \
    HADOOP_VERSION="${hadoop_version}"

RUN apt-get update --yes && \
    apt-get install --yes --no-install-recommends \
    "openjdk-${openjdk_version}-jre-headless" \
    ca-certificates-java && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

# Spark installation
WORKDIR /tmp

# You need to use the https://archive.apache.org/dist/ website if you want to download old Spark versions,
# but it seems to be slower, so we use the recommended download site instead.
RUN if [ -z "${scala_version}" ]; then \
    curl --progress-bar --location --output "spark.tgz" \
        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"; \
    else \
    curl --progress-bar --location --output "spark.tgz" \
        "https://dlcdn.apache.org/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}.tgz"; \
    fi && \
    echo "${spark_checksum} *spark.tgz" | sha512sum -c - && \
    tar xzf "spark.tgz" -C /usr/local --owner root --group root --no-same-owner && \
    rm "spark.tgz"

# Configure Spark
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
    PATH="${PATH}:${SPARK_HOME}/bin"

RUN if [ -z "${scala_version}" ]; then \
    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" "${SPARK_HOME}"; \
    else \
    ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${scala_version}" "${SPARK_HOME}"; \
    fi && \
    # Add a link in the before_notebook hook so that PYTHONPATH is sourced automatically && \
    ln -s "${SPARK_HOME}/sbin/spark-config.sh" /usr/local/bin/before-notebook.d/spark-config.sh

# Configure IPython system-wide
COPY ipython_kernel_config.py "/etc/ipython/"
RUN fix-permissions "/etc/ipython/"

USER ${NB_UID}

# Install pyarrow
# NOTE: It's important to ensure compatibility between Pandas versions.
# The pandas version in this Dockerfile should match the version
# on which the Pandas API for Spark is built.
# To find the right version:
# 1. Check out the Spark branch you are on.
# 2. Find the pandas version in the file spark/dev/infra/Dockerfile.
RUN mamba install --yes \
    'grpcio-status' \
    'grpcio' \
    'pandas=2.0.3' \
    'pyarrow' && \
    mamba clean --all -f -y && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"

WORKDIR "${HOME}"
EXPOSE 4040
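
# Port 4040 is the Spark application web UI. An illustrative run command that
# publishes it alongside the default Jupyter port 8888 (the image name is an
# example):
#   docker run -p 8888:8888 -p 4040:4040 my-pyspark-notebook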

###########################
### all-spark-notebook
###########################
# https://raw.githubusercontent.com/jupyter/docker-stacks/main/images/all-spark-notebook/Dockerfile

# Copyright (c) Jupyter Development Team.
# Distributed under the terms of the Modified BSD License.
ARG REGISTRY=quay.io
ARG OWNER=jupyter


# Fix: https://github.com/hadolint/hadolint/wiki/DL4006
# Fix: https://github.com/koalaman/shellcheck/wiki/SC3014
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

USER root

# RSpark config
ENV R_LIBS_USER="${SPARK_HOME}/R/lib"
RUN fix-permissions "${R_LIBS_USER}"

# R pre-requisites
RUN apt-get update --yes && \
    apt-get install --yes --no-install-recommends \
    fonts-dejavu \
    gfortran \
    gcc && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

USER ${NB_UID}

# R packages, including IRkernel, which gets installed globally.
RUN mamba install --yes \
    'r-base' \
    'r-ggplot2' \
    'r-irkernel' \
    'r-rcurl' \
    'r-sparklyr' && \
    mamba clean --all -f -y && \
    fix-permissions "${CONDA_DIR}" && \
    fix-permissions "/home/${NB_USER}"