Merge pull request #26 from ASFHyP3/g6_gpu
Update GPU dockerfile to fully support GPU workflow
forrestfwilliams authored May 21, 2024
2 parents df2f1a5 + acd9d06 commit c87f10a
Showing 7 changed files with 125 additions and 68 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,19 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [PEP 440](https://www.python.org/dev/peps/pep-0440/)
and uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.5.0]

### Added
* `scripts/ubuntu_setup.sh` for setting up a GPU-based Ubuntu EC2 AMI.
* `scripts/amazon_linux_setup.sh` for setting up a GPU-based Amazon Linux 2023 EC2 AMI.

### Changed
* Refactored `scripts/build_proc.sh` to combine GPU compilation steps.
* Final product zip archive is now always created.

### Fixed
* `Dockerfile.gpu` so that outputs will contain actual data.

## [0.4.0]

### Added
57 changes: 27 additions & 30 deletions Dockerfile.gpu
@@ -1,4 +1,24 @@
FROM nvidia/cuda:12.4.1-devel-ubuntu20.04
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 as builder

# FIXME: should be able to find this dynamically
ARG GPU_ARCH=89

# GPU_ARCH and USEGPU environment variables are used by build_proc.sh
ENV FFTW_LIB=/usr/lib/x86_64-linux-gnu/libfftw3f.a
ENV GPU_ARCH=${GPU_ARCH}
ENV USEGPU=true
ENV DEBIAN_FRONTEND=noninteractive

# FIXME: can remove git after switching back to a released version of back-projection
RUN apt-get update && apt-get install -y --no-install-recommends unzip vim curl git build-essential gfortran libfftw3-dev && \
apt-get clean && rm -rf /var/lib/apt/lists/*

RUN git clone -b main https://github.com/ASFHyP3/back-projection.git
COPY . /hyp3-back-projection/
COPY ./scripts/build_proc.sh ./back-projection
RUN cd /back-projection && ./build_proc.sh && cd /

FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 as runner

# For opencontainers label definitions, see:
# https://github.com/opencontainers/image-spec/blob/master/annotations.md
@@ -11,32 +31,21 @@ LABEL org.opencontainers.image.url="https://github.com/ASFHyP3/hyp3-back-project
LABEL org.opencontainers.image.source="https://github.com/ASFHyP3/hyp3-back-projection"
LABEL org.opencontainers.image.documentation="https://hyp3-docs.asf.alaska.edu"

ARG DEBIAN_FRONTEND=noninteractive
ARG CONDA_UID=1000
ARG CONDA_GID=1000
ARG BACK_PROJECTION_TAG=0.2.0
ARG FFTW_TAG=3.3.9
ARG MINIFORGE_NAME=Miniforge3
ARG MINIFORGE_VERSION=24.3.0-0

# USEGPU environment variable used by build_proc.sh
ENV USEGPU="true"
ENV CONDA_DIR=/opt/conda
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
ENV PATH=${CONDA_DIR}/bin:${PATH}
ENV PYTHONDONTWRITEBYTECODE=true
ENV PROC_HOME=/home/conda/back-projection
ENV PROC_HOME=/back-projection
ENV MYHOME=/home/conda
ENV DEBIAN_FRONTEND=noninteractive

# Conda setup
RUN apt-get update > /dev/null && \
apt-get install --no-install-recommends --yes \
wget bzip2 ca-certificates \
git \
tini \
> /dev/null && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
RUN apt-get update && apt-get install --no-install-recommends --yes wget bzip2 ca-certificates git > /dev/null && \
wget --no-hsts --quiet https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/${MINIFORGE_NAME}-${MINIFORGE_VERSION}-Linux-$(uname -m).sh -O /tmp/miniforge.sh && \
/bin/bash /tmp/miniforge.sh -b -p ${CONDA_DIR} && \
rm /tmp/miniforge.sh && \
@@ -47,7 +56,7 @@ RUN apt-get update > /dev/null && \
echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> /etc/skel/.bashrc && \
echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> ~/.bashrc

RUN apt-get update && apt-get install -y --no-install-recommends unzip vim curl build-essential gfortran libfftw3-dev && \
RUN apt-get install -y --no-install-recommends unzip vim curl gfortran && \
apt-get clean && rm -rf /var/lib/apt/lists/*

RUN groupadd -g "${CONDA_GID}" --system conda && \
@@ -61,20 +70,8 @@ SHELL ["/bin/bash", "-l", "-c"]
USER ${CONDA_UID}
WORKDIR /home/conda/

RUN curl -sL https://github.com/ASFHyP3/back-projection/archive/refs/tags/v${BACK_PROJECTION_TAG}.tar.gz > ./back-projection.tar.gz && \
mkdir -p ./back-projection && \
tar -xvf ./back-projection.tar.gz -C ./back-projection/ --strip=1 && \
rm ./back-projection.tar.gz && \
rm -rf ./back-projection/fft

COPY --chown=${CONDA_UID}:${CONDA_GID} ./scripts/build_proc.sh ./back-projection
RUN cd /home/conda/back-projection && \
chmod +x ./build_proc.sh && \
./build_proc.sh && \
find $PROC_HOME -type f -name "*.py" -exec chmod +x {} + && \
cd /home/conda/

COPY --chown=${CONDA_UID}:${CONDA_GID} . /hyp3-back-projection/
COPY --chown=${CONDA_UID}:${CONDA_GID} --from=builder /back-projection /back-projection
COPY --chown=${CONDA_UID}:${CONDA_GID} --from=builder /hyp3-back-projection /hyp3-back-projection

RUN mamba env create -f /hyp3-back-projection/environment.yml && \
conda clean -afy && \
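The `# FIXME` in the builder stage notes that the `git clone` of `main` is temporary. Once a released version of back-projection is used again, the clone could be replaced with a tagged tarball download, along the lines of the block this commit removes from the runner stage; the following is a sketch only, reusing the `BACK_PROJECTION_TAG` build argument that the runner stage already defines:

```dockerfile
# Sketch (not part of this commit): fetch a tagged back-projection release
# instead of cloning main; BACK_PROJECTION_TAG mirrors the runner-stage ARG.
ARG BACK_PROJECTION_TAG=0.2.0
RUN curl -sL https://github.com/ASFHyP3/back-projection/archive/refs/tags/v${BACK_PROJECTION_TAG}.tar.gz -o back-projection.tar.gz && \
    mkdir -p /back-projection && \
    tar -xf back-projection.tar.gz -C /back-projection --strip=1 && \
    rm back-projection.tar.gz
```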
35 changes: 12 additions & 23 deletions README.md
@@ -58,29 +58,18 @@ The process is different for different OSes and Linux distros. The setup process
can be found [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuration). Make sure to follow the [Docker configuration steps](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuration) after installing the package.
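For Docker specifically, the configuration step referenced above typically comes down to two commands (the same ones used by `scripts/amazon_linux_setup.sh` in this repository):
```bash
# Register the NVIDIA runtime with Docker and restart the daemon so that
# `docker run --gpus all ...` can see the GPUs.
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```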

### EC2 Setup
> [!CAUTION]
> The Docker container runs on an Amazon Linux 2023 Deep Learning AMI, but produces all-zero outputs. Work is ongoing to determine the cause of this issue. For now, we recommend using option 2.i.
When running on an EC2 instance, the following setup is recommended:
1. Create a [P3-family EC2 instance](https://aws.amazon.com/ec2/instance-types/p3/) with the [Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver](https://aws.amazon.com/marketplace/pp/prodview-64e4rx3h733ru?sr=0-4&ref_=beagle&applicationId=AWSMPContessa)
2. Install Docker and the nvidia-container-toolkit on the EC2 instance:
```bash
sudo yum-config-manager --disable amzn2-graphics
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
sudo yum install docker -y
sudo yum install nvidia-container-toolkit -y
sudo yum-config-manager --enable amzn2-graphics
```
3. Optionally, set up Docker to not require `sudo` and to start when the EC2 instance starts
```bash
sudo systemctl start docker && \
sudo usermod -a -G docker ec2-user && \
sudo systemctl enable docker
```
4. Exit the EC2 instance and re-enter
5. To test the GPU setup, run the base NVIDIA container:
```bash
docker run -it --gpus all nvidia/cuda:12.4.1-devel-ubuntu20.04 nvidia-smi
```
6. Build the actual container and run it:
1. Create a [G6-family EC2 instance](https://aws.amazon.com/ec2/instance-types/g6/) that has **at least 32 GB of memory**.
2. Launch your instance with one of the following setups (**option i is recommended**):
1. Use the latest [Amazon Linux 2023 AMI](https://docs.aws.amazon.com/linux/al2023/ug/ec2.html) with `scripts/amazon_linux_setup.sh` as the [user script on launch](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html).
2. Use the latest [Ubuntu AMI](https://cloud-images.ubuntu.com/locator/ec2/) with the `scripts/ubuntu_setup.sh` as the [user script on launch](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html).
3. Use the [Ubuntu Deep Learning Base OSS Nvidia Driver GPU AMI](https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-22-04/) (no install script required).
3. Build the GPU Docker container with the correct compute capability version. To determine this value, run `nvidia-smi` on the instance to obtain the GPU type, then cross-reference it with NVIDIA's [GPU compute capability list](https://developer.nvidia.com/cuda-gpus). For a g6.2xlarge instance, this would be:
```bash
docker build -t back-projection:gpu -f Dockerfile.gpu .
docker run --gpus=all --rm -it back-projection:gpu ++process back_projection --help
docker build --build-arg="GPU_ARCH=89" -t back-projection:gpu-89 -f Dockerfile.gpu .
```
The compute capability version is always the same for a given instance type, so you only need to look it up once per instance type (one way to script the lookup is sketched below).
The default value for this argument is `89`, the correct value for g6.2xlarge instances.
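On drivers new enough to support the `compute_cap` query field, `nvidia-smi` can report the compute capability directly, so the lookup and build can be scripted; a sketch, assuming a single-GPU instance:
```bash
# Query the compute capability of the first GPU (e.g. "8.9" on a g6.2xlarge),
# drop the dot, and pass the result to the GPU image build.
GPU_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n1 | tr -d '.')
docker build --build-arg="GPU_ARCH=${GPU_ARCH}" -t back-projection:gpu-${GPU_ARCH} -f Dockerfile.gpu .
```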
32 changes: 32 additions & 0 deletions scripts/amazon_linux_setup.sh
@@ -0,0 +1,32 @@
#!/bin/bash

# GPU setup for Amazon Linux 2023

# Install NVIDIA driver
DRIVER_VERSION=550.54.14
sudo dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r) kernel-modules-extra
curl -fsSL -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
chmod +x NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --tmpdir . --silent
rm ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run

# Install and enable Docker
sudo dnf install -y docker git
sudo systemctl start docker
sudo systemctl enable docker
sudo usermod -aG docker ec2-user

# Install nvidia-container-toolkit
sudo dnf config-manager --add-repo https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
sudo dnf install -y nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker

# Install extra packages
sudo dnf install -y git

# Cleanup
sudo dnf clean all && sudo rm -rf /var/cache/dnf/*

# Reboot
sudo reboot
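A hedged example of passing this script as EC2 user data from the AWS CLI; the AMI ID, key pair, and instance type below are placeholders, not values from this repository:
```bash
# Launch a g6.2xlarge with this script as user data (IDs are placeholders).
aws ec2 run-instances \
    --image-id ami-0123456789abcdef0 \
    --instance-type g6.2xlarge \
    --key-name my-key-pair \
    --count 1 \
    --user-data file://scripts/amazon_linux_setup.sh
```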
24 changes: 10 additions & 14 deletions scripts/build_proc.sh
100644 → 100755
@@ -1,10 +1,13 @@
#!/bin/bash

MULTIARCH_DIR=/usr/lib/$(gcc -print-multiarch)
FFTW_LIB=$MULTIARCH_DIR/libfftw3f.a
# Keeping these lines here in case we need to switch back to grabbing the FFTW location
# dynamically again
# MULTIARCH_DIR=/usr/lib/$(gcc -print-multiarch)
# FFTW_LIB=$MULTIARCH_DIR/libfftw3f.a
echo 'using FFTW library:' $FFTW_LIB
if [[ "$USEGPU" == "true" ]]; then
echo 'building with GPU support'
nvcc -o gpu_arch gpu_arch.cu
echo 'building with GPU support, capability version' $GPU_ARCH
fi

cd DEM
@@ -21,11 +24,6 @@ gfortran -c processsubcpu.f90 backprojectcpusub.f90 bounds.f90 orbitrangetime.f9
gcc -o sentinel_raw_process_cpu sentinel_raw_process_cpu.o decode_line_memory.o processsubcpu.o backprojectcpusub.o azimuth_compress_cpu.o bounds.o orbitrangetime.o latlon.o intp_orbit.o radar_to_xyz.o unitvec.o tcnbasis.o curvature.o cross.o orbithermite.o filelen.o io.o sentineltimingsub.o getburststatevectors.o $FFTW_LIB -lgfortran -lgomp -lm -lrt -lpthread
echo 'built sentinel_raw_process_cpu'

if [[ "$USEGPU" == "true" ]]; then
nvcc -o howmanygpus howmanygpus.cu
echo 'built howmanygpus'
fi

cd geo2rdr
gfortran -o estimatebaseline estimatebaseline.f90 intp_orbit.f90 latlon.f90 orbithermite.f -ffixed-line-length-none

@@ -72,7 +70,6 @@ gfortran -o psinterp psinterp.f90 -fopenmp
echo 'Built cosine_sim and psinterp in ps directory'

cd ..
tar xf snaphu_v2_0b0_0_0.tar
cd snaphu_v2.0b0.0.0/src
make CFLAGS=-O3 -s

@@ -89,14 +86,13 @@ gcc -c filelen.c io.c sentinel_raw_process.c decode_line_memory.c -lm -fopenmp

echo 'built raw_process components in sentinel'

if [[ "$USEGPU" == "true" ]]; then
nvcc -gencode arch=compute_89,code=sm_89 -c azimuth_compress.cu -Wno-deprecated-gpu-targets
fi

gfortran -c processsub.f90 backprojectgpusub.f90 bounds.f90 orbitrangetime.f90 latlon.f90 intp_orbit.f90 radar_to_xyz.f90 unitvec.f90 tcnbasis.f90 curvature.f90 cross.f90 orbithermite.f sentineltimingsub.f90 getburststatevectors.f90 -ffixed-line-length-none -fopenmp

if [[ "$USEGPU" == "true" ]]; then
nvcc -o sentinel_raw_process sentinel_raw_process.o decode_line_memory.o processsub.o backprojectgpusub.o azimuth_compress.o bounds.o orbitrangetime.o latlon.o intp_orbit.o radar_to_xyz.o unitvec.o tcnbasis.o curvature.o cross.o orbithermite.o filelen.o io.o sentineltimingsub.o getburststatevectors.o $FFTW_LIB -lstdc++ -lgfortran -lgomp
nvcc -o howmanygpus howmanygpus.cu
nvcc -gencode arch=compute_$GPU_ARCH,code=sm_$GPU_ARCH -c azimuth_compress.cu -Wno-deprecated-gpu-targets
nvcc -gencode arch=compute_$GPU_ARCH,code=sm_$GPU_ARCH -o sentinel_raw_process sentinel_raw_process.o decode_line_memory.o processsub.o backprojectgpusub.o azimuth_compress.o bounds.o orbitrangetime.o latlon.o intp_orbit.o radar_to_xyz.o unitvec.o tcnbasis.o curvature.o cross.o orbithermite.o filelen.o io.o sentineltimingsub.o getburststatevectors.o $FFTW_LIB -lstdc++ -lgfortran -lgomp
echo 'built gpu components in sentinel'
fi

cd ..
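For reference, the variables the script consumes (`FFTW_LIB`, `GPU_ARCH`, `USEGPU`) are set in `Dockerfile.gpu`; a minimal sketch of running the GPU build outside Docker, assuming `gfortran`, `nvcc`, and the static FFTW library are installed at the paths shown:
```bash
# Environment expected by build_proc.sh (values mirror Dockerfile.gpu).
export FFTW_LIB=/usr/lib/x86_64-linux-gnu/libfftw3f.a
export GPU_ARCH=89   # compute capability of the target GPU
export USEGPU=true   # anything other than "true" skips the nvcc steps
cd back-projection && ./build_proc.sh
```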
30 changes: 30 additions & 0 deletions scripts/ubuntu_setup.sh
@@ -0,0 +1,30 @@
#!/bin/bash

# GPU setup for Ubuntu 22.04

# NVIDIA source setup
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && \
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
sudo dpkg -i cuda-keyring_1.1-1_all.deb && \
rm cuda-keyring_1.1-1_all.deb

# Docker source setup
sudo apt install -y ca-certificates curl gnupg lsb-release && \
sudo mkdir -p /etc/apt/keyrings && \
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

# Installs
sudo apt-get update && \
sudo apt-get install -y nvidia-headless-535-server nvidia-utils-535-server nvidia-container-toolkit docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin awscli git && \
sudo usermod -aG docker ubuntu

# Cleanup temporary files
sudo apt-get clean
sudo rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Reboot
sudo reboot
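After the reboot, the driver and container toolkit can be sanity-checked before building the project image; a sketch using the same runtime base image as `Dockerfile.gpu`:
```bash
# Confirm the driver is loaded, then confirm containers can see the GPU.
nvidia-smi
docker run --rm --gpus all nvidia/cuda:12.4.1-runtime-ubuntu22.04 nvidia-smi
```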
2 changes: 1 addition & 1 deletion src/hyp3_back_projection/back_projection.py
@@ -140,8 +140,8 @@ def back_project(

utils.call_stanford_module('util/merge_slcs.py', work_dir=work_dir)

zip_path = create_product(work_dir)
if bucket:
zip_path = create_product(work_dir)
upload_file_to_s3(zip_path, bucket, bucket_prefix)

print(f'Finished back-projection for {list(work_dir.glob("S1*.geo"))[0].with_suffix("").name}!')
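For clarity, a sketch of the resulting flow in `back_project()` (names as in the diff): the product archive is now always created, and only the S3 upload remains conditional on `bucket`.
```python
# Updated control flow: create_product() runs unconditionally, while
# upload_file_to_s3() still runs only when a bucket is provided.
zip_path = create_product(work_dir)
if bucket:
    upload_file_to_s3(zip_path, bucket, bucket_prefix)
```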
