Merge pull request #26 from ASFHyP3/g6_gpu
Update GPU dockerfile to fully support GPU workflow
forrestfwilliams authored May 21, 2024
2 parents df2f1a5 + acd9d06 commit c87f10a
Showing 7 changed files with 125 additions and 68 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,19 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [PEP 440](https://www.python.org/dev/peps/pep-0440/)
and uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.5.0]

### Added
* `scripts/ubuntu_setup.sh` for setting up a GPU-based Ubuntu EC2 AMI.
* `scripts/amazon_linux_setup.sh` for setting up a GPU-based Amazon Linux 2023 EC2 AMI.

### Changed
* Refactored `scripts/build_proc.sh` to combine GPU compilation steps.
* Final product zip archive is now always created.

### Fixed
* `Dockerfile.gpu` so that outputs will contain actual data.

## [0.4.0]

### Added
57 changes: 27 additions & 30 deletions Dockerfile.gpu
@@ -1,4 +1,24 @@
FROM nvidia/cuda:12.4.1-devel-ubuntu20.04
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 as builder

# FIXME: should be able to find this dynamically
ARG GPU_ARCH=89

# GPU_ARCH and USEGPU environment variables are used by build_proc.sh
ENV FFTW_LIB=/usr/lib/x86_64-linux-gnu/libfftw3f.a
ENV GPU_ARCH=${GPU_ARCH}
ENV USEGPU=true
ENV DEBIAN_FRONTEND=noninteractive

# FIXME: can remove git after switching back to a released version of back-projection
RUN apt-get update && apt-get install -y --no-install-recommends unzip vim curl git build-essential gfortran libfftw3-dev && \
apt-get clean && rm -rf /var/lib/apt/lists/*

RUN git clone -b main https://github.com/ASFHyP3/back-projection.git
COPY . /hyp3-back-projection/
COPY ./scripts/build_proc.sh ./back-projection
RUN cd /back-projection && ./build_proc.sh && cd /

FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 as runner

# For opencontainers label definitions, see:
# https://github.com/opencontainers/image-spec/blob/master/annotations.md
@@ -11,32 +31,21 @@ LABEL org.opencontainers.image.url="https://github.com/ASFHyP3/hyp3-back-project
LABEL org.opencontainers.image.source="https://github.com/ASFHyP3/hyp3-back-projection"
LABEL org.opencontainers.image.documentation="https://hyp3-docs.asf.alaska.edu"

ARG DEBIAN_FRONTEND=noninteractive
ARG CONDA_UID=1000
ARG CONDA_GID=1000
ARG BACK_PROJECTION_TAG=0.2.0
ARG FFTW_TAG=3.3.9
ARG MINIFORGE_NAME=Miniforge3
ARG MINIFORGE_VERSION=24.3.0-0

# USEGPU environment variable used by build_proc.sh
ENV USEGPU="true"
ENV CONDA_DIR=/opt/conda
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
ENV PATH=${CONDA_DIR}/bin:${PATH}
ENV PYTHONDONTWRITEBYTECODE=true
ENV PROC_HOME=/home/conda/back-projection
ENV PROC_HOME=/back-projection
ENV MYHOME=/home/conda
ENV DEBIAN_FRONTEND=noninteractive

# Conda setup
RUN apt-get update > /dev/null && \
apt-get install --no-install-recommends --yes \
wget bzip2 ca-certificates \
git \
tini \
> /dev/null && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
RUN apt-get update && apt-get install --no-install-recommends --yes wget bzip2 ca-certificates git > /dev/null && \
wget --no-hsts --quiet https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/${MINIFORGE_NAME}-${MINIFORGE_VERSION}-Linux-$(uname -m).sh -O /tmp/miniforge.sh && \
/bin/bash /tmp/miniforge.sh -b -p ${CONDA_DIR} && \
rm /tmp/miniforge.sh && \
@@ -47,7 +56,7 @@ RUN apt-get update > /dev/null && \
echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> /etc/skel/.bashrc && \
echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> ~/.bashrc

RUN apt-get update && apt-get install -y --no-install-recommends unzip vim curl build-essential gfortran libfftw3-dev && \
RUN apt-get install -y --no-install-recommends unzip vim curl gfortran && \
apt-get clean && rm -rf /var/lib/apt/lists/*

RUN groupadd -g "${CONDA_GID}" --system conda && \
@@ -61,20 +70,8 @@ SHELL ["/bin/bash", "-l", "-c"]
USER ${CONDA_UID}
WORKDIR /home/conda/

RUN curl -sL https://github.com/ASFHyP3/back-projection/archive/refs/tags/v${BACK_PROJECTION_TAG}.tar.gz > ./back-projection.tar.gz && \
mkdir -p ./back-projection && \
tar -xvf ./back-projection.tar.gz -C ./back-projection/ --strip=1 && \
rm ./back-projection.tar.gz && \
rm -rf ./back-projection/fft

COPY --chown=${CONDA_UID}:${CONDA_GID} ./scripts/build_proc.sh ./back-projection
RUN cd /home/conda/back-projection && \
chmod +x ./build_proc.sh && \
./build_proc.sh && \
find $PROC_HOME -type f -name "*.py" -exec chmod +x {} + && \
cd /home/conda/

COPY --chown=${CONDA_UID}:${CONDA_GID} . /hyp3-back-projection/
COPY --chown=${CONDA_UID}:${CONDA_GID} --from=builder /back-projection /back-projection
COPY --chown=${CONDA_UID}:${CONDA_GID} --from=builder /hyp3-back-projection /hyp3-back-projection

RUN mamba env create -f /hyp3-back-projection/environment.yml && \
conda clean -afy && \
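The `# FIXME` in the builder stage notes that the `git clone` of `main` is temporary. Once a released version of back-projection is used again, the clone could be replaced with a tagged tarball download, along the lines of the block this commit removes from the runner stage; the following is a sketch only, reusing the `BACK_PROJECTION_TAG` build argument that the runner stage already defines:

```dockerfile
# Sketch (not part of this commit): fetch a tagged back-projection release
# instead of cloning main; BACK_PROJECTION_TAG mirrors the runner-stage ARG.
ARG BACK_PROJECTION_TAG=0.2.0
RUN curl -sL https://github.com/ASFHyP3/back-projection/archive/refs/tags/v${BACK_PROJECTION_TAG}.tar.gz -o back-projection.tar.gz && \
    mkdir -p /back-projection && \
    tar -xf back-projection.tar.gz -C /back-projection --strip=1 && \
    rm back-projection.tar.gz
```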
35 changes: 12 additions & 23 deletions README.md
@@ -58,29 +58,18 @@ The process is different for different OSes and Linux distros. The setup process
can be found [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuration). Make sure to follow the [Docker configuration steps](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuration) after installing the package.
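For Docker specifically, the configuration step referenced above typically comes down to two commands (the same ones used by `scripts/amazon_linux_setup.sh` in this repository):
```bash
# Register the NVIDIA runtime with Docker and restart the daemon so that
# `docker run --gpus all ...` can see the GPUs.
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```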

### EC2 Setup
> [!CAUTION]
> The Docker container runs on an Amazon Linux 2023 Deep Learning AMI, but produces all-zero outputs. Work is ongoing to determine the cause of this issue. For now, we recommend using option 2.i.
When running on an EC2 instance, the following setup is recommended:
1. Create a [P3-family EC2 instance](https://aws.amazon.com/ec2/instance-types/p3/) with the [Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver](https://aws.amazon.com/marketplace/pp/prodview-64e4rx3h733ru?sr=0-4&ref_=beagle&applicationId=AWSMPContessa)
2. Install Docker and the nvidia-container-toolkit on the EC2 instance:
```bash
sudo yum-config-manager --disable amzn2-graphics
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
sudo yum install docker -y
sudo yum install nvidia-container-toolkit -y
sudo yum-config-manager --enable amzn2-graphics
```
3. Optionally, set up Docker to not require `sudo` and to start when the EC2 instance starts
```bash
sudo systemctl start docker && \
sudo usermod -a -G docker ec2-user && \
sudo systemctl enable docker
```
4. Exit the EC2 instance and re-enter
5. To test the GPU setup, run the base NVIDIA container:
```bash
docker run -it --gpus all nvidia/cuda:12.4.1-devel-ubuntu20.04 nvidia-smi
```
6. Build the actual container and run it:
1. Create a [G6-family EC2 instance](https://aws.amazon.com/ec2/instance-types/g6/) that has **at least 32 GB of memory**.
2. Launch your instance with one of the following setups (**option i is recommended**):
1. Use the latest [Amazon Linux 2023 AMI](https://docs.aws.amazon.com/linux/al2023/ug/ec2.html) with `scripts/amazon_linux_setup.sh` as the [user script on launch](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html).
2. Use the latest [Ubuntu AMI](https://cloud-images.ubuntu.com/locator/ec2/) with the `scripts/ubuntu_setup.sh` as the [user script on launch](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html).
3. Use the [Ubuntu Deep Learning Base OSS Nvidia Driver GPU AMI](https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-22-04/) (no install script required).
3. Build the GPU Docker container with the correct compute capability version. To determine this value, run `nvidia-smi` on the instance to obtain the GPU type, then cross-reference it with NVIDIA's [GPU compute capability list](https://developer.nvidia.com/cuda-gpus). For a g6.2xlarge instance, this would be:
```bash
docker build -t back-projection:gpu -f Dockerfile.gpu .
docker run --gpus=all --rm -it back-projection:gpu ++process back_projection --help
docker build --build-arg="GPU_ARCH=89" -t back-projection:gpu-89 -f Dockerfile.gpu .
```
The compute capability version is always the same for a given instance type, so you only need to look it up once per instance type (one way to script the lookup is sketched below).
The default value for this argument is `89`, the correct value for g6.2xlarge instances.
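On drivers new enough to support the `compute_cap` query field, `nvidia-smi` can report the compute capability directly, so the lookup and build can be scripted; a sketch, assuming a single-GPU instance:
```bash
# Query the compute capability of the first GPU (e.g. "8.9" on a g6.2xlarge),
# drop the dot, and pass the result to the GPU image build.
GPU_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n1 | tr -d '.')
docker build --build-arg="GPU_ARCH=${GPU_ARCH}" -t back-projection:gpu-${GPU_ARCH} -f Dockerfile.gpu .
```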
32 changes: 32 additions & 0 deletions scripts/amazon_linux_setup.sh
@@ -0,0 +1,32 @@
#!/bin/bash

# GPU setup for Amazon Linux 2023

# Install NVIDIA driver
DRIVER_VERSION=550.54.14
sudo dnf install -y kernel-devel-$(uname -r) kernel-headers-$(uname -r) kernel-modules-extra
curl -fsSL -O https://us.download.nvidia.com/tesla/$DRIVER_VERSION/NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
chmod +x NVIDIA-Linux-x86_64-$DRIVER_VERSION.run
sudo ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run --tmpdir . --silent
rm ./NVIDIA-Linux-x86_64-$DRIVER_VERSION.run

# Install and enable Docker
sudo dnf install -y docker git
sudo systemctl start docker
sudo systemctl enable docker
sudo usermod -aG docker ec2-user

# Install nvidia-container-toolkit
sudo dnf config-manager --add-repo https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
sudo dnf install -y nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker

# Install extra packages
sudo dnf install -y git

# Cleanup
sudo dnf clean all && sudo rm -rf /var/cache/dnf/*

# Reboot
sudo reboot
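A hedged example of passing this script as EC2 user data from the AWS CLI; the AMI ID, key pair, and instance type below are placeholders, not values from this repository:
```bash
# Launch a g6.2xlarge with this script as user data (IDs are placeholders).
aws ec2 run-instances \
    --image-id ami-0123456789abcdef0 \
    --instance-type g6.2xlarge \
    --key-name my-key-pair \
    --count 1 \
    --user-data file://scripts/amazon_linux_setup.sh
```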
24 changes: 10 additions & 14 deletions scripts/build_proc.sh
100644 → 100755
@@ -1,10 +1,13 @@
#!/bin/bash

MULTIARCH_DIR=/usr/lib/$(gcc -print-multiarch)
FFTW_LIB=$MULTIARCH_DIR/libfftw3f.a
# Keeping these lines here in case we need to switch back to grabbing the FFTW location
# dynamically again
# MULTIARCH_DIR=/usr/lib/$(gcc -print-multiarch)
# FFTW_LIB=$MULTIARCH_DIR/libfftw3f.a
echo 'using FFTW library:' $FFTW_LIB
if [[ "$USEGPU" == "true" ]]; then
echo 'building with GPU support'
nvcc -o gpu_arch gpu_arch.cu
echo 'building with GPU support, capability version' $GPU_ARCH
fi

cd DEM
@@ -21,11 +24,6 @@ gfortran -c processsubcpu.f90 backprojectcpusub.f90 bounds.f90 orbitrangetime.f9
gcc -o sentinel_raw_process_cpu sentinel_raw_process_cpu.o decode_line_memory.o processsubcpu.o backprojectcpusub.o azimuth_compress_cpu.o bounds.o orbitrangetime.o latlon.o intp_orbit.o radar_to_xyz.o unitvec.o tcnbasis.o curvature.o cross.o orbithermite.o filelen.o io.o sentineltimingsub.o getburststatevectors.o $FFTW_LIB -lgfortran -lgomp -lm -lrt -lpthread
echo 'built sentinel_raw_process_cpu'

if [[ "$USEGPU" == "true" ]]; then
nvcc -o howmanygpus howmanygpus.cu
echo 'built howmanygpus'
fi

cd geo2rdr
gfortran -o estimatebaseline estimatebaseline.f90 intp_orbit.f90 latlon.f90 orbithermite.f -ffixed-line-length-none

@@ -72,7 +70,6 @@ gfortran -o psinterp psinterp.f90 -fopenmp
echo 'Built cosine_sim and psinterp in ps directory'

cd ..
tar xf snaphu_v2_0b0_0_0.tar
cd snaphu_v2.0b0.0.0/src
make CFLAGS=-O3 -s

@@ -89,14 +86,13 @@ gcc -c filelen.c io.c sentinel_raw_process.c decode_line_memory.c -lm -fopenmp

echo 'built raw_process components in sentinel'

if [[ "$USEGPU" == "true" ]]; then
nvcc -gencode arch=compute_89,code=sm_89 -c azimuth_compress.cu -Wno-deprecated-gpu-targets
fi

gfortran -c processsub.f90 backprojectgpusub.f90 bounds.f90 orbitrangetime.f90 latlon.f90 intp_orbit.f90 radar_to_xyz.f90 unitvec.f90 tcnbasis.f90 curvature.f90 cross.f90 orbithermite.f sentineltimingsub.f90 getburststatevectors.f90 -ffixed-line-length-none -fopenmp

if [[ "$USEGPU" == "true" ]]; then
nvcc -o sentinel_raw_process sentinel_raw_process.o decode_line_memory.o processsub.o backprojectgpusub.o azimuth_compress.o bounds.o orbitrangetime.o latlon.o intp_orbit.o radar_to_xyz.o unitvec.o tcnbasis.o curvature.o cross.o orbithermite.o filelen.o io.o sentineltimingsub.o getburststatevectors.o $FFTW_LIB -lstdc++ -lgfortran -lgomp
nvcc -o howmanygpus howmanygpus.cu
nvcc -gencode arch=compute_$GPU_ARCH,code=sm_$GPU_ARCH -c azimuth_compress.cu -Wno-deprecated-gpu-targets
nvcc -gencode arch=compute_$GPU_ARCH,code=sm_$GPU_ARCH -o sentinel_raw_process sentinel_raw_process.o decode_line_memory.o processsub.o backprojectgpusub.o azimuth_compress.o bounds.o orbitrangetime.o latlon.o intp_orbit.o radar_to_xyz.o unitvec.o tcnbasis.o curvature.o cross.o orbithermite.o filelen.o io.o sentineltimingsub.o getburststatevectors.o $FFTW_LIB -lstdc++ -lgfortran -lgomp
echo 'built gpu components in sentinel'
fi

cd ..
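For reference, the variables the script consumes (`FFTW_LIB`, `GPU_ARCH`, `USEGPU`) are set in `Dockerfile.gpu`; a minimal sketch of running the GPU build outside Docker, assuming `gfortran`, `nvcc`, and the static FFTW library are installed at the paths shown:
```bash
# Environment expected by build_proc.sh (values mirror Dockerfile.gpu).
export FFTW_LIB=/usr/lib/x86_64-linux-gnu/libfftw3f.a
export GPU_ARCH=89   # compute capability of the target GPU
export USEGPU=true   # anything other than "true" skips the nvcc steps
cd back-projection && ./build_proc.sh
```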
30 changes: 30 additions & 0 deletions scripts/ubuntu_setup.sh
@@ -0,0 +1,30 @@
#!/bin/bash

# GPU setup for Ubuntu 22.04

# NVIDIA source setup
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && \
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
sudo dpkg -i cuda-keyring_1.1-1_all.deb && \
rm cuda-keyring_1.1-1_all.deb

# Docker source setup
sudo apt install -y ca-certificates curl gnupg lsb-release && \
sudo mkdir -p /etc/apt/keyrings && \
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

# Installs
sudo apt-get update && \
sudo apt-get install -y nvidia-headless-535-server nvidia-utils-535-server nvidia-container-toolkit docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin awscli git && \
sudo usermod -aG docker ubuntu

# Cleanup temporary files
sudo apt-get clean
sudo rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Reboot
sudo reboot
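After the reboot, the driver and container toolkit can be sanity-checked before building the project image; a sketch using the same runtime base image as `Dockerfile.gpu`:
```bash
# Confirm the driver is loaded, then confirm containers can see the GPU.
nvidia-smi
docker run --rm --gpus all nvidia/cuda:12.4.1-runtime-ubuntu22.04 nvidia-smi
```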
2 changes: 1 addition & 1 deletion src/hyp3_back_projection/back_projection.py
@@ -140,8 +140,8 @@ def back_project(

utils.call_stanford_module('util/merge_slcs.py', work_dir=work_dir)

zip_path = create_product(work_dir)
if bucket:
zip_path = create_product(work_dir)
upload_file_to_s3(zip_path, bucket, bucket_prefix)

print(f'Finished back-projection for {list(work_dir.glob("S1*.geo"))[0].with_suffix("").name}!')
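For clarity, a sketch of the resulting flow in `back_project()` (names as in the diff): the product archive is now always created, and only the S3 upload remains conditional on `bucket`.
```python
# Updated control flow: create_product() runs unconditionally, while
# upload_file_to_s3() still runs only when a bucket is provided.
zip_path = create_product(work_dir)
if bucket:
    upload_file_to_s3(zip_path, bucket, bucket_prefix)
```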
