diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000000..ec50a345f8
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,38 @@
+name: Build Docker Images
+
+on:
+  workflow_dispatch:
+    inputs:
+      onmt_version:
+        description: "OpenNMT version"
+        required: true
+        type: string
+  # to facilitate initial tests in PR
+  push:
+    branches:
+      - "docker"
+
+run-name: ${{ github.workflow }} -- ${{ inputs.onmt_version || 'test' }}
+
+env:
+  ONMT_VERSION: ${{ inputs.onmt_version || 'test' }}
+
+jobs:
+  build:
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        cuda_version: [11.8.0, 12.1.0]
+    permissions: write-all
+    steps:
+      - name: Checkout opennmt repo
+        uses: actions/checkout@v4
+      - name: Login to ghcr
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Build
+        run: |
+          docker/build.sh ${{ env.ONMT_VERSION }} ${{ matrix.cuda_version }}
diff --git a/README.md b/README.md
index ec41122105..818e063d89 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,30 @@ If you used previous versions of OpenNMT-py, you can check the [Changelog](https
 
 ## Setup
 
+### Using Docker
+
+To facilitate setup and reproducibility, Docker images are made available via the GitHub Container Registry:
+https://github.com/OpenNMT/OpenNMT-py/pkgs/container/opennmt-py
+
+You can adapt the workflow and build your own image(s) for your specific needs using `build.sh` and `Dockerfile` in the `docker` directory of the repo.
+
+```
+docker pull ghcr.io/opennmt/opennmt-py:3.4.3-ubuntu22.04-cuda12.1
+```
+
+Example one-liner to run a container and open a bash shell within it:
+```
+docker run --rm -it --runtime=nvidia ghcr.io/opennmt/opennmt-py:test-ubuntu22.04-cuda12.1
+```
+Note: you need the [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) (formerly nvidia-docker) installed to take advantage of the CUDA/GPU features.
+
+Depending on your needs, you can add various flags:
+- `-p 5000:5000` to forward an exposed port from the container to your host;
+- `-v /some/local/directory:/some/container/directory` to mount a local directory inside the container;
+- `--entrypoint some_command` to run a specific command as the container entry point (instead of the default bash shell).
+
+### Installing locally
+
 OpenNMT-py requires:
 
 - Python >= 3.8
@@ -78,7 +102,7 @@ Note: if you encounter a `MemoryError` during installation, try to use `pip` wit
 
 pip install -r requirements.opt.txt
 ```
 
-## Manual installation of some dependencies
+### Manual installation of some dependencies
 
 Apex is highly recommended to have fast performance (especially the legacy fusedadam optimizer and FusedRMSNorm)
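For illustration, the run flags documented in the README section above compose into a single command. A minimal sketch, assuming one of the published tags; the port, host path, and config file name are placeholders to adapt:

```bash
# Hypothetical combined invocation: forward port 5000, mount a local data
# directory, and run onmt_train directly instead of opening a bash shell.
docker run --rm -it --runtime=nvidia \
  -p 5000:5000 \
  -v "$HOME/data:/data" \
  --entrypoint onmt_train \
  ghcr.io/opennmt/opennmt-py:3.4.3-ubuntu22.04-cuda12.1 \
  -config /data/my_config.yaml
```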
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000000..e469409c58
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,52 @@
+ARG CUDA_VERSION=11.8.0
+FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu22.04
+
+RUN apt-get update && apt-get install -y locales gcc g++ python3-dev
+RUN apt-get update && apt-get install -y \
+    git \
+    python3-pip \
+    python3-dev \
+    libprotobuf-dev \
+    libprotobuf-c-dev
+
+RUN pip3 install --upgrade pip
+RUN pip3 install packaging
+
+# Install torch
+RUN CU=$(echo "${CUDA_VERSION%.*}" | sed 's/\.//g'); pip3 install torch --index-url "https://download.pytorch.org/whl/cu$CU"
+
+# Install apex
+RUN mkdir /setup
+WORKDIR /setup
+RUN git clone https://github.com/nvidia/apex
+WORKDIR /setup/apex
+RUN pip3 install ninja
+ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.6"
+RUN pip3 install -v --no-build-isolation \
+    --config-settings --global-option="--cpp_ext" \
+    --config-settings --global-option="--cuda_ext" \
+    --config-settings --global-option="--deprecated_fused_adam" \
+    --config-settings --global-option="--xentropy" \
+    --config-settings --global-option="--fast_multihead_attn" \
+    ./
+
+# Install flash-attention
+RUN pip install flash-attn --no-build-isolation
+
+# Install llm-awq
+RUN git clone https://github.com/mit-han-lab/llm-awq && \
+    cd llm-awq && \
+    pip install -e . && \
+    cd ..
+
+# Install AutoAWQ
+RUN pip install autoawq
+
+COPY . /opennmt-py
+WORKDIR /opennmt-py
+RUN pip install -r requirements.opt.txt
+RUN pip install -e .
+
+WORKDIR /
+
+ENTRYPOINT /bin/bash
\ No newline at end of file
diff --git a/docker/build.sh b/docker/build.sh
new file mode 100755
index 0000000000..29d1e2bb7a
--- /dev/null
+++ b/docker/build.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# Build and push version X of OpenNMT-py with CUDA Y:
+# ./build.sh X Y
+
+set -e
+
+# allow user to run this script from anywhere
+# from https://stackoverflow.com/a/246128
+# one-liner which will give you the full directory name
+# of the script no matter where it is being called from
+unset CDPATH
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+ROOT_DIR=$DIR/..
+cd "$ROOT_DIR"
+
+ONMT_VERSION="$1"
+CUDA_VERSION="$2"
+[ -z "$CUDA_VERSION" ] && CUDA_VERSION="11.8.0"
+
+IMAGE="ghcr.io/opennmt/opennmt-py"
+TAG="$ONMT_VERSION-ubuntu22.04-cuda${CUDA_VERSION%.*}"
+
+echo "Building $IMAGE:$TAG with CUDA_VERSION=$CUDA_VERSION"
+
+docker build -t $IMAGE:$TAG --progress=plain -f docker/Dockerfile --build-arg CUDA_VERSION=$CUDA_VERSION .
+docker push $IMAGE:$TAG
\ No newline at end of file
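For illustration, a local invocation of this script might look as follows (a sketch: the version numbers are examples, and the push step assumes you are logged in to ghcr.io with write access):

```bash
# Build the 3.4.3 image against CUDA 12.1.0 and push it.
# ${CUDA_VERSION%.*} drops the patch component, so the resulting tag is
# ghcr.io/opennmt/opennmt-py:3.4.3-ubuntu22.04-cuda12.1
./docker/build.sh 3.4.3 12.1.0
```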
diff --git a/onmt/opts.py b/onmt/opts.py
index 861e96172e..b6b34c1c7c 100644
--- a/onmt/opts.py
+++ b/onmt/opts.py
@@ -493,6 +493,13 @@ def distributed_opts(parser):
         type=int,
         help="Port of master for torch.distributed training.",
     )
+    group.add(
+        "--timeout",
+        "-timeout",
+        default=60,
+        type=int,
+        help="Timeout in seconds for one GPU to wait for the others.",
+    )
 
 
 def model_opts(parser):
diff --git a/onmt/trainer.py b/onmt/trainer.py
index a570039707..d580b19fd1 100644
--- a/onmt/trainer.py
+++ b/onmt/trainer.py
@@ -328,9 +328,10 @@ def train(
             )
 
             if valid_iter is not None and step % valid_steps == 0:
-                valid_stats = self.validate(
-                    valid_iter, moving_average=self.moving_average
-                )
+                if self.parallel_mode == "tensor_parallel" or self.gpu_rank <= 0:
+                    valid_stats = self.validate(
+                        valid_iter, moving_average=self.moving_average
+                    )
 
             if step % valid_steps == 0 and self.gpu_rank <= 0:
                 self._report_step(
diff --git a/onmt/utils/distributed.py b/onmt/utils/distributed.py
index c7e6051a7c..e6779c397f 100644
--- a/onmt/utils/distributed.py
+++ b/onmt/utils/distributed.py
@@ -29,7 +29,7 @@ def multi_init(opt, device_id):
         init_method=dist_init_method,
         world_size=dist_world_size,
         rank=opt.gpu_ranks[device_id],
-        timeout=timedelta(seconds=60),
+        timeout=timedelta(seconds=opt.timeout),
     )
     gpu_rank = torch.distributed.get_rank()
     if not is_master(opt, device_id):
diff --git a/requirements.opt.txt b/requirements.opt.txt
index f2d13037ca..cdd42a0cbd 100644
--- a/requirements.opt.txt
+++ b/requirements.opt.txt
@@ -6,3 +6,4 @@ scipy
 bitsandbytes>=0.41.2
 safetensors
 spacy
+gradio
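For illustration, since `-timeout` is now forwarded to `torch.distributed.init_process_group`, the rendezvous window can be widened from the command line when some ranks are slow to start. A hypothetical two-GPU run; the config file name and values are placeholders:

```bash
# Allow each GPU up to 600 s (instead of the previously hard-coded 60 s)
# to join the process group before torch.distributed gives up.
onmt_train -config my_config.yaml \
    -world_size 2 -gpu_ranks 0 1 \
    -timeout 600
```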