# examples/ml-slurm-v5-legacy.yaml

# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
# Name of this blueprint.
blueprint_name: ml-slurm

# Deployment-wide values; referenced elsewhere in this file as $(vars.<name>).
vars:
  # REQUIRED: set the project id after the colon on the next line.
  project_id:
  deployment_name: ml-example
  region: asia-southeast1
  zone: asia-southeast1-b
  zones: [asia-southeast1-a, asia-southeast1-b, asia-southeast1-c]
  # Disk size (GB) passed to the packer image build as `disk_size`.
  disk_size_gb: 200
  # Custom image: the packer group publishes into this family, and the node
  # groups, controller and login node use it as their instance_image.
  new_image:
    family: ml-slurm
    project: $(vars.project_id)

# Recommended to use GCS backend for Terraform state
# See https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#optional-setting-up-a-remote-terraform-state
#
# terraform_backend_defaults:
#  type: gcs
#  configuration:
#    bucket: <<BUCKET_NAME>>

# Deployment groups appear in this order: primary, packer, cluster.
deployment_groups:
# Shared infrastructure (network, firewall rules, /home file system) plus the
# install script that the packer group consumes.
- group: primary
  modules:
  # References a VPC that already exists. No settings are given, so the
  # module's own defaults select the network — TODO confirm which one.
  - id: network
    source: modules/network/pre-existing-vpc

  # this example anticipates that the VPC default network has internal traffic
  # allowed and IAP tunneling for SSH connections
  - id: firewall_rule
    source: modules/network/firewall-rules
    use: [network]
    settings:
      ingress_rules:
      # Rule 1: every TCP and UDP port, plus ICMP, between addresses that
      # both lie inside the subnetwork range.
      - name: $(vars.deployment_name)-allow-internal-traffic
        description: Allow internal traffic
        source_ranges:
        - $(network.subnetwork_address)
        destination_ranges:
        - $(network.subnetwork_address)
        allow:
        - protocol: tcp
          ports: ["0-65535"]
        - protocol: udp
          ports: ["0-65535"]
        - protocol: icmp
      # Rule 2: TCP port 22 only, from 35.235.240.0/20 into the subnetwork.
      - name: $(vars.deployment_name)-allow-iap-ssh
        description: Allow IAP-tunneled SSH connections
        source_ranges:
        - 35.235.240.0/20
        destination_ranges:
        - $(network.subnetwork_address)
        allow:
        - protocol: tcp
          ports: [22]

  # Filestore share mounted at /home. Both partitions and the Slurm
  # controller list this module under `use`.
  - id: homefs
    source: modules/file-system/filestore
    use: [network]
    settings:
      filestore_tier: BASIC_SSD
      size_gb: 2560
      local_mount: /home

  # Startup script that installs the ML software stack. The packer group's
  # custom-image module lists it under `use`.
  # The single shell runner below, in order:
  #   1. adds the google-fast-socket apt repository and installs the package;
  #   2. exits successfully if /opt/conda already exists;
  #   3. installs Miniconda (py310, 23.3.1) into /opt/conda and configures the
  #      system-wide conda channels (conda-forge, nvidia,
  #      nvidia/label/cuda-11.8.0) with strict channel priority;
  #   4. creates the conda env `tf` (cuda-toolkit, cuDNN and NCCL pip wheels,
  #      tensorflow 2.12.*, tensorrt 8.6.*) with activate/deactivate hooks that
  #      extend and then restore LD_LIBRARY_PATH;
  #   5. creates the conda env `pytorch` (pytorch, torchvision, torchaudio,
  #      pytorch-cuda=11.8).
  - id: script
    source: modules/scripts/startup-script
    settings:
      runners:
      - type: shell
        destination: install-ml-libraries.sh
        # NOTE(review): `\$(` in the script body looks like an escape so that
        # the shell command substitution is not read as a blueprint $(...)
        # reference — confirm against the toolkit's blueprint documentation.
        content: |
          #!/bin/bash
          # this script is designed to execute on Slurm images published by SchedMD that:
          # - are based on Debian 11 distribution of Linux
          # - have NVIDIA Drivers v530 pre-installed
          # - have CUDA Toolkit 12.1 pre-installed.

          set -e -o pipefail

          echo "deb https://packages.cloud.google.com/apt google-fast-socket main" > /etc/apt/sources.list.d/google-fast-socket.list
          apt-get update --allow-releaseinfo-change
          apt-get install --assume-yes google-fast-socket

          CONDA_BASE=/opt/conda

          if [ -d $CONDA_BASE ]; then
                  exit 0
          fi

          DL_DIR=\$(mktemp -d)
          cd $DL_DIR
          curl -O https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-x86_64.sh
          HOME=$DL_DIR bash Miniconda3-py310_23.3.1-0-Linux-x86_64.sh -b -p $CONDA_BASE
          cd -
          rm -rf $DL_DIR
          unset DL_DIR

          source $CONDA_BASE/bin/activate base
          conda init --system
          conda config --system --set auto_activate_base False
          # following channel ordering is important! use strict_priority!
          conda config --system --set channel_priority strict
          conda config --system --remove channels defaults
          conda config --system --add channels conda-forge
          conda config --system --add channels nvidia
          conda config --system --add channels nvidia/label/cuda-11.8.0

          conda update -n base conda --yes

          ### create a virtual environment for tensorflow
          conda create -n tf python=3.10 --yes
          conda activate tf
          conda install -n tf cuda-toolkit --yes
          pip install nvidia-cudnn-cu11 nvidia-nccl-cu11

          cd $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/nccl/lib/
          ln -s libnccl.so.2 libnccl.so
          cd -

          mkdir -p $CONDA_PREFIX/etc/conda/activate.d
          echo 'export OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH' > $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
          echo 'NVIDIA_PYTHON_PATH=$CONDA_PREFIX/lib/python3.10/site-packages/nvidia' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
          echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/:$NVIDIA_PYTHON_PATH/cudnn/lib/:$NVIDIA_PYTHON_PATH/nccl/lib/' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
          mkdir -p $CONDA_PREFIX/etc/conda/deactivate.d
          echo 'export LD_LIBRARY_PATH=${OLD_LD_LIBRARY_PATH}' > $CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh
          echo 'unset OLD_LD_LIBRARY_PATH' >> $CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh

          pip install tensorflow==2.12.*
          pip install tensorrt==8.6.*

          ### create a virtual environment for pytorch
          conda create -n pytorch python=3.10 --yes
          conda activate pytorch
          conda config --env --add channels pytorch
          conda install -n pytorch pytorch torchvision torchaudio pytorch-cuda=11.8 --yes

# Image build: runs the install script from the primary group on top of a
# published Slurm image and saves the result as a new image family.
- group: packer
  modules:
  - id: custom-image
    source: modules/packer/custom-image
    kind: packer
    use: [network, script]
    settings:
      # Base image. The latest published families are listed at
      # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family
      source_image_project_id: [schedmd-slurm-public]
      source_image_family: slurm-gcp-5-11-debian-11
      # Family the built image is published under
      image_family: $(vars.new_image.family)
      # To look up the size of the source image, run:
      # gcloud compute images describe-from-family <source_image_family> --project schedmd-slurm-public
      disk_size: $(vars.disk_size_gb)
      # A GPU-enabled VM is not needed to build this image
      machine_type: c2-standard-4
      # Keep a public IP on the build VM so the startup script can reach the
      # public internet without creating a new VPC
      omit_external_ip: false
      state_timeout: 15m

# Slurm cluster: node groups, partitions, controller and login node, all
# using the image referenced by vars.new_image.
- group: cluster
  modules:
  # Two `data` runners that place a PyTorch smoke test under /var/tmp.
  # Only slurm_login lists this module under `use`.
  - id: examples
    source: modules/scripts/startup-script
    settings:
      runners:
      # Wrapper script: loads conda, activates the `pytorch` env, then runs
      # torch_test.py by relative path, i.e. from the current directory.
      - type: data
        destination: /var/tmp/torch_test.sh
        content: |
          #!/bin/bash
          source /etc/profile.d/conda.sh
          conda activate pytorch
          python3 torch_test.py
      # Benchmark: times two batched-dot implementations (mul + sum vs. bmm)
      # with timeit(100) on a 10000x64 random tensor. `device` is only
      # printed; `x` is created without a device argument.
      - type: data
        destination: /var/tmp/torch_test.py
        content: |
          import torch
          import torch.utils.benchmark as benchmark

          def batched_dot_mul_sum(a, b):
              '''Computes batched dot by multiplying and summing'''
              return a.mul(b).sum(-1)

          def batched_dot_bmm(a, b):
              '''Computes batched dot by reducing to bmm'''
              a = a.reshape(-1, 1, a.shape[-1])
              b = b.reshape(-1, b.shape[-1], 1)
              return torch.bmm(a, b).flatten(-3)

          # use GPU if available, else CPU
          device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          print('Using device:', device)
          if device.type == 'cuda':
              print(torch.cuda.get_device_name(0))

          # benchmarking
          x = torch.randn(10000, 64)
          t0 = benchmark.Timer(
              stmt='batched_dot_mul_sum(x, x)',
              setup='from __main__ import batched_dot_mul_sum',
              globals={'x': x})
          t1 = benchmark.Timer(
              stmt='batched_dot_bmm(x, x)',
              setup='from __main__ import batched_dot_bmm',
              globals={'x': x})
          print(t0.timeit(100))
          print(t1.timeit(100))

  # Nodes behind the `a2` partition: a2-highgpu-1g machines, dynamic maximum
  # of 20, using the custom image.
  - id: a2_node_group
    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
    settings:
      machine_type: a2-highgpu-1g
      node_count_dynamic_max: 20
      bandwidth_tier: gvnic_enabled
      instance_image_custom: true
      instance_image: $(vars.new_image)

  # Partition `a2`, flagged as the default; combines the a2 node group with
  # the shared /home file system and the network.
  - id: a2_partition
    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
    use: [a2_node_group, homefs, network]
    settings:
      is_default: true
      partition_name: a2

  # Nodes behind the `g2` partition: g2-standard-4 machines, dynamic maximum
  # of 20, using the custom image.
  - id: g2_node_group
    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
    settings:
      machine_type: g2-standard-4
      node_count_dynamic_max: 20
      bandwidth_tier: gvnic_enabled
      instance_image_custom: true
      instance_image: $(vars.new_image)

  # Partition `g2`; placement and exclusive mode are both turned off here.
  - id: g2_partition
    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
    use: [g2_node_group, homefs, network]
    settings:
      exclusive: false
      enable_placement: false
      partition_name: g2

  # Slurm controller: serves both partitions, mounts homefs, uses the custom
  # image, and does not have public IPs disabled.
  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
    use: [network, a2_partition, g2_partition, homefs]
    settings:
      instance_image_custom: true
      instance_image: $(vars.new_image)
      disable_controller_public_ips: false

  # Login node: receives the example files from `examples`, attaches to the
  # controller, uses the custom image, and does not have public IPs disabled.
  - id: slurm_login
    source: community/modules/scheduler/schedmd-slurm-gcp-v5-login
    use: [examples, network, slurm_controller]
    settings:
      instance_image_custom: true
      instance_image: $(vars.new_image)
      disable_login_public_ips: false