Skip to content

Commit

Permalink
Merge branch 'main' into rocm_genai
Browse files Browse the repository at this point in the history
  • Loading branch information
jianyuh authored Sep 29, 2024
2 parents 9887a3e + 00f2fd5 commit fae63ec
Show file tree
Hide file tree
Showing 68 changed files with 2,582 additions and 617 deletions.
3 changes: 3 additions & 0 deletions .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,9 @@ __print_library_infos () {

echo "[CHECK] Listing out external shared libraries linked:"
print_exec ldd "${library}"

echo "[CHECK] Displaying ELF information:"
print_exec readelf -d "${library}"
echo "################################################################################"
echo ""
echo ""
Expand Down
23 changes: 23 additions & 0 deletions .github/scripts/fbgemm_gpu_postbuild.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Post-build step: strips the RPATH entry from every shared library (*.so)
# found under the current working directory.  The step is best-effort: a
# failure is reported on stderr but does not change the script's exit status.

echo "################################################################################"
echo "[CMAKE] Running post-build script ..."

# Print the working directory so the log shows where the search is rooted
pwd

# List all generated .SO files
find . -name '*.so'

# Remove errant RPATHs from the .SO
# https://github.com/pytorch/FBGEMM/issues/3098
# https://github.com/NixOS/patchelf/issues/453
#
# NOTE: `find -exec ... {} +` is used instead of piping into `xargs`, because
# GNU xargs runs the command once even when its input is empty, which would
# invoke patchelf with no file operand when no .SO files were generated.  With
# `-exec ... {} +`, patchelf is only invoked when at least one file matches,
# and find exits non-zero if any patchelf invocation fails.
if find . -name '*.so' -exec patchelf --remove-rpath {} +; then
  echo "[CMAKE] Removed errant RPATHs"
else
  # Only report the failure; do not claim success in the build log.
  echo "[CMAKE] WARNING: Failed to remove errant RPATHs from one or more .SO files" >&2
fi
echo "################################################################################"
1 change: 1 addition & 0 deletions .github/scripts/utils_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ install_build_tools () {
ncurses \
ninja \
openblas \
patchelf \
scikit-build \
wheel) || return 1

Expand Down
6 changes: 6 additions & 0 deletions .github/scripts/utils_cuda.bash
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@ install_cuda () {
nm -gDC "${libcuda_path}"
append_to_library_path "${env_name}" "$(dirname "$libcuda_path")"

# The `libcuda.so.1` symlink appears to be missing when we attempt to run
# FBGEMM_GPU on the `ubuntu-latest` runners on GitHub, so we add it manually.
if [ "$ADD_LIBCUDA_SYMLINK" == "1" ]; then
print_exec ln "${libcuda_path}" -s "$(dirname "$libcuda_path")/libcuda.so.1"
fi

echo "[INSTALL] Set environment variable NVML_LIB_PATH ..."
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
Expand Down
7 changes: 6 additions & 1 deletion .github/scripts/utils_pytorch.bash
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,12 @@ install_pytorch_pip () {
# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")

# Install the package from PyTorch PIP (not PyPI)
# Install the main dependencies
# shellcheck disable=SC2086
(exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
numpy) || return 1

# Install the torch package from PyTorch PIP (not PyPI)
install_from_pytorch_pip "${env_name}" torch "${pytorch_channel_version}" "${pytorch_variant_type_version}" || return 1

# Check that PyTorch is importable
Expand Down
53 changes: 52 additions & 1 deletion .github/scripts/utils_system.bash
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,57 @@ free_disk_space () {
echo "[CLEANUP] Freed up some disk space"
}

free_disk_space_on_host () {
  # Removes large packages and directories on the CI *host* machine, from
  # inside a container, by running a cleanup script through nsenter.
  #
  # Arguments: none
  # Outputs:   banner on stdout, followed by whatever the host-side script prints
  # Returns:   the exit status of the nsenter invocation
  local banner_rule="################################################################################"
  local host_script
  local -a namespace_flags

  echo "${banner_rule}"
  echo "# Free Disk Space On CI Host"
  echo "${banner_rule}"

  # NOTE: This is intended to be invoked from ** inside ** a container that is
  # hosted on a non-PyTorch-infra GitHub runner, where the host disk may be
  # nearly full from serving many CI jobs.  If the container has been set up
  # properly, nsenter lets us escape it and run commands on the host.
  #
  # This cleanup frees roughly 3GB of disk on average, which appears to be
  # enough to avoid the somewhat-frequent out-of-disk errors seen previously.
  #
  # The list of things to remove on the ubuntu-latest host is based on:
  #   https://github.com/orgs/community/discussions/25678
  #   https://github.com/apache/flink/blob/02d30ace69dc18555a5085eccf70ee884e73a16e/tools/azure-pipelines/free_disk_space.sh
  #
  # Background on escaping the docker container to run commands on the host:
  #   https://stackoverflow.com/questions/66160057/how-to-run-a-command-in-host-before-entering-docker-container-in-github-ci
  #   https://stackoverflow.com/questions/32163955/how-to-run-shell-script-on-host-from-docker-container/63140387#63140387

  # Script text handed to `bash -c` on the host.  The `\$` escapes keep the
  # dpkg-query format placeholders from being expanded by this shell.
  host_script="
echo 'Listing 100 largest packages';
dpkg-query -Wf '\${Installed-Size}\t\${Package}\n' | sort -n | tail -n 100;
df -h;
echo 'Removing large packages';
sudo apt-get remove -y '^ghc-8.*';
sudo apt-get remove -y '^dotnet-.*';
sudo apt-get remove -y '^llvm-.*';
sudo apt-get remove -y 'php.*';
sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel;
sudo apt-get autoremove -y;
sudo apt-get clean;
df -h;
echo 'Removing large directories';
rm -rf /usr/local/android;
rm -rf /usr/share/dotnet;
rm -rf /usr/local/share/boost;
rm -rf /opt/ghc;
rm -rf /usr/local/share/chrom*;
rm -rf /usr/share/swift;
rm -rf /usr/local/julia*;
rm -rf /usr/local/lib/android;
rm -rf /opt/hostedtoolcache;
df -h;
"

  # Target PID 1 and pass the -m, -u, -n and -i namespace selectors to nsenter
  namespace_flags=(-t 1 -m -u -n -i)

  nsenter "${namespace_flags[@]}" bash -c "${host_script}"
}


################################################################################
# Info Functions
Expand All @@ -91,7 +142,7 @@ print_gpu_info () {

(lspci -v | grep -e 'controller.*NVIDIA') || true

if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
if [[ "${ENFORCE_CUDA_DEVICE}" == '1' ]]; then
# Ensure that nvidia-smi is available and returns GPU entries
if ! nvidia-smi; then
echo "[CHECK] NVIDIA drivers and CUDA device are required for this workflow, but does not appear to be installed or available!"
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/build_wheels_linux_aarch64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ permissions:

jobs:
generate-matrix:
if: ${{ github.repository_owner == 'pytorch' }}
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
with:
package-type: wheel
Expand All @@ -32,6 +33,7 @@ jobs:
test-infra-ref: main
with-cuda: disable
build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: generate-matrix
strategy:
fail-fast: false
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/build_wheels_linux_x86.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ permissions:

jobs:
generate-matrix:
if: ${{ github.repository_owner == 'pytorch' }}
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
with:
package-type: wheel
Expand All @@ -34,6 +35,7 @@ jobs:
with-rocm: enable
with-cpu: enable
build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: generate-matrix
name: pytorch/FBGEMM
uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/fbgemm_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ concurrency:

jobs:
build-linux:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
Expand Down Expand Up @@ -105,7 +106,8 @@ jobs:
build-bazel:
runs-on: linux.12xlarge
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/fbgemm_gpu_ci_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ concurrency:
jobs:
# Build on CPU hosts, run tests, and upload to GHA
build_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
Expand Down Expand Up @@ -118,6 +119,7 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/fbgemm_gpu_ci_cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ concurrency:
jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
Expand Down Expand Up @@ -127,6 +128,7 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
# runs-on: linux.4xlarge.nvidia.gpu
# Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
runs-on: ${{ matrix.host-machine.instance }}
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/fbgemm_gpu_ci_genai.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ concurrency:
jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
Expand Down Expand Up @@ -127,7 +128,7 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
# runs-on: linux.4xlarge.nvidia.gpu
if: ${{ github.repository_owner == 'pytorch' }}
# Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
runs-on: ${{ matrix.host-machine.instance }}
defaults:
Expand Down
Loading

0 comments on commit fae63ec

Please sign in to comment.