From cfa3f7cd259ec204d1ee653ab1fad7083be1a791 Mon Sep 17 00:00:00 2001
From: Xu Zhao <xzhao9@meta.com>
Date: Tue, 11 Jun 2024 19:32:51 -0400
Subject: [PATCH 1/7] Add FBGEMM submodule

---
 .gitmodules       | 3 +++
 submodules/FBGEMM | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 submodules/FBGEMM

diff --git a/.gitmodules b/.gitmodules
index 1a6ef12df7..40fdd49a0f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "submodules/lit-llama"]
 	path = submodules/lit-llama
 	url = https://github.com/Lightning-AI/lit-llama.git
+[submodule "submodules/FBGEMM"]
+	path = submodules/FBGEMM
+	url = https://github.com/pytorch/FBGEMM.git
diff --git a/submodules/FBGEMM b/submodules/FBGEMM
new file mode 160000
index 0000000000..182f3a3d09
--- /dev/null
+++ b/submodules/FBGEMM
@@ -0,0 +1 @@
+Subproject commit 182f3a3d09434da305186325ab7fe5e691ba45b1

From 89f5e4094a4becfd68beb19d703f2186baf5d2b6 Mon Sep 17 00:00:00 2001
From: Xu Zhao <i@xuzhao.net>
Date: Tue, 11 Jun 2024 23:27:56 -0400
Subject: [PATCH 2/7] Add fbgemm genai to the nightly docker

---
 docker/gcp-a100-runner-dind.dockerfile |  4 ++--
 docker/torchbench-nightly.dockerfile   |  5 +++++
 install.py                             |  6 ++++--
 userbenchmark/triton/install.py        | 22 ++++++++++++++++++++++
 4 files changed, 33 insertions(+), 4 deletions(-)
 create mode 100644 userbenchmark/triton/install.py

diff --git a/docker/gcp-a100-runner-dind.dockerfile b/docker/gcp-a100-runner-dind.dockerfile
index 2d67106417..98f012def7 100644
--- a/docker/gcp-a100-runner-dind.dockerfile
+++ b/docker/gcp-a100-runner-dind.dockerfile
@@ -28,9 +28,9 @@ RUN sudo mkdir -p /workspace; sudo chown runner:runner /workspace
 # We assume that the host NVIDIA driver binaries and libraries are mapped to the docker filesystem
 
 # Use the CUDA installation scripts from pytorch/builder
+# Install CUDA 12.4 only to reduce docker size
 RUN cd /workspace; git clone https://github.com/pytorch/builder.git
-RUN sudo bash -c 'source /workspace/builder/common/install_cuda.sh; install_118; prune_118'
-RUN sudo bash -c 'source /workspace/builder/common/install_cuda.sh; install_121; prune_121'
+RUN sudo bash -c 'source /workspace/builder/common/install_cuda.sh; install_124; prune_124'
 
 # Install miniconda
 RUN wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /workspace/Miniconda3-latest-Linux-x86_64.sh
diff --git a/docker/torchbench-nightly.dockerfile b/docker/torchbench-nightly.dockerfile
index c45f33a2e3..fe0e744484 100644
--- a/docker/torchbench-nightly.dockerfile
+++ b/docker/torchbench-nightly.dockerfile
@@ -48,3 +48,8 @@ RUN cd /workspace/benchmark && \
 RUN cd /workspace/benchmark && \
     . ${SETUP_SCRIPT} && \
     python install.py
+
+# Install FBGEMM GENAI
+RUN cd /workspace/benchmark && \
+    . ${SETUP_SCRIPT} && \
+    python install.py --userbenchmark triton --fbgemm
diff --git a/install.py b/install.py
index 7486c22209..ea825e6f7c 100644
--- a/install.py
+++ b/install.py
@@ -46,7 +46,7 @@ def pip_install_requirements(requirements_txt="requirements.txt"):
         choices=list_userbenchmarks(),
         help="Install requirements for optional components.",
     )
-    args = parser.parse_args()
+    args, extra_args = parser.parse_known_args()
 
     os.chdir(os.path.realpath(os.path.dirname(__file__)))
 
@@ -68,9 +68,11 @@ def pip_install_requirements(requirements_txt="requirements.txt"):
     if args.userbenchmark:
         # Install userbenchmark dependencies if exists
         userbenchmark_dir = REPO_ROOT.joinpath("userbenchmark", args.userbenchmark)
+        cmd = [sys.executable, "install.py"]
+        cmd.extend(extra_args)
         if userbenchmark_dir.joinpath("install.py").is_file():
             subprocess.check_call(
-                [sys.executable, "install.py"], cwd=userbenchmark_dir.absolute()
+                cmd, cwd=userbenchmark_dir.absolute()
             )
         sys.exit(0)
 
diff --git a/userbenchmark/triton/install.py b/userbenchmark/triton/install.py
new file mode 100644
index 0000000000..ca9a8542d4
--- /dev/null
+++ b/userbenchmark/triton/install.py
@@ -0,0 +1,22 @@
+import argparse
+import subprocess
+import sys
+
+from torchbenchmark import REPO_PATH
+FBGEMM_PATH = REPO_PATH.joinpath("submodules", "FBGEMM", "fbgemm_gpu")
+
+def install_fbgemm():
+    cmd = [sys.executable, "setup.py", "bdist_wheel", "--package_variant=genai"]
+    subprocess.check_call(cmd, cwd=FBGEMM_PATH)
+
+def test_fbgemm():
+    cmd = [sys.executable, "-c", '"import fbgemm_gpu.experimental.gen_ai"']
+    subprocess.check_call(cmd)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--fbgemm", action="store_true", help="Install FBGEMM GPU")
+    args = parser.parse_args()
+    if args.fbgemm:
+        install_fbgemm()
+        test_fbgemm()

From 1d2830a46edf3d08be92b52feaf543edb75d51b6 Mon Sep 17 00:00:00 2001
From: Xu Zhao <i@xuzhao.net>
Date: Tue, 11 Jun 2024 23:30:06 -0400
Subject: [PATCH 3/7] Update the default version in cuda utils

---
 utils/cuda_utils.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/utils/cuda_utils.py b/utils/cuda_utils.py
index 798f15dcde..80f677af1e 100644
--- a/utils/cuda_utils.py
+++ b/utils/cuda_utils.py
@@ -8,16 +8,12 @@
 from typing import Optional
 
 # defines the default CUDA version to compile against
-DEFAULT_CUDA_VERSION = "12.1"
+DEFAULT_CUDA_VERSION = "12.4"
 
 CUDA_VERSION_MAP = {
-    "11.8": {
-        "pytorch_url": "cu118",
-        "magma_version": "magma-cuda118",
-    },
-    "12.1": {
-        "pytorch_url": "cu121",
-        "magma_version": "magma-cuda121",
+    "12.4": {
+        "pytorch_url": "cu124",
+        "magma_version": "magma-cuda124",
     },
 }
 PIN_CMAKE_VERSION = "3.22.*"

From bacbff4981bde94907a01b1be201332bc1c2dc15 Mon Sep 17 00:00:00 2001
From: Xu Zhao <i@xuzhao.net>
Date: Wed, 12 Jun 2024 09:55:43 -0400
Subject: [PATCH 4/7] Install fbgemm

---
 docker/torchbench-nightly.dockerfile | 7 ++++---
 userbenchmark/triton/install.py      | 4 +++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/docker/torchbench-nightly.dockerfile b/docker/torchbench-nightly.dockerfile
index fe0e744484..c48436f47f 100644
--- a/docker/torchbench-nightly.dockerfile
+++ b/docker/torchbench-nightly.dockerfile
@@ -45,11 +45,12 @@ RUN cd /workspace/benchmark && \
     . ${SETUP_SCRIPT} && \
     python utils/cuda_utils.py --install-torchbench-deps
 
+# Install FBGEMM GENAI
 RUN cd /workspace/benchmark && \
     . ${SETUP_SCRIPT} && \
-    python install.py
+    python install.py --userbenchmark triton --fbgemm
 
-# Install FBGEMM GENAI
+# Install Torchbench models
 RUN cd /workspace/benchmark && \
     . ${SETUP_SCRIPT} && \
-    python install.py --userbenchmark triton --fbgemm
+    python install.py
diff --git a/userbenchmark/triton/install.py b/userbenchmark/triton/install.py
index ca9a8542d4..99051367e2 100644
--- a/userbenchmark/triton/install.py
+++ b/userbenchmark/triton/install.py
@@ -1,8 +1,10 @@
 import argparse
 import subprocess
 import sys
+import os
+from pathlib import Path
 
-from torchbenchmark import REPO_PATH
+REPO_PATH = Path(os.path.abspath(__file__)).parent.parent.parent
 FBGEMM_PATH = REPO_PATH.joinpath("submodules", "FBGEMM", "fbgemm_gpu")
 
 def install_fbgemm():

From aeaaa0d850471dddc600a6f73e1a333d4864263a Mon Sep 17 00:00:00 2001
From: Xu Zhao <i@xuzhao.net>
Date: Wed, 12 Jun 2024 10:15:42 -0400
Subject: [PATCH 5/7] Fix dockerfile issues

---
 docker/build-torchbench-nightly-docker.sh | 5 ++++-
 docker/torchbench-nightly.dockerfile      | 7 +++++--
 userbenchmark/triton/install.py           | 2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/docker/build-torchbench-nightly-docker.sh b/docker/build-torchbench-nightly-docker.sh
index 5194affc4c..8f53442dd9 100644
--- a/docker/build-torchbench-nightly-docker.sh
+++ b/docker/build-torchbench-nightly-docker.sh
@@ -1 +1,4 @@
-docker build . -f torchbench-nightly.dockerfile -t ghcr.io/pytorch/torchbench:latest
+TORCHBENCH_BRANCH=${TORCHBENCH_BRANCH:-main}
+
+docker build . -f torchbench-nightly.dockerfile -t ghcr.io/pytorch/torchbench:latest \
+    --build-arg TORCHBENCH_BRANCH=${TORCHBENCH_BRANCH}
diff --git a/docker/torchbench-nightly.dockerfile b/docker/torchbench-nightly.dockerfile
index c48436f47f..a35a3dda20 100644
--- a/docker/torchbench-nightly.dockerfile
+++ b/docker/torchbench-nightly.dockerfile
@@ -8,10 +8,13 @@ ENV SETUP_SCRIPT=/workspace/setup_instance.sh
 ARG TORCHBENCH_BRANCH=${TORCHBENCH_BRANCH:-main}
 ARG FORCE_DATE=${FORCE_DATE}
 
-# Setup Conda env and CUDA
+# Checkout Torchbench and submodules
 RUN git clone -b "${TORCHBENCH_BRANCH}" --single-branch \
- https://github.com/pytorch/benchmark /workspace/benchmark
+    https://github.com/pytorch/benchmark /workspace/benchmark
+RUN cd /workspace/benchmark \
+    git submodule update --init --recursive
 
+# Setup conda env and CUDA
 RUN cd /workspace/benchmark && \
     . ${SETUP_SCRIPT} && \
     python ./utils/python_utils.py --create-conda-env ${CONDA_ENV} && \
diff --git a/userbenchmark/triton/install.py b/userbenchmark/triton/install.py
index 99051367e2..360806c3d2 100644
--- a/userbenchmark/triton/install.py
+++ b/userbenchmark/triton/install.py
@@ -9,7 +9,7 @@
 
 def install_fbgemm():
     cmd = [sys.executable, "setup.py", "bdist_wheel", "--package_variant=genai"]
-    subprocess.check_call(cmd, cwd=FBGEMM_PATH)
+    subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()))
 
 def test_fbgemm():
     cmd = [sys.executable, "-c", '"import fbgemm_gpu.experimental.gen_ai"']

From 7e6c79df0efad75f3ae9c428202ad1b890f16e97 Mon Sep 17 00:00:00 2001
From: Xu Zhao <i@xuzhao.net>
Date: Wed, 12 Jun 2024 10:31:01 -0400
Subject: [PATCH 6/7] Clone submodules recursively and install FBGEMM requirements

---
 docker/torchbench-nightly.dockerfile | 4 +---
 userbenchmark/triton/install.py      | 2 ++
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docker/torchbench-nightly.dockerfile b/docker/torchbench-nightly.dockerfile
index a35a3dda20..f6f574af8e 100644
--- a/docker/torchbench-nightly.dockerfile
+++ b/docker/torchbench-nightly.dockerfile
@@ -9,10 +9,8 @@ ARG TORCHBENCH_BRANCH=${TORCHBENCH_BRANCH:-main}
 ARG FORCE_DATE=${FORCE_DATE}
 
 # Checkout Torchbench and submodules
-RUN git clone -b "${TORCHBENCH_BRANCH}" --single-branch \
+RUN git clone --recurse-submodules -b "${TORCHBENCH_BRANCH}" --single-branch \
     https://github.com/pytorch/benchmark /workspace/benchmark
-RUN cd /workspace/benchmark \
-    git submodule update --init --recursive
 
 # Setup conda env and CUDA
 RUN cd /workspace/benchmark && \
diff --git a/userbenchmark/triton/install.py b/userbenchmark/triton/install.py
index 360806c3d2..21c61dc802 100644
--- a/userbenchmark/triton/install.py
+++ b/userbenchmark/triton/install.py
@@ -8,6 +8,8 @@
 FBGEMM_PATH = REPO_PATH.joinpath("submodules", "FBGEMM", "fbgemm_gpu")
 
 def install_fbgemm():
+    cmd = ["pip", "install", "-r", "requirements.txt"]
+    subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()))
     cmd = [sys.executable, "setup.py", "bdist_wheel", "--package_variant=genai"]
     subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()))
 

From b156359533cddb038fadbc40a3cc5c6a823bd4fa Mon Sep 17 00:00:00 2001
From: Xu Zhao <i@xuzhao.net>
Date: Wed, 12 Jun 2024 11:01:34 -0400
Subject: [PATCH 7/7] Limit build target to A100/H100 and fix FBGEMM install check

Install fbgemm_gpu with `setup.py install` instead of only building a wheel,
and drop the embedded quotes from the `python -c` smoke test so the import
is really executed.
---
 userbenchmark/triton/install.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/userbenchmark/triton/install.py b/userbenchmark/triton/install.py
index 21c61dc802..d5faff068c 100644
--- a/userbenchmark/triton/install.py
+++ b/userbenchmark/triton/install.py
@@ -10,9 +10,12 @@
 def install_fbgemm():
     cmd = ["pip", "install", "-r", "requirements.txt"]
     subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()))
-    cmd = [sys.executable, "setup.py", "bdist_wheel", "--package_variant=genai"]
+    # Build target A100(8.0) or H100(9.0)
+    # Run `install`, not `bdist_wheel`: a wheel left in dist/ is never importable.
+    cmd = [sys.executable, "setup.py", "install", "--package_variant=genai", "-DTORCH_CUDA_ARCH_LIST=8.0;9.0"]
     subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()))
 
 def test_fbgemm():
-    cmd = [sys.executable, "-c", '"import fbgemm_gpu.experimental.gen_ai"']
+    # No shell is involved, so embedded quotes would turn the import into a no-op string literal.
+    cmd = [sys.executable, "-c", "import fbgemm_gpu.experimental.gen_ai"]
     subprocess.check_call(cmd)