diff --git a/.gitmodules b/.gitmodules
index 1a6ef12df7..40fdd49a0f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
 [submodule "submodules/lit-llama"]
 	path = submodules/lit-llama
 	url = https://github.com/Lightning-AI/lit-llama.git
+[submodule "submodules/FBGEMM"]
+	path = submodules/FBGEMM
+	url = https://github.com/pytorch/FBGEMM.git
diff --git a/docker/build-torchbench-nightly-docker.sh b/docker/build-torchbench-nightly-docker.sh
index 5194affc4c..8f53442dd9 100644
--- a/docker/build-torchbench-nightly-docker.sh
+++ b/docker/build-torchbench-nightly-docker.sh
@@ -1 +1,4 @@
-docker build . -f torchbench-nightly.dockerfile -t ghcr.io/pytorch/torchbench:latest
+TORCHBENCH_BRANCH=${TORCHBENCH_BRANCH:-main}
+
+docker build . -f torchbench-nightly.dockerfile -t ghcr.io/pytorch/torchbench:latest \
+    --build-arg TORCHBENCH_BRANCH=${TORCHBENCH_BRANCH}
diff --git a/docker/gcp-a100-runner-dind.dockerfile b/docker/gcp-a100-runner-dind.dockerfile
index 2d67106417..98f012def7 100644
--- a/docker/gcp-a100-runner-dind.dockerfile
+++ b/docker/gcp-a100-runner-dind.dockerfile
@@ -28,9 +28,9 @@ RUN sudo mkdir -p /workspace; sudo chown runner:runner /workspace
 
 # We assume that the host NVIDIA driver binaries and libraries are mapped to the docker filesystem
 # Use the CUDA installation scripts from pytorch/builder
+# Install CUDA 12.4 only to reduce docker size
 RUN cd /workspace; git clone https://github.com/pytorch/builder.git
-RUN sudo bash -c 'source /workspace/builder/common/install_cuda.sh; install_118; prune_118'
-RUN sudo bash -c 'source /workspace/builder/common/install_cuda.sh; install_121; prune_121'
+RUN sudo bash -c 'source /workspace/builder/common/install_cuda.sh; install_124; prune_124'
 
 # Install miniconda
 RUN wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /workspace/Miniconda3-latest-Linux-x86_64.sh
diff --git a/docker/torchbench-nightly.dockerfile b/docker/torchbench-nightly.dockerfile
index c45f33a2e3..f6f574af8e 100644
--- a/docker/torchbench-nightly.dockerfile
+++ b/docker/torchbench-nightly.dockerfile
@@ -8,10 +8,11 @@ ENV SETUP_SCRIPT=/workspace/setup_instance.sh
 ARG TORCHBENCH_BRANCH=${TORCHBENCH_BRANCH:-main}
 ARG FORCE_DATE=${FORCE_DATE}
 
-# Setup Conda env and CUDA
-RUN git clone -b "${TORCHBENCH_BRANCH}" --single-branch \
-    https://github.com/pytorch/benchmark /workspace/benchmark
+# Checkout Torchbench and submodules
+RUN git clone --recurse-submodules -b "${TORCHBENCH_BRANCH}" --single-branch \
+    https://github.com/pytorch/benchmark /workspace/benchmark
 
+# Setup conda env and CUDA
 RUN cd /workspace/benchmark && \
     . ${SETUP_SCRIPT} && \
     python ./utils/python_utils.py --create-conda-env ${CONDA_ENV} && \
@@ -45,6 +46,12 @@ RUN cd /workspace/benchmark && \
     . ${SETUP_SCRIPT} && \
     python utils/cuda_utils.py --install-torchbench-deps
 
+# Install FBGEMM GENAI
+RUN cd /workspace/benchmark && \
+    . ${SETUP_SCRIPT} && \
+    python install.py --userbenchmark triton --fbgemm
+
+# Install Torchbench models
 RUN cd /workspace/benchmark && \
     . ${SETUP_SCRIPT} && \
     python install.py
diff --git a/install.py b/install.py
index 7486c22209..ea825e6f7c 100644
--- a/install.py
+++ b/install.py
@@ -46,7 +46,7 @@ def pip_install_requirements(requirements_txt="requirements.txt"):
         choices=list_userbenchmarks(),
         help="Install requirements for optional components.",
     )
-    args = parser.parse_args()
+    args, extra_args = parser.parse_known_args()
 
     os.chdir(os.path.realpath(os.path.dirname(__file__)))
 
@@ -68,9 +68,11 @@ def pip_install_requirements(requirements_txt="requirements.txt"):
     if args.userbenchmark:
         # Install userbenchmark dependencies if exists
        userbenchmark_dir = REPO_ROOT.joinpath("userbenchmark", args.userbenchmark)
+        cmd = [sys.executable, "install.py"]
+        cmd.extend(extra_args)
         if userbenchmark_dir.joinpath("install.py").is_file():
             subprocess.check_call(
-                [sys.executable, "install.py"], cwd=userbenchmark_dir.absolute()
+                cmd, cwd=userbenchmark_dir.absolute()
             )
         sys.exit(0)
 
diff --git a/submodules/FBGEMM b/submodules/FBGEMM
new file mode 160000
index 0000000000..182f3a3d09
--- /dev/null
+++ b/submodules/FBGEMM
@@ -0,0 +1 @@
+Subproject commit 182f3a3d09434da305186325ab7fe5e691ba45b1
diff --git a/userbenchmark/triton/install.py b/userbenchmark/triton/install.py
new file mode 100644
index 0000000000..d5faff068c
--- /dev/null
+++ b/userbenchmark/triton/install.py
@@ -0,0 +1,27 @@
+import argparse
+import subprocess
+import sys
+import os
+from pathlib import Path
+
+REPO_PATH = Path(os.path.abspath(__file__)).parent.parent.parent
+FBGEMM_PATH = REPO_PATH.joinpath("submodules", "FBGEMM", "fbgemm_gpu")
+
+def install_fbgemm():
+    cmd = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
+    subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()))
+    # Build target A100 (8.0) or H100 (9.0)
+    cmd = [sys.executable, "setup.py", "bdist_wheel", "--package_variant=genai", "-DTORCH_CUDA_ARCH_LIST=8.0;9.0"]
+    subprocess.check_call(cmd, cwd=str(FBGEMM_PATH.resolve()))
+
+def test_fbgemm():
+    cmd = [sys.executable, "-c", "import fbgemm_gpu.experimental.gen_ai"]
+    subprocess.check_call(cmd)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--fbgemm", action="store_true", help="Install FBGEMM GPU")
+    args = parser.parse_args()
+    if args.fbgemm:
+        install_fbgemm()
+        test_fbgemm()
diff --git a/utils/cuda_utils.py b/utils/cuda_utils.py
index 798f15dcde..80f677af1e 100644
--- a/utils/cuda_utils.py
+++ b/utils/cuda_utils.py
@@ -8,16 +8,12 @@ from typing import Optional
 
 
 # defines the default CUDA version to compile against
-DEFAULT_CUDA_VERSION = "12.1"
+DEFAULT_CUDA_VERSION = "12.4"
 
 CUDA_VERSION_MAP = {
-    "11.8": {
-        "pytorch_url": "cu118",
-        "magma_version": "magma-cuda118",
-    },
-    "12.1": {
-        "pytorch_url": "cu121",
-        "magma_version": "magma-cuda121",
+    "12.4": {
+        "pytorch_url": "cu124",
+        "magma_version": "magma-cuda124",
     },
 }
 PIN_CMAKE_VERSION = "3.22.*"
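Note (not part of the diff): the install.py change works because argparse's parse_known_args() returns the recognized namespace plus a list of leftover tokens instead of erroring on unknown flags, so a flag like --fbgemm passes through the top-level dispatcher untouched and reaches userbenchmark/triton/install.py. A minimal sketch of that forwarding pattern, with the subprocess call stubbed out:

    import argparse
    import subprocess
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--userbenchmark")
    # parse_known_args() returns (namespace, leftover_tokens) rather than
    # rejecting flags this parser does not define.
    args, extra_args = parser.parse_known_args(["--userbenchmark", "triton", "--fbgemm"])
    assert args.userbenchmark == "triton"
    assert extra_args == ["--fbgemm"]

    # The leftover flags are appended to the per-benchmark install command,
    # mirroring the install.py hunk above:
    cmd = [sys.executable, "install.py"] + extra_args
    print(cmd)  # [<python path>, 'install.py', '--fbgemm']
    # subprocess.check_call(cmd, cwd="userbenchmark/triton")  # as in the diff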