NVIDIA · yhtang · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/.github/workflows/_runner_ondemand_slurm.yaml b/.github/workflows/_runner_ondemand_slurm.yaml
@@ -20,7 +20,7 @@ on:
 jobs:
 
   launch-slurm-runner:
-    runs-on: ubuntu-latest
+    runs-on: jumpbox  # NOTE(review): presumably a self-hosted runner label — confirm
     steps:
       - name: Print environment variables
         run: env
@@ -58,7 +58,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           SLURM_JOB_ID_FILE=$(mktemp)
-          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_ID_FILE} \
+          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} >${SLURM_JOB_ID_FILE} \
             sbatch --parsable \
           <<"EOF"
           #!/bin/bash
@@ -117,5 +117,5 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
diff --git a/.github/workflows/_test_maxtext.yaml b/.github/workflows/_test_maxtext.yaml
@@ -37,14 +37,13 @@ jobs:
 
   single-process-multi-device:
     strategy:
+      max-parallel: 1  # at most one matrix job runs at a time
       matrix:
         PARALLEL_CONFIG:
         - [1, 1, 2, 4]
         # - [1, 1, 1, 8] # PP, DP, FSDP, TP
       fail-fast: false
-
-    runs-on: ubuntu-22.04
-
+    runs-on: jumpbox  # NOTE(review): presumably a self-hosted runner label — confirm
     steps:
       - name: Print environment variables
         run: env
@@ -88,7 +87,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -149,17 +148,17 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           mkdir output/
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -184,6 +183,7 @@ jobs:
 
   maxtext-multinode:
     strategy:
+      max-parallel: 1  # at most one matrix job runs at a time
       matrix:
         PARALLEL_CONFIG:
         - [1, 1, 1, 1]
@@ -193,9 +193,7 @@ jobs:
         - [1, 2, 2, 2]
         - [1, 4, 2, 2]
       fail-fast: false
-
-    runs-on: ubuntu-22.04
-
+    runs-on: jumpbox  # NOTE(review): presumably a self-hosted runner label — confirm
     steps:
       - name: Print environment variables
         run: env
@@ -240,7 +238,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -304,17 +302,17 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           mkdir output/
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \

diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_pax_rosetta.yaml
@@ -32,17 +32,15 @@ jobs:
 
   single-process-multi-device-te:
     strategy:
+      max-parallel: 1  # at most one matrix job runs at a time
       matrix:
         PARALLEL_CONFIG:
         - [1, 8, 1, 1]
         - [1, 1, 2, 4]
       fail-fast: false
-
-    runs-on: ubuntu-22.04
-
+    runs-on: jumpbox  # NOTE(review): presumably a self-hosted runner label — confirm
     env:
       BADGE_FILENAME_PREFIX: badge-rosetta-pax-single-process-multi-device-te
-
     steps:
       - name: Print environment variables
         run: env
@@ -87,7 +85,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
-          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -146,18 +144,18 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -223,6 +221,7 @@ jobs:
 
   rosetta-pax-multi-node-te:
     strategy:
+      max-parallel: 1  # at most one matrix job runs at a time
       matrix:
         include:
           - TEST_NAME: 1DP1FSDP1TP1PP_TE
@@ -259,8 +258,7 @@ jobs:
             EVALUATE: true
             ADDITIONAL_ARGS: "--model-type LLaMA70BProxy --evaluate"
       fail-fast: false
-
-    runs-on: ubuntu-22.04
+    runs-on: jumpbox  # NOTE(review): presumably a self-hosted runner label — confirm
     env:
       BADGE_FILENAME_PREFIX: badge-rosetta-pax-multi-node-te
     steps:
@@ -308,7 +306,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -372,18 +370,18 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -450,15 +448,15 @@ jobs:
 
   rosetta-pax-multi-node:
     strategy:
+      max-parallel: 1  # at most one matrix job runs at a time
       matrix:
         PARALLEL_CONFIG:
         - [1, 8, 1, 1]
         - [1, 4, 1, 2]
         - [4, 2, 1, 1]
         - [4, 2, 1, 2]
       fail-fast: false
-
-    runs-on: ubuntu-22.04
+    runs-on: jumpbox  # NOTE(review): presumably a self-hosted runner label — confirm
     env:
       BADGE_FILENAME_PREFIX: badge-rosetta-pax-multi-node
     steps:
@@ -506,7 +504,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
-          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -567,18 +565,18 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -645,12 +643,12 @@ jobs:
 
   rosetta-pax-single-node-dropout-te:
     strategy:
+      max-parallel: 1  # at most one matrix job runs at a time
       matrix:
         PARALLEL_CONFIG:
         - [1, 8, 1, 1]
       fail-fast: false
-
-    runs-on: ubuntu-22.04
+    runs-on: jumpbox  # NOTE(review): presumably a self-hosted runner label — confirm
     env:
       BADGE_FILENAME_PREFIX: badge-rosetta-pax-single-node-dropout-te
     steps:
@@ -698,7 +696,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -762,18 +760,18 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \
@@ -839,12 +837,12 @@ jobs:
 
   single-process-evaluation-te:
     strategy:
+      max-parallel: 1  # at most one matrix job runs at a time
       matrix:
         PARALLEL_CONFIG:
         - [1, 8, 1, 1]
       fail-fast: false
-
-    runs-on: ubuntu-22.04
+    runs-on: jumpbox  # NOTE(review): presumably a self-hosted runner label — confirm
     env:
       BADGE_FILENAME_PREFIX: badge-rosetta-pax-single-process-evaluation-te
     steps:
@@ -890,7 +888,7 @@ jobs:
         shell: bash -O expand_aliases -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
-          alias sshx='ssh -p 3000 -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
+          alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}'
           sshx "date && hostname && sinfo"
           sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }}
           JOB=$(sshx sbatch --parsable << EOF
@@ -952,18 +950,18 @@ jobs:
         if: cancelled()
         shell: bash -x -e {0}
         run: |
-          ssh -p 3000 ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
+          ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Retrieve training logs and upload to TensorBoard server
         shell: bash -x -e {0}
         run: |
           cd $GITHUB_WORKSPACE
           mkdir output/
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \
             output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true
-          rsync -rtz --progress -e 'ssh -p 3000' \
+          rsync -rtz --progress \
             ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \
             output/ || true
           rsync -rtz --progress \