Merge branch 'NVIDIA:main' into jgerh-patch-1-readme-updates

NVIDIA · Dec 13, 2024 · e286b60 · e286b60
2 parents 34d2530 + 28a3328
commit e286b60
Show file tree

Hide file tree

Showing 438 changed files with 32,664 additions and 4,067 deletions.
diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml
@@ -60,7 +60,16 @@ jobs:
               ARG=("--runtime=nvidia --gpus all")
             fi
 
-            docker run --rm -d --name nemo_container_${{ github.run_id }} ${ARG[@]} --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
+            docker run \
+              --rm \
+              -d \
+              --name nemo_container_${{ github.run_id }} ${ARG[@]} \
+              --shm-size=64g \
+              --env TRANSFORMERS_OFFLINE=0 \
+              --env HYDRA_FULL_ERROR=1 \
+              --env HF_HOME=/home/TestData/HF_HOME \
+              --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} \
+              bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
 
         - id: main
           name: Run main script
@@ -95,4 +104,4 @@ jobs:
           if: always()
           run: |
             docker container stop nemo_container_${{ github.run_id }} || true
-            docker container rm nemo_container_${{ github.run_id }} || true
+            docker container rm nemo_container_${{ github.run_id }} || true
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml
@@ -9,18 +9,24 @@ on:
         options: 
         - major
         - minor
+      freeze-commit:
+        type: string
+        description: Commit SHA to use for cut-off
+        required: false
+        default: main
       mcore_version:
         description: 'Version of MCore to use (must be a valid git ref)'
         required: true
         type: string
 
 jobs:
   code-freeze:
-    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_code_freeze.yml@v0.8.0
+    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_code_freeze.yml@v0.17.3
     with:
-      library_name: NeMo-Toolkit
-      python_package: nemo
-      type_of_release: ${{ inputs.type_of_release }}
+      library-name: NeMo-Toolkit
+      python-package: nemo
+      release-type: ${{ inputs.type_of_release }}
+      freeze-commit: ${{ inputs.freeze-commit }}
     secrets:
       SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
       SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
@@ -36,7 +42,6 @@ jobs:
           fetch-depth: 0
           fetch-tags: true
           ref: ${{ needs.code-freeze.outputs.release-branch }}
-          token: ${{ secrets.PAT }}
 
       - name: Pin branch name in Notebooks
         run: |

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -20,10 +20,15 @@ on:
         description: Ref (SHA or branch name) to release
         required: true
         type: string
+      dry-run:
+        description: Do not publish a wheel and GitHub release.
+        required: true
+        default: true
+        type: boolean
 
 jobs: 
   release:
-    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.12.3
+    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.15.1
     with:
       release-ref: ${{ inputs.release-ref }}
       image-name: nemo_container
@@ -35,8 +40,10 @@ jobs:
       python-package: nemo
       container-workdir: /workspace
       library-name: Neural Modules
+      dry-run: ${{ inputs.dry-run }}
     secrets:
       TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
       TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
       SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
       PAT: ${{ secrets.PAT }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
diff --git a/.github/workflows/secrets-detector.yml b/.github/workflows/secrets-detector.yml
@@ -25,11 +25,6 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v4
         with:
-          # setup repository and ref for PRs, see
-          # https://github.com/EndBug/add-and-commit?tab=readme-ov-file#working-with-prs
-          repository: ${{ github.event.pull_request.head.repo.full_name }}
-          ref: ${{ github.event.pull_request.head.ref }}
-          # custom token is required to trigger actions after reformatting + pushing
           fetch-depth: 0
           token: ${{ secrets.NEMO_REFORMAT_TOKEN }}
 
@@ -38,7 +33,7 @@ jobs:
 
       - name: Run on change-set
         run: |
-          git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --baseline .secrets.baseline 
+          git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --disable-plugin HexHighEntropyString --baseline .secrets.baseline
       
       - uses: EndBug/add-and-commit@v9
         # Commit changes. Nothing is committed if no changes.

diff --git a/.secrets.baseline b/.secrets.baseline
@@ -1933,7 +1933,7 @@
         "filename": "tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb",
         "hashed_secret": "80903ddedcf4ec0a2ee5911cefa7e1ad52419dcc",
         "is_verified": false,
-        "line_number": 989
+        "line_number": 990
       }
     ],
     "tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb": [

diff --git a/Dockerfile.ci b/Dockerfile.ci
@@ -54,15 +54,15 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
 # Install NeMo requirements
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG MODELOPT_VERSION=0.19.0
-ARG MCORE_TAG=aded519cfb1de2abf96f36ca059f992294b7876f
+ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa
 
 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
 RUN \
---mount=type=bind,source=requirements,target=requirements \
---mount=type=bind,source=tools,target=tools \
---mount=type=bind,source=setup.py,target=setup.py \
---mount=type=bind,source=nemo/package_info.py,target=nemo/package_info.py \
---mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
+  --mount=type=bind,source=requirements,target=requirements \
+  --mount=type=bind,source=tools,target=tools \
+  --mount=type=bind,source=setup.py,target=setup.py \
+  --mount=type=bind,source=nemo/package_info.py,target=nemo/package_info.py \
+  --mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
 pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \
 "transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \
 "megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \
@@ -84,6 +84,9 @@ git checkout ${MCORE_TAG} && \
 popd
 export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
 
+# Install nvidia-resiliency-ext
+pip install --no-cache-dir "git+https://github.com/NVIDIA/nvidia-resiliency-ext.git@97aad77609d2e25ed38ac5c99f0c13f93c48464e"
+
 EOF
 
 # Copy over NeMo code

diff --git a/README.md b/README.md
@@ -118,7 +118,8 @@ Overall, these enhancements make NeMo 2.0 a powerful, scalable, and user-friendl
 
 All NeMo models are trained with
 [Lightning](https://github.com/Lightning-AI/lightning). Training is
-automatically scalable to 1000s of GPUs.
+automatically scalable to 1000s of GPUs. Performance benchmarks for the latest NeMo Framework
+container are available in the [performance summary](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance_summary.html).
 
 When applicable, NeMo models leverage cutting-edge distributed training
 techniques, incorporating [parallelism