Enable CUDA 11.8 and Hopper support (#4308)
- adds a CUDA 11.8-based build
- extends the image decoder to support H100 (Hopper)

Signed-off-by: Janusz Lisiecki <[email protected]>
JanuszL authored and stiepan committed Oct 5, 2022
1 parent c707838 commit 0ae7f87
Showing 9 changed files with 69 additions and 18 deletions.
4 changes: 2 additions & 2 deletions cmake/CUDA_utils.cmake
@@ -46,12 +46,12 @@ elseif (${ARCH} MATCHES "aarch64")
# from the whole list: "70" "75" "80" "86"
# we pick only major arch as minor should be compatible without JITing, it should
# shrink the output binary
-set(CUDA_known_archs "70" "80")
+set(CUDA_known_archs "70" "80" "90")
else()
# from the whole list: "35" "50" "52" "60" "61" "70" "75" "80" "86"
# we pick only major arch as minor should be compatible without JITing, it should
# shrink the output binary
-set(CUDA_known_archs "35" "50" "60" "70" "80")
+set(CUDA_known_archs "35" "50" "60" "70" "80" "90")
endif()

set(CUDA_TARGET_ARCHS ${CUDA_known_archs} CACHE STRING "List of target CUDA architectures")
6 changes: 3 additions & 3 deletions dali/operators/decoder/image_decoder.cc
@@ -95,7 +95,7 @@ the largest allocation value that is printed in the statistics.)code",
.AddOptionalArg("hw_decoder_load",
R"code(The percentage of the image data to be processed by the HW JPEG decoder.
-Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU architecture.
+Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU and newer architecture.
Determines the percentage of the workload that will be offloaded to the hardware decoder,
if available. The optimal workload depends on the number of threads that are provided to
@@ -105,14 +105,14 @@ the DALI pipeline and should be found empirically. More details can be found at
.AddOptionalArg("preallocate_width_hint",
R"code(Image width hint.
-Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU architecture.
+Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU and newer architecture.
The hint is used to preallocate memory for the HW JPEG decoder.)code",
0)
.AddOptionalArg("preallocate_height_hint",
R"code(Image width hint.
-Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU architecture.
+Applies **only** to the ``mixed`` backend type in NVIDIA Ampere GPU and newer architecture.
The hint is used to preallocate memory for the HW JPEG decoder.)code",
0)
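For orientation, here is a minimal sketch of how these arguments surface in DALI's Python API; the dataset path, batch size, and tuning values below are illustrative assumptions, not defaults.

```python
# Sketch only: file_root and the hint values are placeholders.
from nvidia.dali import pipeline_def, fn

@pipeline_def(batch_size=64, num_threads=4, device_id=0)
def decode_pipeline():
    encoded, labels = fn.readers.file(file_root="/data/images")  # hypothetical dataset
    images = fn.decoders.image(
        encoded,
        device="mixed",               # HW JPEG decoding applies only to the mixed backend
        hw_decoder_load=0.65,         # offload ~65% of each batch to the HW decoder
        preallocate_width_hint=1920,  # preallocate HW decoder buffers for ~FullHD inputs
        preallocate_height_hint=1080)
    return images, labels

pipe = decode_pipeline()
pipe.build()
```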
18 changes: 15 additions & 3 deletions dali/operators/decoder/nvjpeg/nvjpeg_decoder_decoupled_api.h
@@ -86,6 +86,9 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
spec.GetArgument<int>("device_id"),
spec.GetArgument<bool>("affine"),
"image decoder nvJPEG2k") {
+// TODO(ktokarski) TODO(jlisiecki) For now it is unused,
+// adjust NVJPEG to (full capacity of) H100
+(void) num_hw_engines_;
#if IS_HW_DECODER_COMPATIBLE
// if hw_decoder_load is not present in the schema (crop/sliceDecoder) then it is not supported
bool try_init_hw_decoder = false;
@@ -115,6 +118,14 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
#endif
LOG_LINE << "Using NVJPEG_BACKEND_HARDWARE" << std::endl;
CUDA_CALL(nvjpegJpegStateCreate(handle_, &state_hw_batched_));
+if (nvjpegIsSymbolAvailable("nvjpegGetHardwareDecoderInfo")) {
+  nvjpegGetHardwareDecoderInfo(handle_, &num_hw_engines_, &num_hw_cores_per_engine_);
+  // TODO: adjust hw_decoder_load_ based on num_hw_engines_ and num_hw_cores_per_engine_
+} else {
+  // assume pre-H100, so the defaults are as follows
+  num_hw_engines_ = 1;
+  num_hw_cores_per_engine_ = 5;
+}
if (!RestrictPinnedMemUsage()) {
hw_decoder_images_staging_.set_pinned(true);
// assume close the worst case size 300kb per image
@@ -1135,10 +1146,9 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
if (hw_decoder_load == 0.f) return 0;
auto hw_batch_size = static_cast<int>(std::round(hw_decoder_load * curr_batch_size));

-constexpr int kNumHwDecoders = 5;
-int tail = hw_batch_size % kNumHwDecoders;
+int tail = hw_batch_size % num_hw_cores_per_engine_;
if (tail > 0) {
-  hw_batch_size = hw_batch_size + kNumHwDecoders - tail;
+  hw_batch_size = hw_batch_size + num_hw_cores_per_engine_ - tail;
}
if (hw_batch_size > curr_batch_size) {
hw_batch_size = curr_batch_size;
@@ -1166,6 +1176,8 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {

// Used to ensure the work in the thread pool is picked FIFO
int64_t task_priority_seq_ = 0;
+unsigned int num_hw_engines_ = 1;
+unsigned int num_hw_cores_per_engine_ = 1;
};

} // namespace dali
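The arithmetic above pads the hardware-decoded share of the batch up to a multiple of the per-engine core count, which used to be the hard-coded kNumHwDecoders = 5 and is now queried through nvjpegGetHardwareDecoderInfo where available. A Python restatement of that rounding, with a worked example (a sketch for illustration, not DALI code):

```python
def calc_hw_decoder_batch_size(hw_decoder_load, curr_batch_size,
                               num_hw_cores_per_engine=5):  # 5 matches the pre-H100 default
    """Mirror of the padding logic in CalcHwDecoderBatchSize."""
    if hw_decoder_load == 0.0:
        return 0
    hw_batch_size = round(hw_decoder_load * curr_batch_size)
    tail = hw_batch_size % num_hw_cores_per_engine
    if tail > 0:
        # pad up so the HW sub-batch divides evenly across the decoder cores
        hw_batch_size += num_hw_cores_per_engine - tail
    return min(hw_batch_size, curr_batch_size)

# 65% load on a batch of 64: round(41.6) = 42, padded to 45, the next multiple of 5
assert calc_hw_decoder_batch_size(0.65, 64) == 45
```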
2 changes: 1 addition & 1 deletion dali/operators/sequence/optical_flow/optical_flow.cc
@@ -26,7 +26,7 @@ The main input for this operator is a sequence of frames. Optionally, the operat
can be provided with external hints for the optical flow calculation. The output format of this operator
matches the output format of the optical flow driver API.
Refer to https://developer.nvidia.com/opticalflow-sdk for more information about the
-Turing and Ampere optical flow hardware that is used by DALI.
+Turing, Ampere and Hopper optical flow hardware that is used by DALI.
)code")
.NumInput(1, 2)
.NumOutput(1)
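As a usage sketch (not part of this commit), the operator can be fed from a sequence reader; the frame directory, sequence length, and output grid below are placeholder assumptions.

```python
from nvidia.dali import pipeline_def, fn

@pipeline_def(batch_size=1, num_threads=2, device_id=0)
def optical_flow_pipeline():
    # readers.sequence expects directories of extracted frames (hypothetical path)
    frames = fn.readers.sequence(file_root="/data/frame_sequences", sequence_length=2)
    # flow vectors computed by the GPU's optical flow hardware
    return fn.optical_flow(frames.gpu(), output_grid=4)
```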
12 changes: 12 additions & 0 deletions docker/Dockerfile.cuda118.aarch64.deps
@@ -0,0 +1,12 @@
ARG TOOLKIT_BASE_IMAGE=ubuntu:20.04
FROM ${TOOLKIT_BASE_IMAGE} as cuda

ENV DEBIAN_FRONTEND=noninteractive

RUN apt update && apt install -y libxml2 curl perl gcc && \
rm -rf /var/lib/apt/lists/*

RUN curl -LO https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux_sbsa.run && \
chmod +x cuda_*.run && \
./cuda_*.run --silent --no-opengl-libs --toolkit && \
rm -f cuda_*.run;
26 changes: 26 additions & 0 deletions docker/Dockerfile.cuda118.x86_64.deps
@@ -0,0 +1,26 @@
ARG TOOLKIT_BASE_IMAGE=ubuntu:20.04
FROM ${TOOLKIT_BASE_IMAGE} as cuda

ENV DEBIAN_FRONTEND=noninteractive

RUN apt update && apt install -y libxml2 curl perl gcc && \
rm -rf /var/lib/apt/lists/*

RUN curl -LO https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run && \
chmod +x cuda_*.run && \
./cuda_*.run --silent --no-opengl-libs --toolkit && \
rm -f cuda_*.run;

RUN NVJPEG2K_VERSION=0.5.0.25-1 && \
CUFILE_VERSION=1.4.0.31-1 && \
apt-get update && \
apt-get install wget software-properties-common -y && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub && \
add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" && \
apt-get update && \
apt-get install libnvjpeg2k0=${NVJPEG2K_VERSION} libnvjpeg2k-dev=${NVJPEG2K_VERSION} -y && \
apt-get install libcufile-dev-11-8=${CUFILE_VERSION} -y && \
cp /usr/include/nvjpeg2k* /usr/local/cuda/include/ && \
cp /usr/lib/x86_64-linux-gnu/libnvjpeg2k* /usr/local/cuda/lib64/ && \
rm -rf /var/lib/apt/lists/*
8 changes: 4 additions & 4 deletions docker/build.sh
@@ -5,7 +5,7 @@ a build environment
To change build configuration please export appropriate env variables (for exact meaning please check the README):
PYVER=[default 3.6, required only by Run image]
-CUDA_VERSION=[default 11.7, accepts also 10.2, 11.0 and 11.1, 11.2, 11.3, 11.4, 11.5, 11.6]
+CUDA_VERSION=[default 11.8, accepts also 10.2, 11.0 and 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7]
NVIDIA_BUILD_ID=[default 12345]
CREATE_WHL=[default YES]
CREATE_RUNNER=[default NO]
@@ -40,16 +40,16 @@ shift $((OPTIND - 1))
export ARCH=${ARCH:-x86_64}
export PYVER=${PYVER:-3.6}
export PYV=${PYVER/./}
-export CUDA_VERSION=${CUDA_VERSION:-11.7}
+export CUDA_VERSION=${CUDA_VERSION:-11.8}
export CUDA_VER=${CUDA_VERSION//./}

if [ "${CUDA_VERSION%%\.*}" ]
then
if [ $CUDA_VER != "100" ] && [ $CUDA_VER != "102" ] && [ $CUDA_VER != "110" ] && [ $CUDA_VER != "111" ] && \
[ $CUDA_VER != "112" ] && [ $CUDA_VER != "113" ] && [ $CUDA_VER != "114" ] && [ $CUDA_VER != "115" ] && \
-   [ $CUDA_VER != "116" ]
+   [ $CUDA_VER != "116" ] && [ $CUDA_VER != "117" ] && [ $CUDA_VER != "118" ]
then
echo "Wrong CUDA_VERSION=$CUDA_VERSION provided. Only 10.0, 10.2, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5 and 11.6 are supported"
echo "Wrong CUDA_VERSION=$CUDA_VERSION provided. Only 10.0, 10.2, 11.0, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7 and 11.8 are supported"
exit 1
fi
else
8 changes: 4 additions & 4 deletions docs/compilation.rst
@@ -38,10 +38,10 @@ Building Python Wheel
Change directory (``cd``) into ``docker`` directory and run ``./build.sh``. If needed,
set the following environment variables:

-* | CUDA_VERSION - CUDA toolkit version (10.2 and 11.7 are officially supported, 10.0, 11.0, 11.1,
-  11.2, 11.4, 11.5 and 11.6 are deprecated and may not work).
-  | The default is ``11.7``. Thanks to CUDA extended compatibility mode, CUDA 11.1, 11.2, 11.3, 11.4
-  11.5, 11.6 and 11.7 wheels are named as CUDA 11.0 because it can work with the CUDA 11.0 R450.x driver
+* | CUDA_VERSION - CUDA toolkit version (10.2 and 11.8 are officially supported, 10.0, 11.0, 11.1,
+  11.2, 11.4, 11.5, 11.6 and 11.7 are deprecated and may not work).
+  | The default is ``11.8``. Thanks to CUDA extended compatibility mode, CUDA 11.1, 11.2, 11.3, 11.4
+  11.5, 11.6, 11.7 and 11.8 wheels are named as CUDA 11.0 because it can work with the CUDA 11.0 R450.x driver
family. Please update to the latest recommended driver version in that family.
| If the value of the CUDA_VERSION is prefixed with `.` then any value ``.XX.Y`` can be passed,
the supported version check is suppressed, and the user needs to make sure that
3 changes: 2 additions & 1 deletion tools/stub_generator/nvjpeg.json
@@ -40,6 +40,7 @@
"nvjpegDecodeJpegHost": {},
"nvjpegDecodeJpegDevice": {},
"nvjpegBufferDeviceCreate": {},
"nvjpegGetProperty": {}
"nvjpegGetProperty": {},
"nvjpegGetHardwareDecoderInfo": {}
}
}
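The stub generator turns this symbol list into dlopen-based wrappers, which is what lets the decoder probe for the new entry point at run time (the nvjpegIsSymbolAvailable check in the header above). An illustrative, hypothetical equivalent of that probe from Python:

```python
# Sketch only: assumes libnvjpeg.so.11 is resolvable on the loader path.
import ctypes

lib = ctypes.CDLL("libnvjpeg.so.11")
# ctypes raises AttributeError for symbols the library does not export,
# so hasattr doubles as an availability probe.
print("nvjpegGetHardwareDecoderInfo available:",
      hasattr(lib, "nvjpegGetHardwareDecoderInfo"))
```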
