Commit a61b6f9

Merge remote-tracking branch 'refs/remotes/origin/optimize_ivf_flat' into optimize_ivf_flat

mfoerste4 committed Mar 18, 2024
2 parents: 364b5d5 + 9f959ae
Showing 37 changed files with 808 additions and 361 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -1,8 +1,12 @@
# <div align="left"><img src="https://rapids.ai/assets/images/rapids_logo.png" width="90px"/>&nbsp;RAFT: Reusable Accelerated Functions and Tools for Vector Search and More</div>

> [!IMPORTANT]
> The vector search and clustering algorithms in RAFT are being migrated to a new library dedicated to vector search called [cuVS](https://github.com/rapidsai/cuvs). We will continue to support the vector search algorithms in RAFT during this move, but will no longer update them after the RAPIDS 24.06 (June) release. We plan to complete the migration by the RAPIDS 24.08 (August) release.
![RAFT tech stack](img/raft-tech-stack-vss.png)



## Contents
<hr>

@@ -77,6 +81,8 @@ Projects that use the RAFT ANNS algorithms for accelerating vector search include

Please see the example [Jupyter notebook](https://github.com/rapidsai/raft/blob/HEAD/notebooks/VectorSearch_QuestionRetrieval.ipynb) to get started with RAFT for vector search in Python.



### Information Retrieval

RAFT contains a catalog of reusable primitives for composing algorithms that require fast neighborhood computations, such as
8 changes: 5 additions & 3 deletions build.sh
@@ -1,6 +1,6 @@
#!/bin/bash

# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

# raft build scripts

@@ -305,7 +305,7 @@ if hasArg --allgpuarch; then
BUILD_ALL_GPU_ARCH=1
fi

if hasArg --compile-lib || (( ${NUMARGS} == 0 )); then
if hasArg --compile-lib || hasArg pylibraft || (( ${NUMARGS} == 0 )); then
COMPILE_LIBRARY=ON
CMAKE_TARGET="${CMAKE_TARGET};raft_lib"
fi
@@ -405,7 +405,7 @@ fi

################################################################################
# Configure for building all C++ targets
if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench-prims || hasArg bench-ann; then
if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench-prims || hasArg bench-ann || ((${COMPILE_LIBRARY} == ON )); then
if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then
RAFT_CMAKE_CUDA_ARCHITECTURES="NATIVE"
echo "Building for the architecture of the GPU in the system..."
@@ -512,6 +512,8 @@ fi

if hasArg docs; then
set -x
export RAPIDS_VERSION="$(sed -E -e 's/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2.\3/' "${REPODIR}/VERSION")"
export RAPIDS_VERSION_MAJOR_MINOR="$(sed -E -e 's/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2/' "${REPODIR}/VERSION")"
cd ${DOXYGEN_BUILD_DIR}
doxygen Doxyfile
cd ${SPHINX_BUILD_DIR}
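For context, the two sed expressions added to the docs target above both parse the repository's VERSION file: the first keeps all three two-digit fields, the second drops the patch field. A minimal C++ sketch of the same extraction — the sample value 24.04.00 is an assumption for illustration, not read from the commit:

#include <cstdio>
#include <regex>
#include <string>

int main() {
  std::string version = "24.04.00";  // hypothetical contents of ${REPODIR}/VERSION
  // Mirrors the sed pattern ^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$
  std::regex re(R"(^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$)");
  std::smatch m;
  if (std::regex_match(version, m, re)) {
    // RAPIDS_VERSION keeps all three fields; RAPIDS_VERSION_MAJOR_MINOR the first two
    std::printf("RAPIDS_VERSION=%s.%s.%s\n",
                m[1].str().c_str(), m[2].str().c_str(), m[3].str().c_str());
    std::printf("RAPIDS_VERSION_MAJOR_MINOR=%s.%s\n",
                m[1].str().c_str(), m[2].str().c_str());
  }
  return 0;
}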
6 changes: 4 additions & 2 deletions ci/build_docs.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

set -euo pipefail

@@ -28,7 +28,9 @@ rapids-mamba-retry install \
pylibraft \
raft-dask

export RAPIDS_VERSION_NUMBER="24.04"
export RAPIDS_VERSION="$(rapids-version)"
export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR"
export RAPIDS_DOCS_DIR="$(mktemp -d)"

rapids-logger "Build CPP docs"
17 changes: 1 addition & 16 deletions ci/release/update-version.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
########################
# RAFT Version Updater #
########################
@@ -36,23 +36,11 @@ function sed_runner() {
sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak
}

sed_runner "s/set(RAPIDS_VERSION .*)/set(RAPIDS_VERSION \"${NEXT_SHORT_TAG}\")/g" cpp/CMakeLists.txt
sed_runner "s/set(RAPIDS_VERSION .*)/set(RAPIDS_VERSION \"${NEXT_SHORT_TAG}\")/g" cpp/template/cmake/thirdparty/fetch_rapids.cmake
sed_runner "s/set(RAFT_VERSION .*)/set(RAFT_VERSION \"${NEXT_FULL_TAG}\")/g" cpp/CMakeLists.txt
sed_runner 's/'"pylibraft_version .*)"'/'"pylibraft_version ${NEXT_FULL_TAG})"'/g' python/pylibraft/CMakeLists.txt
sed_runner 's/'"raft_dask_version .*)"'/'"raft_dask_version ${NEXT_FULL_TAG})"'/g' python/raft-dask/CMakeLists.txt
sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake

# Centralized version file update
echo "${NEXT_FULL_TAG}" > VERSION

# Wheel testing script
sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_raft_dask.sh

# Docs update
sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py
sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py

DEPENDENCIES=(
dask-cuda
pylibraft
@@ -84,9 +72,6 @@ sed_runner "/^ucx_py_version:$/ {n;s/.*/ - \"${NEXT_UCX_PY_VERSION}\"/}" conda/
for FILE in .github/workflows/*.yaml; do
sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
done
sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh

sed_runner "/^PROJECT_NUMBER/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" cpp/doxygen/Doxyfile

sed_runner "/^set(RAFT_VERSION/ s|\".*\"|\"${NEXT_SHORT_TAG}\"|g" docs/source/build.md
sed_runner "s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" docs/source/build.md
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -39,7 +39,7 @@ dependencies:
- nccl>=2.9.9
- ninja
- numba>=0.57
- numpy>=1.23
- numpy>=1.23,<2.0a0
- numpydoc
- nvcc_linux-aarch64=11.8
- pre-commit
@@ -57,5 +57,5 @@
- sysroot_linux-aarch64==2.17
- ucx-proc=*=gpu
- ucx-py==0.37.*
- ucx>=1.13.0
- ucx>=1.15.0,<1.16.0
name: all_cuda-118_arch-aarch64
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -39,7 +39,7 @@ dependencies:
- nccl>=2.9.9
- ninja
- numba>=0.57
- numpy>=1.23
- numpy>=1.23,<2.0a0
- numpydoc
- nvcc_linux-64=11.8
- pre-commit
@@ -57,5 +57,5 @@
- sysroot_linux-64==2.17
- ucx-proc=*=gpu
- ucx-py==0.37.*
- ucx>=1.13.0
- ucx>=1.15.0,<1.16.0
name: all_cuda-118_arch-x86_64
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-122_arch-aarch64.yaml
@@ -36,7 +36,7 @@ dependencies:
- nccl>=2.9.9
- ninja
- numba>=0.57
- numpy>=1.23
- numpy>=1.23,<2.0a0
- numpydoc
- pre-commit
- pydata-sphinx-theme
@@ -53,5 +53,5 @@
- sysroot_linux-aarch64==2.17
- ucx-proc=*=gpu
- ucx-py==0.37.*
- ucx>=1.13.0
- ucx>=1.15.0,<1.16.0
name: all_cuda-122_arch-aarch64
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -36,7 +36,7 @@ dependencies:
- nccl>=2.9.9
- ninja
- numba>=0.57
- numpy>=1.23
- numpy>=1.23,<2.0a0
- numpydoc
- pre-commit
- pydata-sphinx-theme
@@ -53,5 +53,5 @@
- sysroot_linux-64==2.17
- ucx-proc=*=gpu
- ucx-py==0.37.*
- ucx>=1.13.0
- ucx>=1.15.0,<1.16.0
name: all_cuda-122_arch-x86_64
4 changes: 2 additions & 2 deletions conda/recipes/libraft/build_libraft_template.sh
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

# Just building template so we verify it uses libraft.so and fail if it doesn't build
./build.sh template
./build.sh template --no-nvtx
2 changes: 1 addition & 1 deletion conda/recipes/pylibraft/meta.yaml
@@ -65,7 +65,7 @@ requirements:
{% endif %}
- libraft {{ version }}
- libraft-headers {{ version }}
- numpy >=1.23
- numpy >=1.23,<2.0a0
- python x.x
- rmm ={{ minor_version }}

2 changes: 1 addition & 1 deletion conda/recipes/raft-dask/conda_build_config.yaml
@@ -14,7 +14,7 @@ sysroot_version:
- "2.17"

ucx_version:
- ">=1.14.1,<1.16.0"
- ">=1.15.0,<1.16.0"

ucx_py_version:
- "0.37.*"
7 changes: 2 additions & 5 deletions cpp/CMakeLists.txt
@@ -10,11 +10,8 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
set(RAPIDS_VERSION "24.04")
set(RAFT_VERSION "24.04.00")

cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
include(../fetch_rapids.cmake)
include(../rapids_config.cmake)
include(rapids-cmake)
include(rapids-cpm)
include(rapids-export)
@@ -34,7 +31,7 @@ endif()

project(
RAFT
VERSION ${RAFT_VERSION}
VERSION "${RAPIDS_VERSION}"
LANGUAGES ${lang_list}
)

2 changes: 1 addition & 1 deletion cpp/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "RAFT C++ API"
# could be handy for archiving the generated documentation or if some version
# control system is used.

PROJECT_NUMBER = "24.04"
PROJECT_NUMBER = "$(RAPIDS_VERSION_MAJOR_MINOR)"

# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
119 changes: 66 additions & 53 deletions cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh
@@ -91,18 +91,11 @@ std::enable_if_t<ops::has_cutlass_op<OpT>::value> cutlassDistanceKernel(const Da

typename EpilogueOutputOp::Params epilog_op_param(dist_op, fin_op);

const DataT *a, *b;

IdxT gemm_lda, gemm_ldb;

// Number of pipelines you want to use
constexpr int NumStages = 3;
// Alignment
constexpr int Alignment = VecLen;

// default initialize problem size with row major inputs
auto problem_size = cutlass::gemm::GemmCoord(n, m, k);

using cutlassDistKernel =
typename cutlass::gemm::kernel::PairwiseDistanceGemm<DataT,
Alignment,
@@ -116,53 +109,73 @@ std::enable_if_t<ops::has_cutlass_op<OpT>::value> cutlassDistanceKernel(const Da

using cutlassDist = cutlass::gemm::device::GemmUniversalAdapter<cutlassDistKernel>;

if constexpr (isRowMajor) {
a = y;
b = x;
gemm_lda = ldb;
gemm_ldb = lda;
} else {
problem_size = cutlass::gemm::GemmCoord(m, n, k);
a = x;
b = y;
gemm_lda = lda;
gemm_ldb = ldb;
constexpr uint32_t gridYZMax = ((1 << (sizeof(uint16_t) * 8)) - 1);
constexpr uint32_t max_batch_size = gridYZMax * cutlassDistKernel::ThreadblockShape::kN;
IdxT numNbatches = (n - 1 + max_batch_size) / max_batch_size;

for (IdxT i = 0; i < numNbatches; i++) {
const DataT *a, *b;
IdxT gemm_lda, gemm_ldb;
size_t offsetN = i * max_batch_size;

if constexpr (isRowMajor) {
gemm_lda = ldb;
gemm_ldb = lda;
a = y + offsetN * gemm_lda;
b = x;
} else {
gemm_lda = lda;
gemm_ldb = ldb;
a = x;
b = y + offsetN;
}
IdxT chunkN = (i + 1) * max_batch_size;
IdxT currentN = (chunkN < n) ? max_batch_size : (n - offsetN);

// default initialize problem size with row major inputs
auto problem_size = isRowMajor ? cutlass::gemm::GemmCoord(currentN, m, k)
: cutlass::gemm::GemmCoord(m, currentN, k);

typename cutlassDist::Arguments arguments{
mode,
problem_size,
batch_count,
epilog_op_param,
a,
b,
xn, // C matrix eq vector param, which here is A norm
nullptr, // tensor_Z,
(DataT*)yn + offsetN, // this is broadcast vec, which is required to be non-const param
dOutput + offsetN, // Output distance matrix
(int64_t)0, // batch stride A
(int64_t)0, // batch stride B
(int64_t)0, // batch stride Norm A
(int64_t)0,
(int64_t)0, // batch stride Norm B
(int64_t)0, // batch stride Output
gemm_lda, // stride A
gemm_ldb, // stride B
1, // stride A norm
0, // this is no-op for Z
0, // This must be zero
ldd // stride Output matrix
};

// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = cutlassDist::get_workspace_size(arguments);
// Allocate workspace memory
rmm::device_uvector<uint8_t> workspace(workspace_size, stream);
// Instantiate CUTLASS kernel depending on templates
cutlassDist cutlassDist_op;
// Check the problem size is supported or not
RAFT_CUTLASS_TRY(cutlassDist_op.can_implement(arguments));

// Initialize CUTLASS kernel with arguments and workspace pointer
RAFT_CUTLASS_TRY(cutlassDist_op.initialize(arguments, workspace.data(), stream));

// Launch initialized CUTLASS kernel
RAFT_CUTLASS_TRY(cutlassDist_op(stream));
}

typename cutlassDist::Arguments arguments{
mode, problem_size, batch_count, epilog_op_param, a, b,
xn, // C matrix eq vector param, which here is A norm
nullptr, // tensor_Z,
(DataT*)yn, // this is broadcast vec, which is required to be non-const param
dOutput, // Output distance matrix
(int64_t)0, // batch stride A
(int64_t)0, // batch stride B
(int64_t)0, // batch stride Norm A
(int64_t)0,
(int64_t)0, // batch stride Norm B
(int64_t)0, // batch stride Output
gemm_lda, // stride A
gemm_ldb, // stride B
1, // stride A norm
0, // this is no-op for Z
0, // This must be zero
ldd // stride Output matrix
};

// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = cutlassDist::get_workspace_size(arguments);
// Allocate workspace memory
rmm::device_uvector<uint8_t> workspace(workspace_size, stream);
// Instantiate CUTLASS kernel depending on templates
cutlassDist cutlassDist_op;
// Check the problem size is supported or not
RAFT_CUTLASS_TRY(cutlassDist_op.can_implement(arguments));

// Initialize CUTLASS kernel with arguments and workspace pointer
RAFT_CUTLASS_TRY(cutlassDist_op.initialize(arguments, workspace.data(), stream));

// Launch initialized CUTLASS kernel
RAFT_CUTLASS_TRY(cutlassDist_op(stream));
}

}; // namespace detail
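The loop introduced above exists because CUDA caps grid y and z dimensions at 65535 blocks (the uint16_t maximum computed as gridYZMax), so n is processed in chunks of at most gridYZMax * ThreadblockShape::kN columns, with a/b, yn, and dOutput advanced by offsetN per chunk. A standalone sketch of just that chunking arithmetic — kN and n below are illustrative assumptions, not values taken from the commit:

#include <cstdint>
#include <cstdio>

int main() {
  using IdxT = int64_t;
  constexpr uint32_t gridYZMax      = (1u << 16) - 1;       // 65535 threadblocks
  constexpr uint32_t kN             = 32;                   // hypothetical ThreadblockShape::kN
  constexpr uint32_t max_batch_size = gridYZMax * kN;       // max columns per launch

  IdxT n = IdxT{3} * max_batch_size + 1000;                 // hypothetical total columns
  IdxT num_batches = (n - 1 + max_batch_size) / max_batch_size;  // ceil(n / max_batch_size)

  for (IdxT i = 0; i < num_batches; i++) {
    IdxT offsetN   = i * max_batch_size;                    // column offset of chunk i
    IdxT chunk_end = (i + 1) * max_batch_size;
    IdxT currentN  = (chunk_end < n) ? IdxT{max_batch_size} : (n - offsetN);  // last chunk is partial
    std::printf("chunk %lld: offset=%lld cols=%lld\n",
                (long long)i, (long long)offsetN, (long long)currentN);
  }
  return 0;
}

Each chunk then gets its own cutlassDist::Arguments and kernel launch, as in the diff above, so no single launch exceeds the grid-dimension limit.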
4 changes: 2 additions & 2 deletions cpp/include/raft/neighbors/ball_cover-inl.cuh
@@ -333,7 +333,7 @@ void eps_nn(raft::resources const& handle,
query.extent(0),
adj.data_handle(),
vd.data_handle(),
spatial::knn::detail::EuclideanFunc<value_t, int_t>());
spatial::knn::detail::EuclideanSqFunc<value_t, int_t>());
}

/**
@@ -392,7 +392,7 @@ void eps_nn(raft::resources const& handle,
adj_ia.data_handle(),
adj_ja.data_handle(),
vd.data_handle(),
spatial::knn::detail::EuclideanFunc<value_t, int_t>());
spatial::knn::detail::EuclideanSqFunc<value_t, int_t>());
}

/**
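Both eps_nn call sites now pass EuclideanSqFunc in place of EuclideanFunc. Presumably — an inference from the functor names, not something this diff states — candidate filtering then happens in squared-distance space, which is cheaper because thresholding against eps² is equivalent to thresholding the true distance against eps and needs no square root. A minimal sketch of that equivalence, using a hypothetical helper rather than RAFT's API:

#include <cmath>
#include <cstdio>

// Squared L2 distance: no sqrt required.
float sq_l2(const float* x, const float* y, int dim) {
  float acc = 0.f;
  for (int i = 0; i < dim; i++) {
    float diff = x[i] - y[i];
    acc += diff * diff;
  }
  return acc;
}

int main() {
  float x[] = {0.f, 0.f}, y[] = {3.f, 4.f};  // true distance = 5
  float eps = 5.5f;
  bool in_ball_sq = sq_l2(x, y, 2) <= eps * eps;         // 25 <= 30.25 -> true
  bool in_ball    = std::sqrt(sq_l2(x, y, 2)) <= eps;    // same predicate, with sqrt
  std::printf("%d %d\n", in_ball_sq, in_ball);           // both print 1
  return 0;
}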
(Remaining file diffs not loaded.)
