diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 3f869da9a..a529b27e2 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -25,6 +25,8 @@ dependencies: - glog>=0.6.0 - h5py>=3.8.0 - hnswlib=0.6.2 +- libaio +- libboost-devel - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcurand-dev=10.3.0.86 @@ -35,6 +37,7 @@ dependencies: - libcusparse=11.7.5.86 - librmm==24.12.*,>=0.0.0a0 - matplotlib +- mkl-devel - nccl>=2.19 - ninja - nlohmann_json>=3.11.2 diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml index 81943b184..2ce3b5d7e 100644 --- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml @@ -26,12 +26,15 @@ dependencies: - glog>=0.6.0 - h5py>=3.8.0 - hnswlib=0.6.2 +- libaio +- libboost-devel - libcublas-dev - libcurand-dev - libcusolver-dev - libcusparse-dev - librmm==24.12.*,>=0.0.0a0 - matplotlib +- mkl-devel - nccl>=2.19 - ninja - nlohmann_json>=3.11.2 diff --git a/conda/recipes/cuvs-bench-cpu/meta.yaml b/conda/recipes/cuvs-bench-cpu/meta.yaml index 02c11346f..9aa1fa17d 100644 --- a/conda/recipes/cuvs-bench-cpu/meta.yaml +++ b/conda/recipes/cuvs-bench-cpu/meta.yaml @@ -47,6 +47,9 @@ requirements: - benchmark - fmt {{ fmt_version }} - glog {{ glog_version }} + - libaio + - libboost-devel + - mkl-devel # [linux64] - nlohmann_json {{ nlohmann_json_version }} - openblas - python diff --git a/conda/recipes/cuvs-bench/meta.yaml b/conda/recipes/cuvs-bench/meta.yaml index 3e81edc58..0dcf2b64b 100644 --- a/conda/recipes/cuvs-bench/meta.yaml +++ b/conda/recipes/cuvs-bench/meta.yaml @@ -72,7 +72,10 @@ requirements: - libcublas-dev {% endif %} - glog {{ glog_version }} + - libaio + - libboost-devel - libcuvs {{ version }} + - mkl-devel # [linux64] - nlohmann_json {{ nlohmann_json_version }} - openblas # rmm is needed to determine if package is gpu-enabled diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 0f6b42ae9..ac8eff20a 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -32,6 +32,12 @@ option(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE "Include cuVS brute force knn in benc option(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB "Include cuVS CAGRA with HNSW search in benchmark" ON) option(CUVS_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) option(CUVS_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" OFF) +option(CUVS_ANN_BENCH_USE_DISKANN "Include DISKANN search in benchmark" ON) +option(CUVS_ANN_BENCH_USE_CUVS_VAMANA "Include cuVS Vamana with DiskANN search in benchmark" ON) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ARM|arm|aarch64)") + set(CUVS_ANN_BENCH_USE_DISKANN OFF) + set(CUVS_ANN_BENCH_USE_CUVS_VAMANA OFF) +endif() option(CUVS_ANN_BENCH_USE_CUVS_MG "Include cuVS ann mg algorithm in benchmark" ${BUILD_MG_ALGOS}) option(CUVS_ANN_BENCH_SINGLE_EXE "Make a single executable with benchmark as shared library modules" OFF @@ -57,6 +63,7 @@ if(BUILD_CPU_ONLY) set(CUVS_ANN_BENCH_USE_GGNN OFF) set(CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE OFF) set(CUVS_ANN_BENCH_USE_CUVS_MG OFF) + set(CUVS_ANN_BENCH_USE_CUVS_VAMANA OFF) else() set(CUVS_FAISS_ENABLE_GPU ON) endif() @@ -69,6 +76,7 @@ if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ OR CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB OR CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE OR CUVS_ANN_BENCH_USE_CUVS_MG + OR 
CUVS_ANN_BENCH_USE_CUVS_VAMANA ) set(CUVS_ANN_BENCH_USE_CUVS ON) endif() @@ -90,6 +98,10 @@ if(CUVS_ANN_BENCH_USE_FAISS) include(cmake/thirdparty/get_faiss) endif() +if(CUVS_ANN_BENCH_USE_DISKANN OR CUVS_ANN_BENCH_USE_CUVS_VAMANA) + include(cmake/thirdparty/get_diskann) +endif() + # ################################################################################################## # * Target function ------------------------------------------------------------- @@ -290,6 +302,17 @@ if(CUVS_ANN_BENCH_USE_GGNN) ) endif() +if(CUVS_ANN_BENCH_USE_DISKANN) + ConfigureAnnBench( + NAME DISKANN_MEMORY PATH src/diskann/diskann_benchmark.cpp LINKS diskann::diskann + ) + ConfigureAnnBench(NAME DISKANN_SSD PATH src/diskann/diskann_benchmark.cpp LINKS diskann::diskann) +endif() + +if(CUVS_ANN_BENCH_USE_CUVS_VAMANA) + ConfigureAnnBench(NAME CUVS_VAMANA PATH src/cuvs/cuvs_vamana.cu LINKS cuvs diskann::diskann) +endif() + # ################################################################################################## # * Dynamically-loading ANN_BENCH executable ------------------------------------------------------- if(CUVS_ANN_BENCH_SINGLE_EXE) diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 06e1e27af..7f507cd22 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -135,6 +135,12 @@ void bench_build(::benchmark::State& state, } } + if (index.algo == "diskann_ssd") { + make_sure_parent_dir_exists(index.file); + index.build_param["dataset_file"] = dataset->base_filename(); + index.build_param["path_to_index"] = index.file; + } + std::unique_ptr> algo; try { algo = create_algo(index.algo, dataset->distance(), dataset->dim(), index.build_param); @@ -144,7 +150,8 @@ void bench_build(::benchmark::State& state, const auto algo_property = parse_algo_property(algo->get_preference(), index.build_param); - const T* base_set = dataset->base_set(algo_property.dataset_memory_type); + const T* base_set = nullptr; + if (index.algo != "diskann_ssd") base_set = dataset->base_set(algo_property.dataset_memory_type); std::size_t index_size = dataset->base_set_size(); cuda_timer gpu_timer{algo}; @@ -223,7 +230,12 @@ void bench_search(::benchmark::State& state, const T* query_set = nullptr; - if (!file_exists(index.file)) { + std::string filename; + if (index.algo != "diskann_ssd") + filename = index.file; + else + filename = index.file + "_disk.index"; + if (!file_exists(filename)) { state.SkipWithError("Index file is missing. 
Run the benchmark in the build mode first."); return; } diff --git a/cpp/bench/ann/src/common/dataset.hpp b/cpp/bench/ann/src/common/dataset.hpp index 49020fe36..c3f565f61 100644 --- a/cpp/bench/ann/src/common/dataset.hpp +++ b/cpp/bench/ann/src/common/dataset.hpp @@ -114,6 +114,8 @@ class bin_file { } } + std::string file() const { return file_; } + private: void check_suffix(); void open_file() const; @@ -253,10 +255,11 @@ class dataset { auto name() const -> std::string { return name_; } auto distance() const -> std::string { return distance_; } - virtual auto dim() const -> int = 0; - virtual auto max_k() const -> uint32_t = 0; - virtual auto base_set_size() const -> size_t = 0; - virtual auto query_set_size() const -> size_t = 0; + virtual auto dim() const -> int = 0; + virtual auto max_k() const -> uint32_t = 0; + virtual auto base_set_size() const -> size_t = 0; + virtual auto query_set_size() const -> size_t = 0; + virtual auto base_filename() const -> std::string = 0; // load data lazily, so don't pay the overhead of reading unneeded set // e.g. don't load base set when searching @@ -424,6 +427,7 @@ class bin_dataset : public dataset { auto max_k() const -> uint32_t override; auto base_set_size() const -> size_t override; auto query_set_size() const -> size_t override; + std::string base_filename() const override; private: void load_base_set() const; @@ -541,4 +545,10 @@ void bin_dataset::map_base_set() const this->mapped_base_set_ = base_file_.map(); } +template +std::string bin_dataset::base_filename() const +{ + return base_file_.file(); +} + } // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_diskann_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_diskann_wrapper.h new file mode 100644 index 000000000..566d35897 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_diskann_wrapper.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "../hnswlib/hnswlib_wrapper.h" +#include "cuvs_cagra_wrapper.h" + +#include + +namespace cuvs::bench { + +template +class cuvs_cagra_diskann : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + using build_param = typename cuvs_cagra::build_param; + using search_param = typename diskann_mem::search_param; + + cuvs_cagra_diskann(Metric metric, int dim, const build_param& param) + : algo(metric, dim), + cagra_build_{metric, dim, param}, + // hnsw_lib param values don't matter since we don't build with hnsw_lib + diskann_mem_search_{metric, dim, typename diskann_mem::build_param{50, 100}} + { + } + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return cagra_build_.get_sync_stream(); + } + + // to enable dataset access from GPU memory + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHostMmap; + property.query_memory_type = MemoryType::kHost; + return property; + } + + void save(const std::string& file) const override; + void load(const std::string&) override; + std::unique_ptr> copy() override + { + return std::make_unique>(*this); + } + + private: + cuvs_cagra cagra_build_; + hnsw_lib hnswlib_search_; +}; + +template +void cuvs_cagra_hnswlib::build(const T* dataset, size_t nrow) +{ + cagra_build_.build(dataset, nrow); +} + +template +void cuvs_cagra_hnswlib::set_search_param(const search_param_base& param_) +{ + hnswlib_search_.set_search_param(param_); +} + +template +void cuvs_cagra_hnswlib::save(const std::string& file) const +{ + cagra_build_.save_to_hnswlib(file); +} + +template +void cuvs_cagra_hnswlib::load(const std::string& file) +{ + hnswlib_search_.load(file); + hnswlib_search_.set_base_layer_only(); +} + +template +void cuvs_cagra_hnswlib::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + hnswlib_search_.search(queries, batch_size, k, neighbors, distances); +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_vamana.cu b/cpp/bench/ann/src/cuvs/cuvs_vamana.cu new file mode 100644 index 000000000..7c30a5420 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_vamana.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../common/ann_types.hpp" +#include "cuvs_vamana_wrapper.h" + +#include +#include +#include + +namespace cuvs::bench { + +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_vamana::build_param& param) +{ + if (conf.contains("graph_degree")) { param.graph_degree = conf.at("graph_degree"); } + if (conf.contains("visited_size")) { param.visited_size = conf.at("visited_size"); } + if (conf.contains("alpha")) { param.alpha = conf.at("alpha"); } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_vamana::search_param& param) +{ + if (conf.contains("L_search")) { param.L_search = conf.at("L_search"); } + if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); } +} + +template +auto create_algo(const std::string& algo_name, + const std::string& distance, + int dim, + const nlohmann::json& conf) -> std::unique_ptr> +{ + [[maybe_unused]] cuvs::bench::Metric metric = parse_metric(distance); + std::unique_ptr> a; + + if constexpr (std::is_same_v or std::is_same_v) { + if (algo_name == "cuvs_vamana") { + typename cuvs::bench::cuvs_vamana::build_param param; + parse_build_param(conf, param); + a = std::make_unique>(metric, dim, param); + } + } + + if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); } + + return a; +} + +template +auto create_search_param(const std::string& algo_name, const nlohmann::json& conf) + -> std::unique_ptr::search_param> +{ + if (algo_name == "cuvs_vamana") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } + + throw std::runtime_error("invalid algo: '" + algo_name + "'"); +} + +} // namespace cuvs::bench + +REGISTER_ALGO_INSTANCE(float); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +/* +[NOTE] Dear developer, + +Please don't modify the content of the `main` function; this will make the behavior of the benchmark +executable differ depending on the cmake flags and will complicate the debugging. In particular, +don't try to setup an RMM memory resource here; it will anyway be modified by the memory resource +set on per-algorithm basis. For example, see `cuvs/cuvs_ann_bench_utils.h`. +*/ +int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); } +#endif diff --git a/cpp/bench/ann/src/cuvs/cuvs_vamana_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_vamana_wrapper.h new file mode 100644 index 000000000..d6c2867f1 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_vamana_wrapper.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "../common/ann_types.hpp" +#include "../diskann/diskann_wrapper.h" +#include "cuvs_ann_bench_utils.h" +#include + +#include +#include +#include + +namespace cuvs::bench { + +template +class cuvs_vamana : public algo, public algo_gpu { + public: + using build_param = cuvs::neighbors::experimental::vamana::index_params; + using search_param_base = typename algo::search_param; + using search_param = typename diskann_memory::search_param; + + cuvs_vamana(Metric metric, int dim, const build_param& param); + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return handle_.get_sync_stream(); + } + + // to enable dataset access from GPU memory + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kDevice; + property.query_memory_type = MemoryType::kHost; + return property; + } + + void save(const std::string& file) const override; + void load(const std::string&) override; + std::unique_ptr> copy() override { return std::make_unique>(*this); } + + private: + std::shared_ptr> vamana_index_; + std::shared_ptr> diskann_memory_search_; + configured_raft_resources handle_{}; + build_param vamana_index_params_; +}; + +template +cuvs_vamana::cuvs_vamana(Metric metric, int dim, const build_param& param) + : algo(metric, dim) +{ + this->vamana_index_params_ = param; + diskann_memory_search_ = std::make_shared>( + metric, dim, typename diskann_memory::build_param{param.graph_degree, param.visited_size}); +} + +template +void cuvs_vamana::build(const T* dataset, size_t nrow) +{ + auto dataset_view_host = raft::make_mdspan( + dataset, raft::make_extents(nrow, this->dim_)); + auto dataset_view_device = raft::make_mdspan( + dataset, raft::make_extents(nrow, this->dim_)); + bool dataset_is_on_host = raft::get_device_for_address(dataset) == -1; + + vamana_index_ = std::make_shared>( + std::move(dataset_is_on_host ? cuvs::neighbors::experimental::vamana::build( + handle_, vamana_index_params_, dataset_view_host) + : cuvs::neighbors::experimental::vamana::build( + handle_, vamana_index_params_, dataset_view_device))); +} + +template +void cuvs_vamana::set_search_param(const search_param_base& param_) +{ + diskann_memory_search_->set_search_param(param_); +} + +template +void cuvs_vamana::save(const std::string& file) const +{ + cuvs::neighbors::experimental::vamana::serialize(handle_, file, *vamana_index_); +} + +template +void cuvs_vamana::load(const std::string& file) +{ + diskann_memory_search_->load(file); +} + +template +void cuvs_vamana::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + diskann_memory_search_->search(queries, batch_size, k, neighbors, distances); +} + +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/diskann/diskann_benchmark.cpp b/cpp/bench/ann/src/diskann/diskann_benchmark.cpp new file mode 100644 index 000000000..ca8a94048 --- /dev/null +++ b/cpp/bench/ann/src/diskann/diskann_benchmark.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../common/ann_types.hpp"
+#include "diskann_wrapper.h"
+
+#define JSON_DIAGNOSTICS 1
+#include <nlohmann/json.hpp>
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cuvs::bench {
+
+template <typename T>
+void parse_build_param(const nlohmann::json& conf,
+                       typename cuvs::bench::diskann_memory<T>::build_param& param)
+{
+  param.R = conf.at("R");
+  if (conf.contains("L_build")) { param.L_build = conf.at("L_build"); }
+  if (conf.contains("alpha")) { param.alpha = conf.at("alpha"); }
+  if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); }
+}
+
+template <typename T>
+void parse_build_param(const nlohmann::json& conf,
+                       typename cuvs::bench::diskann_ssd<T>::build_param& param)
+{
+  param.R = conf.at("R");
+  if (conf.contains("L_build")) { param.L_build = conf.at("L_build"); }
+  if (conf.contains("alpha")) { param.alpha = conf.at("alpha"); }
+  if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); }
+  if (conf.contains("QD")) { param.QD = conf.at("QD"); }
+  if (conf.contains("dataset_file")) { param.dataset_file = conf.at("dataset_file"); }
+  if (conf.contains("path_to_index")) { param.path_to_index = conf.at("path_to_index"); }
+}
+
+template <typename T>
+void parse_search_param(const nlohmann::json& conf,
+                        typename cuvs::bench::diskann_memory<T>::search_param& param)
+{
+  param.L_search    = conf.at("L_search");
+  param.num_threads = conf.at("num_threads");
+}
+
+template <typename T>
+void parse_search_param(const nlohmann::json& conf,
+                        typename cuvs::bench::diskann_ssd<T>::search_param& param)
+{
+  param.L_search    = conf.at("L_search");
+  param.num_threads = conf.at("num_threads");
+  if (conf.contains("num_nodes_to_cache")) {
+    param.num_nodes_to_cache = conf.at("num_nodes_to_cache");
+  }
+  if (conf.contains("beam_width")) { param.beam_width = conf.at("beam_width"); }
+}
+
+template <typename T, template <typename> class Algo>
+std::unique_ptr<cuvs::bench::algo<T>> make_algo(cuvs::bench::Metric metric,
+                                                int dim,
+                                                const nlohmann::json& conf)
+{
+  typename Algo<T>::build_param param;
+  parse_build_param<T>(conf, param);
+  return std::make_unique<Algo<T>>(metric, dim, param);
+}
+
+template <typename T>
+auto create_algo(const std::string& algo_name,
+                 const std::string& distance,
+                 int dim,
+                 const nlohmann::json& conf) -> std::unique_ptr<cuvs::bench::algo<T>>
+{
+  cuvs::bench::Metric metric = parse_metric(distance);
+  std::unique_ptr<cuvs::bench::algo<T>> a;
+
+  if constexpr (std::is_same_v<T, float> || std::is_same_v<T, uint8_t> ||
+                std::is_same_v<T, int8_t>) {
+    if (algo_name == "diskann_memory") {
+      a = make_algo<T, cuvs::bench::diskann_memory>(metric, dim, conf);
+    } else if (algo_name == "diskann_ssd") {
+      a = make_algo<T, cuvs::bench::diskann_ssd>(metric, dim, conf);
+    }
+  }
+  if (!a) { throw std::runtime_error("invalid algo: '" + algo_name + "'"); }
+
+  return a;
+}
+
+template <typename T>
+std::unique_ptr<typename cuvs::bench::algo<T>::search_param> create_search_param(
+  const std::string& algo_name, const nlohmann::json& conf)
+{
+  if (algo_name == "diskann_memory") {
+    auto param = std::make_unique<typename cuvs::bench::diskann_memory<T>::search_param>();
+    parse_search_param<T>(conf, *param);
+    return param;
+  } else if (algo_name == "diskann_ssd") {
+    auto param = std::make_unique<typename cuvs::bench::diskann_ssd<T>::search_param>();
+    parse_search_param<T>(conf, *param);
+    return param;
+  }
+  throw std::runtime_error("invalid algo: '" + algo_name + "'");
+}
+
+}; // namespace cuvs::bench + +REGISTER_ALGO_INSTANCE(float); + +#ifdef ANN_BENCH_BUILD_MAIN +#include "../common/benchmark.hpp" +int main(int argc, char** argv) { return cuvs::bench::run_main(argc, argv); } +#endif \ No newline at end of file diff --git a/cpp/bench/ann/src/diskann/diskann_wrapper.h b/cpp/bench/ann/src/diskann/diskann_wrapper.h new file mode 100644 index 000000000..79f207c81 --- /dev/null +++ b/cpp/bench/ann/src/diskann/diskann_wrapper.h @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../common/ann_types.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cuvs::bench { + +diskann::Metric parse_metric_to_diskann(cuvs::bench::Metric metric) +{ + if (metric == cuvs::bench::Metric::kInnerProduct) { + return diskann::Metric::INNER_PRODUCT; + } else if (metric == cuvs::bench::Metric::kEuclidean) { + return diskann::Metric::L2; + } else { + throw std::runtime_error("currently only inner product and L2 supported for benchmarking"); + } +} + +template +class diskann_memory : public algo { + public: + struct build_param { + uint32_t R; + uint32_t L_build; + uint32_t build_pq_bytes = 0; + float alpha = 1.2; + int num_threads = omp_get_num_procs(); + }; + + using search_param_base = typename algo::search_param; + struct search_param : public search_param_base { + uint32_t L_search; + uint32_t num_threads = omp_get_num_procs(); + // Mode metric_objective; + }; + + diskann_memory(Metric metric, int dim, const build_param& param); + + void build(const T* dataset, size_t nrow) override; + + void set_search_param(const search_param_base& param) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* indices, + float* distances) const override; + + void save(const std::string& path_to_index) const override; + void load(const std::string& path_to_index) override; + diskann_memory(const diskann_memory& other) = default; + std::unique_ptr> copy() override { return std::make_unique>(*this); } + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kHost; + return property; + } + + private: + std::shared_ptr diskann_index_write_params_{nullptr}; + uint32_t max_points_; + uint32_t build_pq_bytes_ = 0; + int num_threads_; + uint32_t L_search_; + Mode bench_mode_; + int num_search_threads_; + std::string index_path_prefix_; + std::shared_ptr> mem_index_{nullptr}; + void initialize_index_(size_t max_points); +}; + +template +diskann_memory::diskann_memory(Metric metric, int dim, const build_param& param) + : algo(metric, dim) +{ + assert(this->dim_ > 0); + num_threads_ = param.num_threads; + diskann_index_write_params_ = std::make_shared( + diskann::IndexWriteParametersBuilder(param.L_build, param.R) + .with_filter_list_size(0) + 
.with_alpha(param.alpha) + .with_saturate_graph(false) + .with_num_threads(param.num_threads) + .build()); +} + +template +void diskann_memory::initialize_index_(size_t max_points) +{ + this->mem_index_ = std::make_shared>(parse_metric_to_diskann(this->metric_), + this->dim_, + max_points, + diskann_index_write_params_, + nullptr, + 0, + false, + false, + false, + build_pq_bytes_ > 0, + build_pq_bytes_, + false, + false); +} +template +void diskann_memory::build(const T* dataset, size_t nrow) +{ + initialize_index_(nrow); + mem_index_->build(dataset, nrow, std::vector()); +} + +template +void diskann_memory::set_search_param(const search_param_base& param_) +{ + auto param = dynamic_cast(param_); + L_search_ = param.L_search; + num_search_threads_ = param.num_threads; + + // only latency mode supported. Use the num_threads search param to run search with multiple + // threads + bench_mode_ = Mode::kLatency; + + // Create a pool if multiple query threads have been set and the pool hasn't been created already + initialize_index_(0); + this->mem_index_->load(index_path_prefix_.c_str(), num_search_threads_, L_search_); +} + +template +void diskann_memory::search( + const T* queries, int batch_size, int k, algo_base::index_type* indices, float* distances) const +{ +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < batch_size; i++) { + mem_index_->search(queries + i * this->dim_, + static_cast(k), + L_search_, + reinterpret_cast(indices + i * k), + distances + i * k); + } +} + +template +void diskann_memory::save(const std::string& path_to_index) const +{ + this->mem_index_->save(path_to_index.c_str()); +} + +template +void diskann_memory::load(const std::string& path_to_index) +{ + // only save the index path prefix here + index_path_prefix_ = path_to_index; +} + +template +class diskann_ssd : public algo { + public: + struct build_param { + uint32_t R; + uint32_t L_build; + uint32_t build_pq_bytes = 0; + float alpha = 1.2; + int num_threads = omp_get_num_procs(); + uint32_t QD = 192; + std::string dataset_file = ""; + std::string path_to_index = ""; + }; + using search_param_base = typename algo::search_param; + + struct search_param : public search_param_base { + uint32_t L_search; + uint32_t num_threads = omp_get_num_procs() / 2; + uint32_t num_nodes_to_cache = 10000; + int beam_width = 2; + // Mode metric_objective; + }; + + diskann_ssd(Metric metric, int dim, const build_param& param); + + void build(const T* dataset, size_t nrow) override; + + void set_search_param(const search_param_base& param) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + + void save(const std::string& path_to_index) const override; + void load(const std::string& path_to_index) override; + diskann_ssd(const diskann_ssd& other) = default; + std::unique_ptr> copy() override { return std::make_unique>(*this); } + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kHost; + return property; + } + + private: + std::string index_build_params_str; + std::shared_ptr> p_flash_index_; + int beam_width_; + uint32_t num_nodes_to_cache_; + + // in-memory index params + uint32_t build_pq_bytes_ = 0; + uint32_t max_points_; + // for safe scratch space allocs, set the default to half the number of procs for loading the + // index. 
User must ensure that the number of search threads is less than or equal to this value + int num_search_threads_ = omp_get_num_procs() / 2; + // L_search is hardcoded to the maximum visited list size in the search params. This default is + // for loading the index + uint32_t L_search_ = 384; + Mode bench_mode_; + std::string base_file_; + std::string index_path_prefix_; + std::shared_ptr reader = nullptr; +}; + +template +diskann_ssd::diskann_ssd(Metric metric, int dim, const build_param& param) : algo(metric, dim) +{ + // Currently set the indexing RAM budget and the search RAM budget to max value to avoid sharding + uint32_t build_dram_budget = std::numeric_limits::max(); + uint32_t search_dram_budget = std::numeric_limits::max(); + index_build_params_str = + std::string(std::to_string(param.R)) + " " + std::string(std::to_string(param.L_build)) + " " + + std::string(std::to_string(search_dram_budget)) + " " + + std::string(std::to_string(build_dram_budget)) + " " + + std::string(std::to_string(param.num_threads)) + " " + std::string(std::to_string(false)) + + " " + std::string(std::to_string(false)) + " " + std::string(std::to_string(0)) + " " + + std::string(std::to_string(param.QD)); + base_file_ = param.dataset_file; + index_path_prefix_ = param.path_to_index; +} + +template +void diskann_ssd::build(const T* dataset, size_t nrow) +{ + diskann::build_disk_index(base_file_.c_str(), + index_path_prefix_.c_str(), + index_build_params_str.c_str(), + parse_metric_to_diskann(this->metric_), + false, + std::string(""), + false, + std::string(""), + std::string(""), + 0, + 0); +} + +template +void diskann_ssd::set_search_param(const search_param_base& param_) +{ + auto param = dynamic_cast(param_); + L_search_ = param.L_search; + num_search_threads_ = param.num_threads; + num_nodes_to_cache_ = param.num_nodes_to_cache; + beam_width_ = param.beam_width; + + // only latency mode supported with thread pool + bench_mode_ = Mode::kLatency; +} + +template +void diskann_ssd::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ +#pragma omp parallel for schedule(dynamic, 1) + for (int64_t i = 0; i < (int64_t)batch_size; i++) { + p_flash_index_->cached_beam_search(queries + (i * this->dim_), + static_cast(k), + L_search_, + reinterpret_cast(neighbors + i * k), + distances + i * k, + beam_width_, + false, + nullptr); + } +} + +template +void diskann_ssd::save(const std::string& path_to_index) const +{ + // Nothing to do here. Index already saved in build stage. 
+} + +template +void diskann_ssd::load(const std::string& path_to_index) +{ + reader.reset(new LinuxAlignedFileReader()); + p_flash_index_ = + std::make_shared>(reader, parse_metric_to_diskann(this->metric_)); + int result = p_flash_index_->load(num_search_threads_, path_to_index.c_str()); + std::vector node_list; + p_flash_index_->cache_bfs_levels(num_nodes_to_cache_, node_list); + p_flash_index_->load_cache_list(node_list); + node_list.clear(); + node_list.shrink_to_fit(); +} +}; // namespace cuvs::bench diff --git a/cpp/cmake/patches/diskann.diff b/cpp/cmake/patches/diskann.diff new file mode 100644 index 000000000..466b24d22 --- /dev/null +++ b/cpp/cmake/patches/diskann.diff @@ -0,0 +1,227 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 3d3d2b8..3079d12 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -145,62 +145,14 @@ if (MSVC) + "${DISKANN_MKL_LIB_PATH}/mkl_intel_thread.lib") + else() + # expected path for manual intel mkl installs +- set(POSSIBLE_OMP_PATHS "/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin/libiomp5.so;/usr/lib/x86_64-linux-gnu/libiomp5.so;/opt/intel/lib/intel64_lin/libiomp5.so") +- foreach(POSSIBLE_OMP_PATH ${POSSIBLE_OMP_PATHS}) +- if (EXISTS ${POSSIBLE_OMP_PATH}) +- get_filename_component(OMP_PATH ${POSSIBLE_OMP_PATH} DIRECTORY) +- endif() +- endforeach() +- +- if(NOT OMP_PATH) +- message(FATAL_ERROR "Could not find Intel OMP in standard locations; use -DOMP_PATH to specify the install location for your environment") +- endif() +- link_directories(${OMP_PATH}) +- +- set(POSSIBLE_MKL_LIB_PATHS "/opt/intel/oneapi/mkl/latest/lib/intel64/libmkl_core.so;/usr/lib/x86_64-linux-gnu/libmkl_core.so;/opt/intel/mkl/lib/intel64/libmkl_core.so") +- foreach(POSSIBLE_MKL_LIB_PATH ${POSSIBLE_MKL_LIB_PATHS}) +- if (EXISTS ${POSSIBLE_MKL_LIB_PATH}) +- get_filename_component(MKL_PATH ${POSSIBLE_MKL_LIB_PATH} DIRECTORY) +- endif() +- endforeach() +- +- set(POSSIBLE_MKL_INCLUDE_PATHS "/opt/intel/oneapi/mkl/latest/include;/usr/include/mkl;/opt/intel/mkl/include/;") +- foreach(POSSIBLE_MKL_INCLUDE_PATH ${POSSIBLE_MKL_INCLUDE_PATHS}) +- if (EXISTS ${POSSIBLE_MKL_INCLUDE_PATH}) +- set(MKL_INCLUDE_PATH ${POSSIBLE_MKL_INCLUDE_PATH}) +- endif() +- endforeach() +- if(NOT MKL_PATH) +- message(FATAL_ERROR "Could not find Intel MKL in standard locations; use -DMKL_PATH to specify the install location for your environment") +- elseif(NOT MKL_INCLUDE_PATH) +- message(FATAL_ERROR "Could not find Intel MKL in standard locations; use -DMKL_INCLUDE_PATH to specify the install location for headers for your environment") +- endif() +- if (EXISTS ${MKL_PATH}/libmkl_def.so.2) +- set(MKL_DEF_SO ${MKL_PATH}/libmkl_def.so.2) +- elseif(EXISTS ${MKL_PATH}/libmkl_def.so) +- set(MKL_DEF_SO ${MKL_PATH}/libmkl_def.so) +- else() +- message(FATAL_ERROR "Despite finding MKL, libmkl_def.so was not found in expected locations.") +- endif() +- link_directories(${MKL_PATH}) +- include_directories(${MKL_INCLUDE_PATH}) ++ find_package(MKL CONFIG REQUIRED) ++ include_directories($) ++ link_libraries($) + + # compile flags and link libraries + add_compile_options(-m64 -Wl,--no-as-needed) + if (NOT PYBIND) + link_libraries(mkl_intel_ilp64 mkl_intel_thread mkl_core iomp5 pthread m dl) +- else() +- # static linking for python so as to minimize customer dependency issues +- link_libraries( +- ${MKL_PATH}/libmkl_intel_ilp64.a +- ${MKL_PATH}/libmkl_intel_thread.a +- ${MKL_PATH}/libmkl_core.a +- ${MKL_DEF_SO} +- iomp5 +- pthread +- m +- dl +- ) + endif() + endif() + +@@ -286,7 +238,7 @@ if(MSVC) + 
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_SOURCE_DIR}/x64/Release) + else() + set(ENV{TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD} 500000000000) +- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mfma -msse2 -ftree-vectorize -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fopenmp -fopenmp-simd -funroll-loops -Wfatal-errors -DUSE_AVX2") ++ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mfma -msse2 -ftree-vectorize -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fopenmp -fopenmp-simd -funroll-loops -Wfatal-errors -DUSE_AVX2 -fno-finite-math-only -laio") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -DDEBUG") + if (NOT PYBIND) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -Ofast") +@@ -300,10 +252,6 @@ else() + endif() + + add_subdirectory(src) +-if (NOT PYBIND) +- add_subdirectory(apps) +- add_subdirectory(apps/utils) +-endif() + + if (UNIT_TEST) + enable_testing() +diff --git a/include/distance.h b/include/distance.h +index f3b1de2..d4da72e 100644 +--- a/include/distance.h ++++ b/include/distance.h +@@ -77,6 +77,7 @@ class DistanceCosineInt8 : public Distance + DistanceCosineInt8() : Distance(diskann::Metric::COSINE) + { + } ++ using Distance::compare; + DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t length) const; + }; + +@@ -86,6 +87,7 @@ class DistanceL2Int8 : public Distance + DistanceL2Int8() : Distance(diskann::Metric::L2) + { + } ++ using Distance::compare; + DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t size) const; + }; + +@@ -96,6 +98,7 @@ class AVXDistanceL2Int8 : public Distance + AVXDistanceL2Int8() : Distance(diskann::Metric::L2) + { + } ++ using Distance::compare; + DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b, uint32_t length) const; + }; + +@@ -105,6 +108,7 @@ class DistanceCosineFloat : public Distance + DistanceCosineFloat() : Distance(diskann::Metric::COSINE) + { + } ++ using Distance::compare; + DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const; + }; + +@@ -114,7 +118,7 @@ class DistanceL2Float : public Distance + DistanceL2Float() : Distance(diskann::Metric::L2) + { + } +- ++ using Distance::compare; + #ifdef _WINDOWS + DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t size) const; + #else +@@ -128,6 +132,7 @@ class AVXDistanceL2Float : public Distance + AVXDistanceL2Float() : Distance(diskann::Metric::L2) + { + } ++ using Distance::compare; + DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const; + }; + +@@ -146,6 +151,7 @@ class SlowDistanceCosineUInt8 : public Distance + SlowDistanceCosineUInt8() : Distance(diskann::Metric::COSINE) + { + } ++ using Distance::compare; + DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b, uint32_t length) const; + }; + +@@ -155,6 +161,7 @@ class DistanceL2UInt8 : public Distance + DistanceL2UInt8() : Distance(diskann::Metric::L2) + { + } ++ using Distance::compare; + DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b, uint32_t size) const; + }; + +@@ -198,6 +205,7 @@ class AVXDistanceInnerProductFloat : public Distance + AVXDistanceInnerProductFloat() : Distance(diskann::Metric::INNER_PRODUCT) + { + } ++ using Distance::compare; + DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const; + }; + +@@ -213,6 +221,7 @@ class 
AVXNormalizedCosineDistanceFloat : public Distance + AVXNormalizedCosineDistanceFloat() : Distance(diskann::Metric::COSINE) + { + } ++ using Distance::compare; + DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b, uint32_t length) const + { + // Inner product returns negative values to indicate distance. +diff --git a/include/utils.h b/include/utils.h +index d3af5c3..417af31 100644 +--- a/include/utils.h ++++ b/include/utils.h +@@ -29,6 +29,7 @@ typedef int FileHandle; + #include "types.h" + #include "tag_uint128.h" + #include ++#include + + #ifdef EXEC_ENV_OLS + #include "content_buf.h" +diff --git a/src/index.cpp b/src/index.cpp +index bf93344..9d8336c 100644 +--- a/src/index.cpp ++++ b/src/index.cpp +@@ -17,9 +17,7 @@ + #include "gperftools/malloc_extension.h" + #endif + +-#ifdef _WINDOWS + #include +-#endif + + #include "index.h" + +diff --git a/src/partition.cpp b/src/partition.cpp +index 570d45c..fb54cbf 100644 +--- a/src/partition.cpp ++++ b/src/partition.cpp +@@ -21,9 +21,7 @@ + #include "parameters.h" + #include "memory_mapper.h" + #include "partition.h" +-#ifdef _WINDOWS + #include +-#endif + + // block size for reading/ processing large files and matrices in blocks + #define BLOCK_SIZE 5000000 +diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp +index d9ad506..145a978 100644 +--- a/src/pq_flash_index.cpp ++++ b/src/pq_flash_index.cpp +@@ -8,6 +8,7 @@ + #include "pq_scratch.h" + #include "pq_flash_index.h" + #include "cosine_similarity.h" ++#include + + #ifdef _WINDOWS + #include "windows_aligned_file_reader.h" diff --git a/cpp/cmake/patches/diskann_override.json b/cpp/cmake/patches/diskann_override.json new file mode 100644 index 000000000..c83898548 --- /dev/null +++ b/cpp/cmake/patches/diskann_override.json @@ -0,0 +1,16 @@ +{ + "packages" : { + "diskann" : { + "version": "0.7.0", + "git_url": "https://github.com/microsoft/DiskANN.git", + "git_tag": "main", + "patches" : [ + { + "file" : "${current_json_dir}/diskann.diff", + "issue" : "Correct compilation issues", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/cmake/thirdparty/get_diskann.cmake b/cpp/cmake/thirdparty/get_diskann.cmake new file mode 100644 index 000000000..8cea20d5c --- /dev/null +++ b/cpp/cmake/thirdparty/get_diskann.cmake @@ -0,0 +1,49 @@ +#============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +function(find_and_configure_diskann) + set(oneValueArgs VERSION REPOSITORY PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + include(${rapids-cmake-dir}/cpm/package_override.cmake) + set(patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../patches") + rapids_cpm_package_override("${patch_dir}/diskann_override.json") + + include("${rapids-cmake-dir}/cpm/detail/package_details.cmake") + rapids_cpm_package_details(diskann version repository tag shallow exclude) + + include("${rapids-cmake-dir}/cpm/detail/generate_patch_command.cmake") + rapids_cpm_generate_patch_command(diskann ${version} patch_command) + + rapids_cpm_find(diskann ${version} + GLOBAL_TARGETS diskann + CPM_ARGS + OPTIONS + "PYBIND OFF" + "UNIT_TEST OFF" + "RESTAPI OFF" + "PORTABLE OFF") + + include("${rapids-cmake-dir}/cpm/detail/display_patch_status.cmake") + rapids_cpm_display_patch_status(diskann) + + if(NOT TARGET diskann::diskann) + target_include_directories(diskann INTERFACE "$") + add_library(diskann::diskann ALIAS diskann) + endif() +endfunction() +find_and_configure_diskann() \ No newline at end of file diff --git a/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh b/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh index a554464f6..7a42ed025 100644 --- a/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh @@ -75,6 +75,7 @@ void serialize(raft::resources const& res, d_graph.data_handle(), d_graph.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); size_t total_edges = 0; size_t num_sparse = 0; diff --git a/dependencies.yaml b/dependencies.yaml index a68a550bb..48004b321 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -476,6 +476,18 @@ dependencies: - h5py>=3.8.0 - benchmark>=1.8.2 - openblas + specific: + - output_types: conda + matrices: + - matrix: + arch: x86_64 + packages: + - mkl-devel + - libaio + - libboost-devel + - matrix: + arch: aarch64 + packages: null bench_python: common: - output_types: [conda, pyproject, requirements] diff --git a/python/cuvs_bench/cuvs_bench/config/algorithms.yaml b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml index 357517933..a5f79ff78 100644 --- a/python/cuvs_bench/cuvs_bench/config/algorithms.yaml +++ b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml @@ -49,3 +49,12 @@ hnswlib: cuvs_cagra_hnswlib: executable: CUVS_CAGRA_HNSWLIB_ANN_BENCH requires_gpu: true +diskann_memory: + executable: DISKANN_MEMORY_ANN_BENCH + requires_gpu: false +diskann_ssd: + executable: DISKANN_SSD_ANN_BENCH + requires_gpu: false +cuvs_vamana: + executable: CUVS_VAMANA_ANN_BENCH + requires_gpu: true diff --git a/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py index de05bd752..dd521bec2 100644 --- a/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py +++ b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py @@ -99,3 +99,24 @@ def faiss_gpu_ivf_pq_search(params, build_params, k, batch_size): def hnswlib_search(params, build_params, k, batch_size): if "ef" in params: return params["ef"] >= k + + +############################################################################### +# DiskANN constraints # +############################################################################### + + +def diskann_memory_build(params, dim): + ret = True + if "R" in 
params and "L_build" in params: + ret = params["R"] <= params["L_build"] + return ret + + +def diskann_ssd_build(params, dim): + ret = True + if "R" in params and "L_build" in params: + ret = params["R"] <= params["L_build"] + if "QD" in params: + ret = params["QD"] <= dim + return ret diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_vamana.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_vamana.yaml new file mode 100644 index 000000000..6e64b61ca --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_vamana.yaml @@ -0,0 +1,10 @@ +name: cuvs_vamana +groups: + base: + build: + graph_degree: [64, 96] + visited_size: [128, 256, 512] + alpha: [1.2] + search: + L_search: [10, 20, 30, 40, 50, 100, 200, 300] + num_threads: [32] diff --git a/python/cuvs_bench/cuvs_bench/config/algos/diskann_memory.yaml b/python/cuvs_bench/cuvs_bench/config/algos/diskann_memory.yaml new file mode 100644 index 000000000..faf122465 --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/config/algos/diskann_memory.yaml @@ -0,0 +1,13 @@ +name: diskann_memory +constraints: + build: cuvs_bench.config.algos.constraints.diskann_memory_build +groups: + base: + build: + R: [64, 96] + L_build: [128, 256, 384] + alpha: [1.2] + num_threads: [32] + search: + num_threads: [32] + L_search: [10, 20, 30, 40, 50, 100, 200, 300] diff --git a/python/cuvs_bench/cuvs_bench/config/algos/diskann_ssd.yaml b/python/cuvs_bench/cuvs_bench/config/algos/diskann_ssd.yaml new file mode 100644 index 000000000..40afc6fce --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/config/algos/diskann_ssd.yaml @@ -0,0 +1,13 @@ +name: diskann_ssd +constraints: + build: cuvs_bench.config.algos.constraints.diskann_ssd_build +groups: + base: + build: + R: [64, 96] + L_build: [128, 256, 384] + QD: [192] + num_threads: [32] + search: + L_search: [10, 20, 30, 40, 50, 100, 200, 300] + num_threads: [32]