From f3f159ae166426125347e7d6f8dd7210d4075179 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 13 Dec 2024 08:46:57 -0500
Subject: [PATCH] Use no-sync copy for fixed-width types in cudf::concatenate
 (#17584)

Replacing `thrust::copy` with `cudaMemcpyAsync` improves performance by up to 2x in specific cases in `cudf::concatenate`.
`thrust::copy` synchronizes on a device-to-device copy even though this is not necessary. Using `rmm::exec_policy_nosync` had no effect. Will work with CCCL to determine whether this is a bug in `thrust::copy`, since computing the return value does not require a sync.

Also moved the benchmark for concatenate from googlebench to nvbench.

Closes #17172

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/17584
---
 cpp/benchmarks/CMakeLists.txt          |   5 +-
 cpp/benchmarks/column/concatenate.cpp  | 169 -------------------------
 cpp/benchmarks/copying/concatenate.cpp |  84 ++++++++++++
 cpp/src/copying/concatenate.cu         |   6 +-
 4 files changed, 92 insertions(+), 172 deletions(-)
 delete mode 100644 cpp/benchmarks/column/concatenate.cpp
 create mode 100644 cpp/benchmarks/copying/concatenate.cpp
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 8e5ea900efa..b1456600c95 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
 endfunction()
 
 # ##################################################################################################
-# * column benchmarks -----------------------------------------------------------------------------
-ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp)
+# * copying benchmarks
+# -----------------------------------------------------------------------------
+ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp)
 
 # ##################################################################################################
 # * gather benchmark ------------------------------------------------------------------------------
diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp
deleted file mode 100644
index 51106c72137..00000000000
--- a/cpp/benchmarks/column/concatenate.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/fixture/templated_benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
-
-#include <cudf_test/column_wrapper.hpp>
-
-#include <cudf/concatenate.hpp>
-#include <cudf/table/table.hpp>
-#include <cudf/utilities/default_stream.hpp>
-
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
-
-#include <algorithm>
-#include <vector>
-
-class Concatenate : public cudf::benchmark {};
-
-template <typename T, bool Nullable>
-static void BM_concatenate(benchmark::State& state)
-{
-  cudf::size_type const num_rows = state.range(0);
-  cudf::size_type const num_cols = state.range(1);
-
-  auto input         = create_sequence_table(cycle_dtypes({cudf::type_to_id<T>()}, num_cols),
-                                     row_count{num_rows},
-                                     Nullable ? std::optional<double>{2.0 / 3.0} : std::nullopt);
-  auto input_columns = input->view();
-  std::vector<cudf::column_view> column_views(input_columns.begin(), input_columns.end());
-
-  CUDF_CHECK_CUDA(0);
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    auto result = cudf::concatenate(column_views);
-  }
-
-  state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T));
-}
-
-#define CONCAT_BENCHMARK_DEFINE(type, nullable)                             \
-  BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable)   \
-  (::benchmark::State & st) { BM_concatenate<type, nullable>(st); }         \
-  BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \
-    ->RangeMultiplier(8)                                                    \
-    ->Ranges({{1 << 6, 1 << 18}, {2, 1024}})                                \
-    ->Unit(benchmark::kMillisecond)                                         \
-    ->UseManualTime();
-
-CONCAT_BENCHMARK_DEFINE(int64_t, false)
-CONCAT_BENCHMARK_DEFINE(int64_t, true)
-
-template <typename T, bool Nullable>
-static void BM_concatenate_tables(benchmark::State& state)
-{
-  cudf::size_type const num_rows   = state.range(0);
-  cudf::size_type const num_cols   = state.range(1);
-  cudf::size_type const num_tables = state.range(2);
-
-  std::vector<std::unique_ptr<cudf::table>> tables(num_tables);
-  std::generate_n(tables.begin(), num_tables, [&]() {
-    return create_sequence_table(cycle_dtypes({cudf::type_to_id<T>()}, num_cols),
-                                 row_count{num_rows},
-                                 Nullable ? std::optional<double>{2.0 / 3.0} : std::nullopt);
-  });
-
-  // Generate table views
-  std::vector<cudf::table_view> table_views(num_tables);
-  std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable {
-    return table->view();
-  });
-
-  CUDF_CHECK_CUDA(0);
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    auto result = cudf::concatenate(table_views);
-  }
-
-  state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T));
-}
-
-#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable)                             \
-  BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable)   \
-  (::benchmark::State & st) { BM_concatenate_tables<type, nullable>(st); }         \
-  BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \
-    ->RangeMultiplier(8)                                                           \
-    ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}})                               \
-    ->Unit(benchmark::kMillisecond)                                                \
-    ->UseManualTime();
-
-CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false)
-CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true)
-
-class ConcatenateStrings : public cudf::benchmark {};
-
-template <bool Nullable>
-static void BM_concatenate_strings(benchmark::State& state)
-{
-  using column_wrapper = cudf::test::strings_column_wrapper;
-
-  auto const num_rows  = state.range(0);
-  auto const num_chars = state.range(1);
-  auto const num_cols  = state.range(2);
-
-  std::string str(num_chars, 'a');
-
-  // Create owning columns
-  std::vector<column_wrapper> columns;
-  columns.reserve(num_cols);
-  std::generate_n(std::back_inserter(columns), num_cols, [num_rows, c_str = str.c_str()]() {
-    auto iter = thrust::make_constant_iterator(c_str);
-    if (Nullable) {
-      auto count_it = thrust::make_counting_iterator(0);
-      auto valid_iter =
-        thrust::make_transform_iterator(count_it, [](auto i) { return i % 3 == 0; });
-      return column_wrapper(iter, iter + num_rows, valid_iter);
-    } else {
-      return column_wrapper(iter, iter + num_rows);
-    }
-  });
-
-  // Generate column views
-  std::vector<cudf::column_view> column_views;
-  column_views.reserve(columns.size());
-  std::transform(
-    columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) {
-      return static_cast<cudf::column_view>(col);
-    });
-
-  CUDF_CHECK_CUDA(0);
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    auto result = cudf::concatenate(column_views);
-  }
-
-  state.SetBytesProcessed(state.iterations() * num_cols * num_rows *
-                          (sizeof(int32_t) + num_chars));  // offset + chars
-}
-
-#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable)                                   \
-  BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable)   \
-  (::benchmark::State & st) { BM_concatenate_strings<nullable>(st); }               \
-  BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \
-    ->RangeMultiplier(8)                                                            \
-    ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}})                               \
-    ->Unit(benchmark::kMillisecond)                                                 \
-    ->UseManualTime();
-
-CONCAT_STRINGS_BENCHMARK_DEFINE(false)
-CONCAT_STRINGS_BENCHMARK_DEFINE(true)
diff --git a/cpp/benchmarks/copying/concatenate.cpp b/cpp/benchmarks/copying/concatenate.cpp
new file mode 100644
index 00000000000..586b479d0ad
--- /dev/null
+++ b/cpp/benchmarks/copying/concatenate.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/concatenate.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <vector>
+
+// Benchmarks cudf::concatenate over `num_cols` int64_t columns of `num_rows` rows each.
+static void bench_concatenate(nvbench::state& state)
+{
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_cols = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const nulls    = state.get_float64("nulls");  // double; an int cast truncates 0.3 to 0
+
+  auto input = create_sequence_table(
+    cycle_dtypes({cudf::type_to_id<int64_t>()}, num_cols), row_count{num_rows}, nulls);
+  auto input_columns = input->view();
+  auto column_views  = std::vector<cudf::column_view>(input_columns.begin(), input_columns.end());
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.add_global_memory_reads<int64_t>(num_rows * num_cols);
+  state.add_global_memory_writes<int64_t>(num_rows * num_cols);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); });
+}
+
+NVBENCH_BENCH(bench_concatenate)
+  .set_name("concatenate")
+  .add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144})  // rows per input column
+  .add_int64_axis("num_cols", {2, 8, 64, 512, 1024})  // number of columns concatenated
+  .add_float64_axis("nulls", {0.0, 0.3});  // presumably a null probability -- TODO confirm
+
+// Benchmarks cudf::concatenate over `num_cols` views of one random strings column.
+static void bench_concatenate_strings(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_cols  = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const nulls     = state.get_float64("nulls");  // double; an int cast truncates 0.3 to 0
+
+  data_profile const profile =
+    data_profile_builder()
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .null_probability(nulls);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
+  auto const input  = column->view();
+  auto column_views = std::vector<cudf::column_view>(num_cols, input);
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto const chars_bytes = cudf::strings_column_view(input).chars_size(stream) * num_cols;
+  state.add_global_memory_reads<int8_t>(chars_bytes);
+  state.add_global_memory_writes<int8_t>(chars_bytes);  // counted in bytes, same as the reads
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); });
+}
+
+NVBENCH_BENCH(bench_concatenate_strings)
+  .set_name("concatenate_strings")
+  .add_int64_axis("num_rows", {256, 512, 4096, 16384})  // rows in the generated strings column
+  .add_int64_axis("num_cols", {2, 8, 64, 256})  // copies of that column's view to concatenate
+  .add_int64_axis("row_width", {32, 128})  // passed to .distribution(); presumably max length
+  .add_float64_axis("nulls", {0.0, 0.3});  // value given to the profile's null_probability()
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index d8419760120..6fc49afd7ac 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -308,7 +308,11 @@ std::unique_ptr<column> for_each_concatenate(host_span<column_view const> views,
 
   auto count = 0;
   for (auto& v : views) {
-    thrust::copy(rmm::exec_policy(stream), v.begin<T>(), v.end<T>(), m_view.begin<T>() + count);
+    cudaMemcpyAsync(m_view.begin<T>() + count,
+                    v.begin<T>(),
+                    v.size() * sizeof(T),
+                    cudaMemcpyDeviceToDevice,
+                    stream.value());
     count += v.size();
   }