From f3f159ae166426125347e7d6f8dd7210d4075179 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 13 Dec 2024 08:46:57 -0500 Subject: [PATCH] Use no-sync copy for fixed-width types in cudf::concatenate (#17584) Replacing `thrust::copy` with `cudaMemcpyAsync` improves performance upto 2x in specific cases in `cudf::concatenate` The `thrust::copy` does a sync for device-to-device copy though it is not necessary. Using `rmm::exec_policy_nosync` had no effect. Will work with CCCL to determine if this is a bug in `thrust::copy` since computing the return value does not require a sync. Also moved the benchmark for concatenate from googlebench to nvbench. Closes #17172 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17584 --- cpp/benchmarks/CMakeLists.txt | 5 +- cpp/benchmarks/column/concatenate.cpp | 169 ------------------------- cpp/benchmarks/copying/concatenate.cpp | 84 ++++++++++++ cpp/src/copying/concatenate.cu | 6 +- 4 files changed, 92 insertions(+), 172 deletions(-) delete mode 100644 cpp/benchmarks/column/concatenate.cpp create mode 100644 cpp/benchmarks/copying/concatenate.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8e5ea900efa..b1456600c95 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) endfunction() # ################################################################################################## -# * column benchmarks ----------------------------------------------------------------------------- -ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp) +# * copying benchmarks +# ----------------------------------------------------------------------------- +ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp) # ################################################################################################## # * gather benchmark ------------------------------------------------------------------------------ diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp deleted file mode 100644 index 51106c72137..00000000000 --- a/cpp/benchmarks/column/concatenate.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -class Concatenate : public cudf::benchmark {}; - -template -static void BM_concatenate(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - - auto input = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - auto input_columns = input->view(); - std::vector column_views(input_columns.begin(), input_columns.end()); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T)); -} - -#define CONCAT_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_BENCHMARK_DEFINE(int64_t, false) -CONCAT_BENCHMARK_DEFINE(int64_t, true) - -template -static void BM_concatenate_tables(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - cudf::size_type const num_tables = state.range(2); - - std::vector> tables(num_tables); - std::generate_n(tables.begin(), num_tables, [&]() { - return create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - }); - - // Generate table views - std::vector table_views(num_tables); - std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable { - return table->view(); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(table_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T)); -} - -#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_tables(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false) -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true) - -class ConcatenateStrings : public cudf::benchmark {}; - -template -static void BM_concatenate_strings(benchmark::State& state) -{ - using column_wrapper = cudf::test::strings_column_wrapper; - - auto const num_rows = state.range(0); - auto const num_chars = state.range(1); - auto const num_cols = state.range(2); - - std::string str(num_chars, 'a'); - - // Create owning columns - std::vector columns; - columns.reserve(num_cols); - std::generate_n(std::back_inserter(columns), num_cols, [num_rows, c_str = str.c_str()]() { - auto iter = thrust::make_constant_iterator(c_str); - if (Nullable) { - auto count_it = thrust::make_counting_iterator(0); - auto valid_iter = - thrust::make_transform_iterator(count_it, [](auto i) { return i % 3 == 0; }); - return column_wrapper(iter, iter + num_rows, valid_iter); - } else { - return column_wrapper(iter, iter + num_rows); - } - }); - - // Generate column views - std::vector column_views; - column_views.reserve(columns.size()); - std::transform( - columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) { - return static_cast(col); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * - (sizeof(int32_t) + num_chars)); // offset + chars -} - -#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_strings(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_STRINGS_BENCHMARK_DEFINE(false) -CONCAT_STRINGS_BENCHMARK_DEFINE(true) diff --git a/cpp/benchmarks/copying/concatenate.cpp b/cpp/benchmarks/copying/concatenate.cpp new file mode 100644 index 00000000000..586b479d0ad --- /dev/null +++ b/cpp/benchmarks/copying/concatenate.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include + +#include + +#include + +static void bench_concatenate(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const nulls = static_cast(state.get_float64("nulls")); + + auto input = create_sequence_table( + cycle_dtypes({cudf::type_to_id()}, num_cols), row_count{num_rows}, nulls); + auto input_columns = input->view(); + auto column_views = std::vector(input_columns.begin(), input_columns.end()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_reads(num_rows * num_cols); + state.add_global_memory_writes(num_rows * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate) + .set_name("concatenate") + .add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144}) + .add_int64_axis("num_cols", {2, 8, 64, 512, 1024}) + .add_float64_axis("nulls", {0.0, 0.3}); + +static void bench_concatenate_strings(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const nulls = static_cast(state.get_float64("nulls")); + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .null_probability(nulls); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const input = column->view(); + + auto column_views = std::vector(num_cols, input); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const sv = cudf::strings_column_view(input); + state.add_global_memory_reads(sv.chars_size(stream) * num_cols); + state.add_global_memory_writes(sv.chars_size(stream) * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate_strings) + .set_name("concatenate_strings") + .add_int64_axis("num_rows", {256, 512, 4096, 16384}) + .add_int64_axis("num_cols", {2, 8, 64, 256}) + .add_int64_axis("row_width", {32, 128}) + .add_float64_axis("nulls", {0.0, 0.3}); diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index d8419760120..6fc49afd7ac 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -308,7 +308,11 @@ std::unique_ptr for_each_concatenate(host_span views, auto count = 0; for (auto& v : views) { - thrust::copy(rmm::exec_policy(stream), v.begin(), v.end(), m_view.begin() + count); + cudaMemcpyAsync(m_view.begin() + count, + v.begin(), + v.size() * sizeof(T), + cudaMemcpyDeviceToDevice, + stream.value()); count += v.size(); }