Skip to content

Commit

Permalink
Use no-sync copy for fixed-width types in cudf::concatenate (#17584)
Browse files Browse the repository at this point in the history
Replacing `thrust::copy` with `cudaMemcpyAsync` improves performance by up to 2x in specific cases in `cudf::concatenate`.
`thrust::copy` performs a sync for device-to-device copies even though it is not necessary, and using `rmm::exec_policy_nosync` had no effect. We will work with CCCL to determine whether this is a bug in `thrust::copy`, since computing its return value does not require a sync.

Also moved the benchmark for concatenate from googlebench to nvbench.

Closes #17172

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #17584
  • Loading branch information
davidwendt authored Dec 13, 2024
1 parent 48aa08f commit f3f159a
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 172 deletions.
5 changes: 3 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
endfunction()

# ##################################################################################################
# * column benchmarks -----------------------------------------------------------------------------
ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp)
# * copying benchmarks
# -----------------------------------------------------------------------------
ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp)

# ##################################################################################################
# * gather benchmark ------------------------------------------------------------------------------
Expand Down
169 changes: 0 additions & 169 deletions cpp/benchmarks/column/concatenate.cpp

This file was deleted.

84 changes: 84 additions & 0 deletions cpp/benchmarks/copying/concatenate.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/concatenate.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

#include <vector>

/**
 * @brief Benchmarks cudf::concatenate over a set of int64 columns.
 *
 * Reads the `num_rows`, `num_cols` and `nulls` axes from the nvbench state, builds a table of
 * `num_cols` int64 columns with `num_rows` rows each via create_sequence_table, and measures
 * concatenating all of those columns into a single column.
 *
 * @param state nvbench state that supplies the axis values and records the measurements
 */
static void bench_concatenate(nvbench::state& state)
{
  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const num_cols = static_cast<cudf::size_type>(state.get_int64("num_cols"));
  // Keep the null probability as a floating-point value. Converting it to cudf::size_type
  // truncates the 0.3 axis value to 0, which silently disables null generation.
  auto const nulls = state.get_float64("nulls");

  auto input = create_sequence_table(
    cycle_dtypes({cudf::type_to_id<int64_t>()}, num_cols), row_count{num_rows}, nulls);
  auto input_columns = input->view();
  auto column_views  = std::vector<cudf::column_view>(input_columns.begin(), input_columns.end());

  auto stream = cudf::get_default_stream();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
  // Widen before multiplying so the element count cannot overflow a 32-bit size_type
  // if larger axis values are added later.
  auto const total_elements = static_cast<int64_t>(num_rows) * num_cols;
  state.add_global_memory_reads<int64_t>(total_elements);
  state.add_global_memory_writes<int64_t>(total_elements);

  // The concatenated column is a local of the lambda, so it is destroyed at the end of
  // every measured invocation.
  state.exec(nvbench::exec_tag::sync,
             [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); });
}

// Registers bench_concatenate with nvbench under the name "concatenate".
// The axis names below must match the strings that bench_concatenate passes to
// state.get_int64 ("num_rows", "num_cols") and state.get_float64 ("nulls").
NVBENCH_BENCH(bench_concatenate)
.set_name("concatenate")
.add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144})
.add_int64_axis("num_cols", {2, 8, 64, 512, 1024})
.add_float64_axis("nulls", {0.0, 0.3});

/**
 * @brief Benchmarks cudf::concatenate over repeated views of one strings column.
 *
 * Reads the `num_rows`, `num_cols`, `row_width` and `nulls` axes from the nvbench state,
 * creates a single random strings column with `num_rows` rows, and measures concatenating
 * `num_cols` copies of that column's view.
 *
 * @param state nvbench state that supplies the axis values and records the measurements
 */
static void bench_concatenate_strings(nvbench::state& state)
{
  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const num_cols  = static_cast<cudf::size_type>(state.get_int64("num_cols"));
  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
  // Keep the null probability as a floating-point value. Converting it to cudf::size_type
  // truncates the 0.3 axis value to 0, which silently disables null generation.
  auto const nulls = state.get_float64("nulls");

  // NOTE(review): the (0, row_width) arguments are presumably the bounds of the generated
  // string lengths -- confirm against data_profile_builder::distribution.
  data_profile const profile =
    data_profile_builder()
      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
      .null_probability(nulls);
  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
  auto const input  = column->view();

  // Every entry is the same view, so the input data is generated only once.
  auto column_views = std::vector<cudf::column_view>(num_cols, input);

  auto stream = cudf::get_default_stream();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
  auto const sv = cudf::strings_column_view(input);
  // Query the character count once and reuse it. Reads and writes are both accounted in
  // int8_t units because the same number of character bytes is read and written; using a
  // wider element type for the writes would overstate the bytes written.
  auto const total_chars = sv.chars_size(stream) * num_cols;
  state.add_global_memory_reads<int8_t>(total_chars);
  state.add_global_memory_writes<int8_t>(total_chars);

  // The concatenated column is a local of the lambda, so it is destroyed at the end of
  // every measured invocation.
  state.exec(nvbench::exec_tag::sync,
             [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); });
}

// Registers bench_concatenate_strings with nvbench under the name "concatenate_strings".
// The axis names below must match the strings that bench_concatenate_strings passes to
// state.get_int64 ("num_rows", "num_cols", "row_width") and state.get_float64 ("nulls").
NVBENCH_BENCH(bench_concatenate_strings)
.set_name("concatenate_strings")
.add_int64_axis("num_rows", {256, 512, 4096, 16384})
.add_int64_axis("num_cols", {2, 8, 64, 256})
.add_int64_axis("row_width", {32, 128})
.add_float64_axis("nulls", {0.0, 0.3});
6 changes: 5 additions & 1 deletion cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,11 @@ std::unique_ptr<column> for_each_concatenate(host_span<column_view const> views,

auto count = 0;
for (auto& v : views) {
thrust::copy(rmm::exec_policy(stream), v.begin<T>(), v.end<T>(), m_view.begin<T>() + count);
cudaMemcpyAsync(m_view.begin<T>() + count,
v.begin<T>(),
v.size() * sizeof(T),
cudaMemcpyDeviceToDevice,
stream.value());
count += v.size();
}

Expand Down

0 comments on commit f3f159a

Please sign in to comment.