Skip to content

Commit

Permalink
Merge branch 'comp-headers-cleanup' of https://github.com/vuule/cudf
Browse files Browse the repository at this point in the history
…into comp-headers-cleanup
  • Loading branch information
vuule committed Dec 20, 2024
2 parents 947fbd4 + 0cf8375 commit 0a64f1c
Show file tree
Hide file tree
Showing 67 changed files with 1,856 additions and 794 deletions.
13 changes: 13 additions & 0 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ jobs:
# Please keep pr-builder as the top job here
pr-builder:
needs:
- check-nightly-ci
- changed-files
- checks
- conda-cpp-build
Expand Down Expand Up @@ -54,6 +55,18 @@ jobs:
- name: Telemetry setup
if: ${{ vars.TELEMETRY_ENABLED == 'true' }}
uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main
check-nightly-ci:
# Switch to ubuntu-latest once it defaults to a version of Ubuntu that
# provides at least Python 3.11 (see
# https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat)
runs-on: ubuntu-24.04
env:
RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- name: Check if nightly CI is passing
uses: rapidsai/shared-actions/check_nightly_success/dispatch@main
with:
repo: cudf
changed-files:
secrets: inherit
needs: telemetry-setup
Expand Down
39 changes: 24 additions & 15 deletions cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -57,62 +57,71 @@ struct MurmurHash3_x86_32 {
};

template <>
hash_value_type __device__ inline MurmurHash3_x86_32<bool>::operator()(bool const& key) const
MurmurHash3_x86_32<bool>::result_type __device__ inline MurmurHash3_x86_32<bool>::operator()(
bool const& key) const
{
return this->compute(static_cast<uint8_t>(key));
}

template <>
hash_value_type __device__ inline MurmurHash3_x86_32<float>::operator()(float const& key) const
MurmurHash3_x86_32<float>::result_type __device__ inline MurmurHash3_x86_32<float>::operator()(
float const& key) const
{
return this->compute(normalize_nans_and_zeros(key));
}

template <>
hash_value_type __device__ inline MurmurHash3_x86_32<double>::operator()(double const& key) const
MurmurHash3_x86_32<double>::result_type __device__ inline MurmurHash3_x86_32<double>::operator()(
double const& key) const
{
return this->compute(normalize_nans_and_zeros(key));
}

template <>
hash_value_type __device__ inline MurmurHash3_x86_32<cudf::string_view>::operator()(
cudf::string_view const& key) const
MurmurHash3_x86_32<cudf::string_view>::result_type
__device__ inline MurmurHash3_x86_32<cudf::string_view>::operator()(
cudf::string_view const& key) const
{
return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
key.size_bytes());
}

template <>
hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal32>::operator()(
numeric::decimal32 const& key) const
MurmurHash3_x86_32<numeric::decimal32>::result_type
__device__ inline MurmurHash3_x86_32<numeric::decimal32>::operator()(
numeric::decimal32 const& key) const
{
return this->compute(key.value());
}

template <>
hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal64>::operator()(
numeric::decimal64 const& key) const
MurmurHash3_x86_32<numeric::decimal64>::result_type
__device__ inline MurmurHash3_x86_32<numeric::decimal64>::operator()(
numeric::decimal64 const& key) const
{
return this->compute(key.value());
}

template <>
hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal128>::operator()(
numeric::decimal128 const& key) const
MurmurHash3_x86_32<numeric::decimal128>::result_type
__device__ inline MurmurHash3_x86_32<numeric::decimal128>::operator()(
numeric::decimal128 const& key) const
{
return this->compute(key.value());
}

template <>
hash_value_type __device__ inline MurmurHash3_x86_32<cudf::list_view>::operator()(
cudf::list_view const& key) const
MurmurHash3_x86_32<cudf::list_view>::result_type
__device__ inline MurmurHash3_x86_32<cudf::list_view>::operator()(
cudf::list_view const& key) const
{
CUDF_UNREACHABLE("List column hashing is not supported");
}

template <>
hash_value_type __device__ inline MurmurHash3_x86_32<cudf::struct_view>::operator()(
cudf::struct_view const& key) const
MurmurHash3_x86_32<cudf::struct_view>::result_type
__device__ inline MurmurHash3_x86_32<cudf::struct_view>::operator()(
cudf::struct_view const& key) const
{
CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/groupby/hash/compute_groupby.cu
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
d_row_equal,
probing_scheme_t{d_row_hash},
cuco::thread_scope_device,
cuco::storage<GROUPBY_WINDOW_SIZE>{},
cuco::storage<GROUPBY_BUCKET_SIZE>{},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
stream.value()};

Expand Down
6 changes: 3 additions & 3 deletions cpp/src/groupby/hash/compute_mapping_indices.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,15 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows,
__shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS];

// Shared set initialization
__shared__ cuco::window<cudf::size_type, GROUPBY_WINDOW_SIZE> windows[window_extent.value()];
__shared__ cuco::bucket<cudf::size_type, GROUPBY_BUCKET_SIZE> buckets[bucket_extent.value()];

auto raw_set = cuco::static_set_ref{
cuco::empty_key<cudf::size_type>{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
global_set.key_eq(),
probing_scheme_t{global_set.hash_function()},
cuco::thread_scope_block,
cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, decltype(window_extent)>{
window_extent, windows}};
cuco::bucket_storage_ref<cudf::size_type, GROUPBY_BUCKET_SIZE, decltype(bucket_extent)>{
bucket_extent, buckets}};
auto shared_set = raw_set.rebind_operators(cuco::insert_and_find);

auto const block = cooperative_groups::this_thread_block();
Expand Down
16 changes: 8 additions & 8 deletions cpp/src/groupby/hash/helpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace cudf::groupby::detail::hash {
CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1;

/// Number of slots per thread
CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1;
CUDF_HOST_DEVICE auto constexpr GROUPBY_BUCKET_SIZE = 1;

/// Thread block size
CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128;
Expand All @@ -48,9 +48,9 @@ using shmem_extent_t =
cuco::extent<cudf::size_type,
static_cast<cudf::size_type>(static_cast<double>(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>;

/// Number of windows needed by each shared memory hash set
CUDF_HOST_DEVICE auto constexpr window_extent =
cuco::make_window_extent<GROUPBY_CG_SIZE, GROUPBY_WINDOW_SIZE>(shmem_extent_t{});
/// Number of buckets needed by each shared memory hash set
CUDF_HOST_DEVICE auto constexpr bucket_extent =
cuco::make_bucket_extent<GROUPBY_CG_SIZE, GROUPBY_BUCKET_SIZE>(shmem_extent_t{});

using row_hash_t =
cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
Expand All @@ -75,23 +75,23 @@ using global_set_t = cuco::static_set<cudf::size_type,
row_comparator_t,
probing_scheme_t,
cudf::detail::cuco_allocator<char>,
cuco::storage<GROUPBY_WINDOW_SIZE>>;
cuco::storage<GROUPBY_BUCKET_SIZE>>;

using nullable_global_set_t = cuco::static_set<cudf::size_type,
cuco::extent<int64_t>,
cuda::thread_scope_device,
nullable_row_comparator_t,
probing_scheme_t,
cudf::detail::cuco_allocator<char>,
cuco::storage<GROUPBY_WINDOW_SIZE>>;
cuco::storage<GROUPBY_BUCKET_SIZE>>;

template <typename Op>
using hash_set_ref_t = cuco::static_set_ref<
cudf::size_type,
cuda::thread_scope_device,
row_comparator_t,
probing_scheme_t,
cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>,
cuco::bucket_storage_ref<cudf::size_type, GROUPBY_BUCKET_SIZE, cuco::bucket_extent<int64_t>>,
Op>;

template <typename Op>
Expand All @@ -100,6 +100,6 @@ using nullable_hash_set_ref_t = cuco::static_set_ref<
cuda::thread_scope_device,
nullable_row_comparator_t,
probing_scheme_t,
cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>,
cuco::bucket_storage_ref<cudf::size_type, GROUPBY_BUCKET_SIZE, cuco::bucket_extent<int64_t>>,
Op>;
} // namespace cudf::groupby::detail::hash
2 changes: 1 addition & 1 deletion cpp/src/io/comp/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
namespace cudf::io::detail {

/**
* @brief The value used for padding a data buffer such that its size will be multiple of it.
* @brief The size used for padding a data buffer's size to a multiple of the padding.
*
* Padding is necessary for input/output buffers of several compression/decompression kernels
* (inflate_kernel and nvcomp snappy). Such kernels operate on aligned data pointers, which require
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/io/fst/logical_stack.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,12 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols,
stream));
}

  // Verify the final stack operation returns to stack level 0 (i.e. the logical
  // stack ends empty); otherwise the input sequence of stack operations is malformed.
if (num_symbols_in && !supports_reset_op) {
StackOpT last_symbol = d_kv_ops_current.element(num_symbols_in - 1, stream);
CUDF_EXPECTS(last_symbol.stack_level == 0, "The logical stack is not empty!");
}

// Stable radix sort, sorting by stack level of the operations
d_kv_operations_unsigned = cub::DoubleBuffer<StackOpUnsignedT>{
reinterpret_cast<StackOpUnsignedT*>(d_kv_operations.Current()),
Expand Down
10 changes: 4 additions & 6 deletions cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1473,10 +1473,11 @@ void get_stack_context(device_span<SymbolT const> json_in,
to_stack_op::start_state,
stream);

auto stack_ops_bufsize = d_num_stack_ops.value(stream);
// Copy back to actual number of stack operations
auto num_stack_ops = d_num_stack_ops.value(stream);
// Sequence of stack symbols and their position in the original input (sparse representation)
rmm::device_uvector<StackSymbolT> stack_ops{stack_ops_bufsize, stream};
rmm::device_uvector<SymbolOffsetT> stack_op_indices{stack_ops_bufsize, stream};
rmm::device_uvector<StackSymbolT> stack_ops{num_stack_ops, stream};
rmm::device_uvector<SymbolOffsetT> stack_op_indices{num_stack_ops, stream};

// Run bracket-brace FST to retrieve starting positions of structs and lists
json_to_stack_ops_fst.Transduce(json_in.begin(),
Expand All @@ -1487,9 +1488,6 @@ void get_stack_context(device_span<SymbolT const> json_in,
to_stack_op::start_state,
stream);

// Copy back to actual number of stack operations
auto const num_stack_ops = d_num_stack_ops.value(stream);

// Stack operations with indices are converted to top of the stack for each character in the input
if (stack_behavior == stack_behavior_t::ResetOnDelimiter) {
fst::sparse_stack_op_to_top_of_stack<fst::stack_op_support::WITH_RESET_SUPPORT, StackLevelT>(
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/io/orc/dict_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)

for (size_type i = 0; i < dict.map_slots.size(); i += block_size) {
if (t + i < dict.map_slots.size()) {
auto window = dict.map_slots.begin() + t + i;
// Collect all slots from each window.
for (auto& slot : *window) {
auto bucket = dict.map_slots.begin() + t + i;
// Collect all slots from each bucket.
for (auto& slot : *bucket) {
auto const key = slot.first;
if (key != KEY_SENTINEL) {
auto loc = counter.fetch_add(1, memory_order_relaxed);
Expand Down
14 changes: 7 additions & 7 deletions cpp/src/io/orc/orc_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,16 @@ using slot_type = cuco::pair<key_type, mapped_type>;
auto constexpr map_cg_size =
1; ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset.
///< Note: Adjust insert and find loops to use `cg::tile<map_cg_size>` if increasing this.
auto constexpr window_size =
auto constexpr bucket_size =
1; ///< Number of concurrent slots (set for best performance) handled by each thread.
auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size
///< N * (1/0.7) = 1.43 to target a 70% occupancy factor.
using storage_type = cuco::aow_storage<slot_type,
window_size,
cuco::extent<std::size_t>,
cudf::detail::cuco_allocator<char>>;
using storage_type = cuco::bucket_storage<slot_type,
bucket_size,
cuco::extent<std::size_t>,
cudf::detail::cuco_allocator<char>>;
using storage_ref_type = typename storage_type::ref_type;
using window_type = typename storage_type::window_type;
using bucket_type = typename storage_type::bucket_type;
using slot_type = cuco::pair<key_type, mapped_type>;

auto constexpr KEY_SENTINEL = size_type{-1};
Expand Down Expand Up @@ -193,7 +193,7 @@ struct StripeStream {
*/
struct stripe_dictionary {
// input
device_span<window_type> map_slots; // hash map (windows) storage
device_span<bucket_type> map_slots; // hash map (buckets) storage
uint32_t column_idx = 0; // column index
size_type start_row = 0; // first row in the stripe
size_type start_rowgroup = 0; // first rowgroup in the stripe
Expand Down
20 changes: 10 additions & 10 deletions cpp/src/io/parquet/chunk_dict.cu
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ struct map_find_fn {

template <int block_size>
CUDF_KERNEL void __launch_bounds__(block_size)
populate_chunk_hash_maps_kernel(device_span<window_type> const map_storage,
populate_chunk_hash_maps_kernel(device_span<bucket_type> const map_storage,
cudf::detail::device_2dspan<PageFragment const> frags)
{
auto const col_idx = blockIdx.y;
Expand Down Expand Up @@ -239,7 +239,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)

template <int block_size>
CUDF_KERNEL void __launch_bounds__(block_size)
collect_map_entries_kernel(device_span<window_type> const map_storage,
collect_map_entries_kernel(device_span<bucket_type> const map_storage,
device_span<EncColumnChunk> chunks)
{
auto& chunk = chunks[blockIdx.x];
Expand All @@ -251,11 +251,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
if (t == 0) { new (&counter) cuda::atomic<size_type, SCOPE>{0}; }
__syncthreads();

// Iterate over all windows in the map.
// Iterate over all buckets in the map.
for (; t < chunk.dict_map_size; t += block_size) {
auto window = map_storage.data() + chunk.dict_map_offset + t;
// Collect all slots from each window.
for (auto& slot : *window) {
auto bucket = map_storage.data() + chunk.dict_map_offset + t;
// Collect all slots from each bucket.
for (auto& slot : *bucket) {
auto const key = slot.first;
if (key != KEY_SENTINEL) {
auto const loc = counter.fetch_add(1, memory_order_relaxed);
Expand All @@ -272,7 +272,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)

template <int block_size>
CUDF_KERNEL void __launch_bounds__(block_size)
get_dictionary_indices_kernel(device_span<window_type> const map_storage,
get_dictionary_indices_kernel(device_span<bucket_type> const map_storage,
cudf::detail::device_2dspan<PageFragment const> frags)
{
auto const col_idx = blockIdx.y;
Expand Down Expand Up @@ -302,7 +302,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
s_ck_start_val_idx);
}

void populate_chunk_hash_maps(device_span<window_type> const map_storage,
void populate_chunk_hash_maps(device_span<bucket_type> const map_storage,
cudf::detail::device_2dspan<PageFragment const> frags,
rmm::cuda_stream_view stream)
{
Expand All @@ -311,7 +311,7 @@ void populate_chunk_hash_maps(device_span<window_type> const map_storage,
<<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_storage, frags);
}

void collect_map_entries(device_span<window_type> const map_storage,
void collect_map_entries(device_span<bucket_type> const map_storage,
device_span<EncColumnChunk> chunks,
rmm::cuda_stream_view stream)
{
Expand All @@ -320,7 +320,7 @@ void collect_map_entries(device_span<window_type> const map_storage,
<<<chunks.size(), block_size, 0, stream.value()>>>(map_storage, chunks);
}

void get_dictionary_indices(device_span<window_type> const map_storage,
void get_dictionary_indices(device_span<bucket_type> const map_storage,
cudf::detail::device_2dspan<PageFragment const> frags,
rmm::cuda_stream_view stream)
{
Expand Down
Loading

0 comments on commit 0a64f1c

Please sign in to comment.