diff --git a/WORKSPACE b/WORKSPACE index 4cf968237..ba74f8c23 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -54,6 +54,46 @@ http_archive( url = "https://github.com/sewenew/redis-plus-plus/archive/refs/tags/1.2.3.zip", ) +http_archive( + name = "hkv", + build_file = "//build_deps/toolchains/hkv:hkv.BUILD", + patch_cmds = [ + """sed -i.bak '1772i\\'$'\\n ThrustAllocator thrust_allocator_;\\n' include/merlin_hashtable.cuh""", + """sed -i.bak '225i\\'$'\\n thrust_allocator_.set_allocator(allocator_);\\n' include/merlin_hashtable.cuh""", + "sed -i.bak 's/thrust::sort_by_key(thrust_par.on(stream)/thrust::sort_by_key(thrust_par(thrust_allocator_).on(stream)/' include/merlin_hashtable.cuh", + "sed -i.bak 's/reduce(thrust_par.on(stream)/reduce(thrust_par(thrust_allocator_).on(stream)/' include/merlin_hashtable.cuh", + """sed -i.bak '125i\\'$'\\n template \\n' include/merlin/allocator.cuh""", + """sed -i.bak '126i\\'$'\\n struct ThrustAllocator : thrust::device_malloc_allocator {\\n' include/merlin/allocator.cuh""", + """sed -i.bak '127i\\'$'\\n public:\\n' include/merlin/allocator.cuh""", + """sed -i.bak '128i\\'$'\\n typedef thrust::device_malloc_allocator super_t;\\n' include/merlin/allocator.cuh""", + """sed -i.bak '129i\\'$'\\n typedef typename super_t::pointer pointer;\\n' include/merlin/allocator.cuh""", + """sed -i.bak '130i\\'$'\\n typedef typename super_t::size_type size_type;\\n' include/merlin/allocator.cuh""", + """sed -i.bak '131i\\'$'\\n public:\\n' include/merlin/allocator.cuh""", + """sed -i.bak '132i\\'$'\\n pointer allocate(size_type n) {\\n' include/merlin/allocator.cuh""", + """sed -i.bak '133i\\'$'\\n void* ptr = nullptr;\\n' include/merlin/allocator.cuh""", + """sed -i.bak '134i\\'$'\\n MERLIN_CHECK(\\n' include/merlin/allocator.cuh""", + """sed -i.bak '135i\\'$'\\n allocator_ != nullptr,\\n' include/merlin/allocator.cuh""", + """sed -i.bak '136i\\'$'\\n "[ThrustAllocator] set_allocator should be called in advance!");\\n' include/merlin/allocator.cuh""", + """sed -i.bak '137i\\'$'\\n allocator_->alloc(MemoryType::Device, &ptr, sizeof(T) * n);\\n' include/merlin/allocator.cuh""", + """sed -i.bak '138i\\'$'\\n return pointer(reinterpret_cast(ptr));\\n' include/merlin/allocator.cuh""", + """sed -i.bak '139i\\'$'\\n }\\n' include/merlin/allocator.cuh""", + """sed -i.bak '140i\\'$'\\n void deallocate(pointer p, size_type n) {\\n' include/merlin/allocator.cuh""", + """sed -i.bak '141i\\'$'\\n MERLIN_CHECK(\\n' include/merlin/allocator.cuh""", + """sed -i.bak '142i\\'$'\\n allocator_ != nullptr,\\n' include/merlin/allocator.cuh""", + """sed -i.bak '143i\\'$'\\n "[ThrustAllocator] set_allocator should be called in advance!");\\n' include/merlin/allocator.cuh""", + """sed -i.bak '144i\\'$'\\n allocator_->free(MemoryType::Device, reinterpret_cast(p.get()));\\n' include/merlin/allocator.cuh""", + """sed -i.bak '145i\\'$'\\n }\\n' include/merlin/allocator.cuh""", + """sed -i.bak '146i\\'$'\\n void set_allocator(BaseAllocator* allocator) { allocator_ = allocator; }\\n' include/merlin/allocator.cuh""", + """sed -i.bak '147i\\'$'\\n public:\\n' include/merlin/allocator.cuh""", + """sed -i.bak '148i\\'$'\\n BaseAllocator* allocator_ = nullptr;\\n' include/merlin/allocator.cuh""", + """sed -i.bak '149i\\'$'\\n };\\n' include/merlin/allocator.cuh""", + """sed -i.bak '20i\\'$'\\n #include \\n' include/merlin/allocator.cuh""", + ], + sha256 = "f8179c445a06a558262946cda4d8ae7252d313e73f792586be9b1bc0c993b1cf", + strip_prefix = "HierarchicalKV-0.1.0-beta.6", + url = 
"https://github.com/NVIDIA-Merlin/HierarchicalKV/archive/refs/tags/v0.1.0-beta.6.tar.gz", +) + tf_configure( name = "local_config_tf", ) diff --git a/build_deps/toolchains/gpu/cuda_configure.bzl b/build_deps/toolchains/gpu/cuda_configure.bzl index 2257b54dc..25c968672 100644 --- a/build_deps/toolchains/gpu/cuda_configure.bzl +++ b/build_deps/toolchains/gpu/cuda_configure.bzl @@ -68,8 +68,8 @@ _DEFAULT_CUDA_COMPUTE_CAPABILITIES = { _DEFAULT_CUDA_COMPUTE_CAPABILITIES.update( {"11.{}".format(v): [ - "6.0", - "6.1", + # "6.0", + # "6.1", "7.0", "7.5", "8.0", diff --git a/build_deps/toolchains/hkv/BUILD b/build_deps/toolchains/hkv/BUILD new file mode 100644 index 000000000..e69de29bb diff --git a/build_deps/toolchains/hkv/hkv.BUILD b/build_deps/toolchains/hkv/hkv.BUILD new file mode 100644 index 000000000..6250a9fec --- /dev/null +++ b/build_deps/toolchains/hkv/hkv.BUILD @@ -0,0 +1,18 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configured") + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "hkv", + hdrs = glob([ + "include/merlin/core_kernels/*.cuh", + "include/merlin/*.cuh", + "include/merlin_hashtable.cuh", + "include/merlin_localfile.hpp", + ]), + copts = [ + "-Ofast", + ], + include_prefix = "include", + includes = ["include"], +) \ No newline at end of file diff --git a/docs/api_docs/tfra/dynamic_embedding.md b/docs/api_docs/tfra/dynamic_embedding.md index 860dfc5be..51bd79263 100644 --- a/docs/api_docs/tfra/dynamic_embedding.md +++ b/docs/api_docs/tfra/dynamic_embedding.md @@ -47,6 +47,12 @@ Export dynamic_embedding APIs. [`class ModelMode`](../tfra/dynamic_embedding/ModelMode.md): The global config of model modes. +[`class HkvHashTable`](../tfra/dynamic_embedding/HkvHashTable.md): A generic mutable hash table implementation. + +[`class HkvHashTableConfig`](../tfra/dynamic_embedding/HkvHashTableConfig.md): HkvHashTableConfig config init_capacity, max_capacity, max_hbm_for_values of HkvHashTable + +[`class HkvHashTableCreator`](../tfra/dynamic_embedding/HkvHashTableCreator.md): A generic KV table creator. + [`class RedisTable`](../tfra/dynamic_embedding/RedisTable.md): A generic mutable hash table implementation. [`class RedisTableConfig`](../tfra/dynamic_embedding/RedisTableConfig.md): RedisTableConfig config json file for connecting Redis service and diff --git a/docs/api_docs/tfra/dynamic_embedding/CuckooHashTable.md b/docs/api_docs/tfra/dynamic_embedding/CuckooHashTable.md index 7293afae5..1bef31687 100644 --- a/docs/api_docs/tfra/dynamic_embedding/CuckooHashTable.md +++ b/docs/api_docs/tfra/dynamic_embedding/CuckooHashTable.md @@ -53,7 +53,8 @@ remove method. It does not support initialization via the init method. ```python table = tfra.dynamic_embedding.CuckooHashTable(key_dtype=tf.string, value_dtype=tf.int64, - default_value=-1) + default_value=-1, + device=['/GPU:0']) sess.run(table.insert(keys, values)) out = table.lookup(query_keys) print(out.eval()) @@ -106,6 +107,10 @@ A `CuckooHashTable` object. * `ValueError`: If checkpoint is True and no name was specified. +## `Important update!!` + +We have made updates to the underlying implementation of the CuckooHashTable. The original CPU table remains unchanged, but the GPU table now uses the HKV implementation instead of nvhash. To ensure interface consistency, the init_capacity and max_capacity of HKV will be set to the init_size value you pass in. 
It is important to note that, with this setting, the GPU hash table will not automatically resize and its final capacity will be the same as the init_size. The max_hbm_for_values parameter of HKV will be set to a number large enough to ensure that all of your data is stored in the GPU table. Additionally, HKV requires a GPU compute capability of 8.0 or above. For more detailed information, please refer to the HKV documentation. + + ## Properties diff --git a/docs/api_docs/tfra/dynamic_embedding/CuckooHashTableCreator.md b/docs/api_docs/tfra/dynamic_embedding/CuckooHashTableCreator.md index deb8b588a..db8afb080 100644 --- a/docs/api_docs/tfra/dynamic_embedding/CuckooHashTableCreator.md +++ b/docs/api_docs/tfra/dynamic_embedding/CuckooHashTableCreator.md @@ -41,13 +41,10 @@ class for creating the real KV table backend(TF resource). #### Example usage: - +Since CuckooHashTableConfig contains no parameters and its defaults are sufficient, setting the `saver` parameter is all that is needed. ```python -redis_config1=tfra.dynamic_embedding.RedisTableConfig( - redis_config_abs_dir="xx/yy.json" -) -redis_creator1=tfra.dynamic_embedding.RedisTableCreator(redis_config1) +cuckoo_creator=tfra.dynamic_embedding.CuckooHashTableCreator(saver=de.FileSystemSaver()) ```
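Referring back to the `Important update` note for `CuckooHashTable` above, a minimal sketch of a GPU-placed table (the dtypes and `init_size` value are illustrative assumptions, not recommendations):

```python
import tensorflow as tf
import tensorflow_recommenders_addons as tfra

# On GPU the table is now backed by HKV; init_size becomes both the
# init_capacity and the max_capacity of the underlying HKV table, so the
# table will not resize beyond this value.
table = tfra.dynamic_embedding.CuckooHashTable(
    key_dtype=tf.int64,
    value_dtype=tf.float32,
    default_value=-1,
    init_size=1024 * 1024,
    device=['/GPU:0'])
```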

__init__

diff --git a/docs/api_docs/tfra/dynamic_embedding/HkvHashTable.md b/docs/api_docs/tfra/dynamic_embedding/HkvHashTable.md new file mode 100644 index 000000000..04022b0b3 --- /dev/null +++ b/docs/api_docs/tfra/dynamic_embedding/HkvHashTable.md @@ -0,0 +1,381 @@ +
+ + + + + + + + + + + + + + + +
+ +# tfra.dynamic_embedding.HkvHashTable + + + + + +
+ + + View source on GitHub + +
+
+
+
+
+ + + +## Class `HkvHashTable` + +A generic mutable hash table implementation. + +HkvHashTable is a multi-level cache hash table that allows storing values simultaneously in both GPU and CPU memory. It enables efficient utilization of training resources while ensuring high-performance lookups and inserts. This greatly expands the capacity of the hash table, making it suitable for more complex training tasks. For more detailed information about HierarchicalKV, please refer to [HierarchicalKV +](https://github.com/NVIDIA-Merlin/HierarchicalKV). + + +#### Environment requirements + +* CUDA version >= 11.2 +* NVIDIA GPU with compute capability 8.0, 8.6, 8.7 or 9.0 +* GCC supporting the C++17 standard or later. + + +#### Example usage: + + + +```python +table = tfra.dynamic_embedding.HkvHashTable(key_dtype=tf.string, + value_dtype=tf.int64, + default_value=-1) +sess.run(table.insert(keys, values)) +out = table.lookup(query_keys) +print(out.eval()) +``` + +
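The constructor also exposes the capacity-related arguments documented under `__init__` below; a minimal sketch (the dtypes and capacity values are illustrative assumptions, not recommended settings):

```python
import tensorflow as tf
import tensorflow_recommenders_addons as tfra

# The table starts at init_capacity, never grows beyond max_capacity, and
# keeps at most max_hbm_for_values bytes of values in GPU memory (HBM);
# the remaining values are kept in host memory.
table = tfra.dynamic_embedding.HkvHashTable(
    key_dtype=tf.int64,
    value_dtype=tf.float32,
    default_value=-1,
    init_capacity=1024 * 1024,
    max_capacity=2 * 1024 * 1024,
    max_hbm_for_values=1024 * 1024 * 1024)  # 1 GiB
```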

__init__

+ +View source + +``` python +KHkvHashTableInitCapacity = 1024 * 1024 +KHkvHashTableMaxCapacity = 1024 * 1024 +KHkvHashTableMaxHbmForValues = 1024 * 1024 * 1024 + + +__init__( + key_dtype, + value_dtype, + default_value, + name='HkvHashTable', + checkpoint=(True), + init_capacity=KHkvHashTableInitCapacity, + max_capacity=KHkvHashTableMaxCapacity, + max_hbm_for_values=KHkvHashTableMaxHbmForValues, + config=None, + device='', +) +``` + +Creates an empty `HkvHashTable` object. + +Creates a table whose key and value types are specified by key_dtype +and value_dtype, respectively. + +#### Args: + + +* `key_dtype`: the type of the key tensors. +* `value_dtype`: the type of the value tensors. +* `default_value`: The value to use if a key is missing in the table. +* `name`: A name for the operation (optional). +* `checkpoint`: if True, the contents of the table are saved to and restored + from checkpoints. If `shared_name` is empty for a checkpointed table, it + is shared using the table node name. +* `init_capacity`: The initial capacity of the hash table. +* `max_capacity`: The maximum capacity of the hash table. +* `max_hbm_for_values`: The maximum HBM capacity occupied by the values of the hash table, measured in bytes. +* `config`: an `HkvHashTableConfig` object (optional); its fields correspond to the capacity arguments above. +* `device`: the device on which to place the table, e.g. `'/GPU:0'` (optional). + + +#### Returns: + +A `HkvHashTable` object. + + + +#### Raises: + + +* `ValueError`: If checkpoint is True and no name was specified. + + + +## Properties +

key_dtype

+ +The table key dtype. + + +

name

+ +The name of the table. + + +

resource_handle

+ +Returns the resource handle associated with this Resource. + + +

value_dtype

+ +The table value dtype. + + + + +## Methods + +

__getitem__

+ +``` python +__getitem__(keys) +``` + +Looks up `keys` in a table, outputs the corresponding values. + + +

accum

+ +View source + +``` python +accum( + keys, + values_or_deltas, + exists, + name=None +) +``` + +Associates `keys` with `values`. + + +#### Args: + + +* `keys`: Keys to accumulate. Can be a tensor of any shape. + Must match the table's key type. +* `values_or_deltas`: Values to be associated with keys. Must be a tensor of + the same shape as `keys` and match the table's value type. +* `exists`: A bool type tensor that indicates whether the keys already exist or not. + Must be a tensor of the same shape as `keys`. +* `name`: A name for the operation (optional). + + +#### Returns: + +The created Operation. + + + +#### Raises: + + +* `TypeError`: when `keys` or `values` doesn't match the table data + types. + 
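A short sketch of the `accum` workflow, assuming an int64-keyed, float32-valued table named `table` (keys and deltas are illustrative); `exists` is obtained from `lookup` with `return_exists=True`:

```python
import tensorflow as tf

keys = tf.constant([1, 2, 3], dtype=tf.int64)
deltas = tf.constant([[0.1], [0.2], [0.3]], dtype=tf.float32)  # same value shape as the table

# First query which keys are already present, then accumulate the deltas,
# passing `exists` so the table knows which keys were already stored.
_, exists = table.lookup(keys, return_exists=True)
accum_op = table.accum(keys, values_or_deltas=deltas, exists=exists)
```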

clear

+ +View source + +``` python +clear(name=None) +``` + +Clear all keys and values in the table. + + +#### Args: + + +* `name`: A name for the operation (optional). + + +#### Returns: + +The created Operation. + +

export

+ +View source + +``` python +export(name=None) +``` + +Returns tensors of all keys and values in the table. + + +#### Args: + + +* `name`: A name for the operation (optional). + + +#### Returns: + +A pair of tensors with the first tensor containing all keys and the + second tensors containing all values in the table. + + +

insert

+ +View source + +``` python +insert( + keys, + values, + name=None +) +``` + +Associates `keys` with `values`. + + +#### Args: + + +* `keys`: Keys to insert. Can be a tensor of any shape. Must match the table's + key type. +* `values`: Values to be associated with keys. Must be a tensor of the same + shape as `keys` and match the table's value type. +* `name`: A name for the operation (optional). + + +#### Returns: + +The created Operation. + + + +#### Raises: + + +* `TypeError`: when `keys` or `values` doesn't match the table data + types. + +

lookup

+ +View source + +``` python +lookup( + keys, + dynamic_default_values=None, + return_exists=(False), + name=None +) +``` + +Looks up `keys` in a table, outputs the corresponding values. + +The `default_value` is used for keys not present in the table. + +#### Args: + + +* `keys`: Keys to look up. Can be a tensor of any shape. Must match the + table's key_dtype. +* `dynamic_default_values`: The values to use if a key is missing in the + table. If None (by default), the static default_value + `self._default_value` will be used. +* `return_exists`: if True, will return an additional Tensor which indicates + whether or not the keys exist in the table. +* `name`: A name for the operation (optional). + + +#### Returns: + +A tensor containing the values in the same shape as `keys` using the + table's value type. + +* `exists`: A bool type Tensor of the same shape as `keys` which indicates + whether the keys exist in the table. + Only provided if `return_exists` is True. + + +#### Raises: + + +* `TypeError`: when `keys` do not match the table data types. + 

remove

+ +View source + +``` python +remove( + keys, + name=None +) +``` + +Removes `keys` and their associated values from the table. + +If a key is not present in the table, it is silently ignored. + +#### Args: + + +* `keys`: Keys to remove. Can be a tensor of any shape. Must match the table's + key type. +* `name`: A name for the operation (optional). + + +#### Returns: + +The created Operation. + + + +#### Raises: + + +* `TypeError`: when `keys` do not match the table data types. + 

size

+ +View source + +``` python +size(name=None) +``` + +Compute the number of elements in this table. + + +#### Args: + + +* `name`: A name for the operation (optional). + + +#### Returns: + +A scalar tensor containing the number of elements in this table. diff --git a/docs/api_docs/tfra/dynamic_embedding/HkvHashTableConfig.md b/docs/api_docs/tfra/dynamic_embedding/HkvHashTableConfig.md new file mode 100644 index 000000000..4bba1b907 --- /dev/null +++ b/docs/api_docs/tfra/dynamic_embedding/HkvHashTableConfig.md @@ -0,0 +1,66 @@ +
+ + + +
+ +# tfra.dynamic_embedding.HkvHashTableConfig + + + + + +
+ + + View source on GitHub + +
+
+
+
+
+ + + +## Class `HkvHashTableConfig` + + + + + + + + +

__init__

+ +View source + +``` python + +KHkvHashTableInitCapacity = 1024 * 1024 +KHkvHashTableMaxCapacity = 1024 * 1024 +KHkvHashTableMaxHbmForValues = 1024 * 1024 * 1024 + +__init__( + init_capacity=KHkvHashTableInitCapacity, + max_capacity=KHkvHashTableMaxCapacity, + max_hbm_for_values=KHkvHashTableMaxHbmForValues +) +``` + +HkvHashTableConfig contains three parameters that configure the hash table; they all have default values. + +#### Args: + + +* `init_capacity`: The initial capacity of the hash table. +* `max_capacity`: The maximum capacity of the hash table. +* `max_hbm_for_values`: The maximum HBM for values, in bytes. + +#### Configuration Suggestions + +* `Pure HBM mode`: set the max_hbm_for_values >= sizeof(V) * dim * max_capacity +* `HBM + HMEM mode`: set the max_hbm_for_values < sizeof(V) * dim * max_capacity +* `Pure HMEM mode`: set the max_hbm_for_values = 0 +* If max_capacity == init_capacity, the total HBM + HMEM consumption is sizeof(V) * dim * max_capacity (a worked example follows below). diff --git a/docs/api_docs/tfra/dynamic_embedding/HkvHashTableCreator.md b/docs/api_docs/tfra/dynamic_embedding/HkvHashTableCreator.md new file mode 100644 index 000000000..e318ad90d --- /dev/null +++ b/docs/api_docs/tfra/dynamic_embedding/HkvHashTableCreator.md @@ -0,0 +1,118 @@ +
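To make the `HkvHashTableConfig` suggestions above concrete, a small worked example with assumed numbers (float32 values and an embedding dim of 64):

```python
import numpy as np

value_bytes = np.dtype(np.float32).itemsize  # sizeof(V) = 4
dim = 64                                     # embedding dimension (assumed)
max_capacity = 1024 * 1024

values_total = value_bytes * dim * max_capacity  # 268435456 bytes = 256 MiB

pure_hbm_mode = values_total       # max_hbm_for_values >= total keeps every value in HBM
hbm_hmem_mode = values_total // 2  # anything below the total spills the rest to host memory
pure_hmem_mode = 0                 # all values live in host memory
```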
+ + + + + +
+ +# tfra.dynamic_embedding.HkvHashTableCreator + + + + + +
+ + + View source on GitHub + +
+
+
+
+
+ + + +## Class `HkvHashTableCreator` + + A generic KV table creator. + + + + + + The KV table instance will be created by the create function with the given config. +A config class for the specific table backend should be initialized +before calling the creator function. + The KVCreator class instance is then passed to the Variable +class, which creates the real KV table backend (TF resource). + +#### Example usage: + + + +```python +hkv_config=tfra.dynamic_embedding.HkvHashTableConfig( + init_capacity=1024 * 1024, + max_capacity=1024 * 1024, + max_hbm_for_values=0, +) +hkv_creator=tfra.dynamic_embedding.HkvHashTableCreator(config=hkv_config) +``` + +
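The creator is then handed to the dynamic embedding variable that owns the table; a minimal sketch, assuming the `kv_creator` argument of `tfra.dynamic_embedding.get_variable` (the variable name and dimension are illustrative):

```python
import tensorflow as tf
import tensorflow_recommenders_addons as tfra

# The variable builds the real HKV-backed table through hkv_creator.create(...).
embeddings = tfra.dynamic_embedding.get_variable(
    name="user_embeddings",
    key_dtype=tf.int64,
    value_dtype=tf.float32,
    dim=64,
    kv_creator=hkv_creator)
```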

__init__

+ +View source + +``` python +__init__(config=None) +``` + +Initialize self. See help(type(self)) for accurate signature. + + + + +## Methods + +

create

+ +View source + +``` python +create( + key_dtype=None, + value_dtype=None, + default_value=None, + name=None, + checkpoint=None, + init_size=KHkvHashTableInitCapacity, + config=None, + device=None, + shard_saveable_object_fn=None, +) +``` + + + + +

get_config

+ +View source + +``` python +get_config() + +#return as follow + +config = { + 'key_dtype': self.key_dtype, + 'value_dtype': self.value_dtype, + 'default_value': self.default_value.numpy(), + 'name': self.name, + 'checkpoint': self.checkpoint, + 'init_capacity': self.init_capacity, + 'max_capacity': self.max_capacity, + 'max_hbm_for_values': self.max_hbm_for_values + 'config': self.config, + 'device': self.device, +} +``` + + + + + + diff --git a/tensorflow_recommenders_addons/dynamic_embedding/__init__.py b/tensorflow_recommenders_addons/dynamic_embedding/__init__.py index 6f601ea1c..3269c9171 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/__init__.py +++ b/tensorflow_recommenders_addons/dynamic_embedding/__init__.py @@ -18,6 +18,9 @@ 'CuckooHashTable', 'CuckooHashTableConfig', 'CuckooHashTableCreator', + 'HkvHashTable', + 'HkvHashTableConfig', + 'HkvHashTableCreator', 'RedisTable', 'RedisTableConfig', 'RedisTableCreator', @@ -51,10 +54,13 @@ from tensorflow_recommenders_addons.dynamic_embedding.python.ops import math_ops as math from tensorflow_recommenders_addons.dynamic_embedding.python.ops import data_flow_ops as data_flow from tensorflow_recommenders_addons.dynamic_embedding.python.ops.dynamic_embedding_creator import ( - KVCreator, CuckooHashTableConfig, CuckooHashTableCreator, RedisTableConfig, + KVCreator, CuckooHashTableConfig, CuckooHashTableCreator, + HkvHashTableConfig, HkvHashTableCreator, RedisTableConfig, RedisTableCreator, FileSystemSaver) from tensorflow_recommenders_addons.dynamic_embedding.python.ops.cuckoo_hashtable_ops import ( CuckooHashTable,) +from tensorflow_recommenders_addons.dynamic_embedding.python.ops.hkv_hashtable_ops import ( + HkvHashTable,) from tensorflow_recommenders_addons.dynamic_embedding.python.ops.redis_table_ops import ( RedisTable,) from tensorflow_recommenders_addons.dynamic_embedding.python.ops.dynamic_embedding_ops import ( diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD b/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD index a2e030135..0879258bb 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD @@ -4,7 +4,7 @@ load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configure package(default_visibility = ["//visibility:public"]) -load("//tensorflow_recommenders_addons:tensorflow_recommenders_addons.bzl", "custom_op_library", "if_cuda_for_tf_serving") +load("//tensorflow_recommenders_addons:tensorflow_recommenders_addons.bzl", "custom_cuda_op_library", "custom_op_library", "if_cuda_for_tf_serving") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("@local_config_tf//:build_defs.bzl", "FOR_TF_SERVING") @@ -17,18 +17,6 @@ custom_op_library( "utils/utils.h", "utils/types.h", ] + glob(["kernels/lookup_impl/lookup_table_op_cpu*"]), - cuda_deps = if_cuda_for_tf_serving( - ["//tensorflow_recommenders_addons/dynamic_embedding/core/lib/nvhash:nvhashtable"], - [], - FOR_TF_SERVING, - ), - cuda_srcs = if_cuda([ - "kernels/cuckoo_hashtable_op.h", - "kernels/cuckoo_hashtable_op_gpu.h", - "kernels/cuckoo_hashtable_op_gpu.cu.cc", - "utils/utils.h", - "utils/types.h", - ] + glob(["kernels/lookup_impl/lookup_table_op_gpu*"])), deps = [ "//tensorflow_recommenders_addons/dynamic_embedding/core/lib/cuckoo:cuckoohash", ], @@ -79,11 +67,6 @@ custom_op_library( "ops/math_ops.cc", "utils/utils.h", ], - cuda_deps = if_cuda_for_tf_serving( - 
["//tensorflow_recommenders_addons/dynamic_embedding/core/lib/nvhash:nvhashtable"], - [], - FOR_TF_SERVING, - ), cuda_srcs = [ "kernels/segment_reduction_ops.h", "kernels/segment_reduction_ops_gpu.cu.cc", @@ -98,11 +81,6 @@ custom_op_library( "ops/data_flow_ops.cc", "utils/utils.h", ], - cuda_deps = if_cuda_for_tf_serving( - ["//tensorflow_recommenders_addons/dynamic_embedding/core/lib/nvhash:nvhashtable"], - [], - FOR_TF_SERVING, - ), cuda_srcs = [ "kernels/fill_functor.cu.cc", "kernels/dynamic_partition_op_gpu.cu.cc", @@ -112,3 +90,24 @@ custom_op_library( ) # TODO: Add hkv targets. +custom_cuda_op_library( + name = "_hkv_ops.so", + srcs = [ + "ops/hkv_hashtable_ops.cc", + "utils/utils.h", + ], + copts = [ + "-Ofast", + ], + cuda_deps = if_cuda_for_tf_serving( + ["@hkv//:hkv"], + [], + FOR_TF_SERVING, + ), + cuda_srcs = if_cuda([ + "utils/utils.h", + "utils/types.h", + "kernels/cuckoo_hashtable_op_gpu.h", + "kernels/hkv_hashtable_op_gpu.cu.cc", + ]) + glob(["kernels/lookup_impl/lookup_table_op_hkv*"]), +) diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc deleted file mode 100644 index 7af17be5c..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc +++ /dev/null @@ -1,311 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_def_builder.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/utils.h" - -namespace tensorflow { - -using shape_inference::DimensionHandle; -using shape_inference::InferenceContext; -using shape_inference::ShapeAndType; -using shape_inference::ShapeHandle; - -namespace { - -Status ScalarAndTwoElementVectorInputsAndScalarOutputs(InferenceContext* c) { - ShapeHandle handle; - DimensionHandle unused_handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - for (int i = 1; i < c->num_inputs(); ++i) { - TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle)); - TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle)); - } - for (int i = 0; i < c->num_outputs(); ++i) { - c->set_output(i, c->Scalar()); - } - return Status::OK(); -} - -} // namespace - -Status ValidateTableResourceHandle(InferenceContext* c, ShapeHandle keys, - const string& key_dtype_attr, - const string& value_dtype_attr, - bool is_lookup, - ShapeAndType* output_shape_and_type) { - auto* handle_data = c->input_handle_shapes_and_types(0); - if (handle_data == nullptr || handle_data->size() != 2) { - output_shape_and_type->shape = c->UnknownShape(); - output_shape_and_type->dtype = DT_INVALID; - } else { - const ShapeAndType& key_shape_and_type = (*handle_data)[0]; - const ShapeAndType& value_shape_and_type = (*handle_data)[1]; - DataType key_dtype; - TF_RETURN_IF_ERROR(c->GetAttr(key_dtype_attr, &key_dtype)); - if (key_shape_and_type.dtype != key_dtype) { - return errors::InvalidArgument( - "Trying to read value with wrong dtype. " - "Expected ", - DataTypeString(key_shape_and_type.dtype), " got ", - DataTypeString(key_dtype)); - } - DataType value_dtype; - TF_RETURN_IF_ERROR(c->GetAttr(value_dtype_attr, &value_dtype)); - if (value_shape_and_type.dtype != value_dtype) { - return errors::InvalidArgument( - "Trying to read value with wrong dtype. " - "Expected ", - DataTypeString(value_shape_and_type.dtype), " got ", - DataTypeString(value_dtype)); - } - output_shape_and_type->dtype = value_shape_and_type.dtype; - - if (is_lookup) { - if (c->RankKnown(key_shape_and_type.shape) && c->RankKnown(keys)) { - int keys_rank = c->Rank(keys); - int key_suffix_rank = c->Rank(key_shape_and_type.shape); - if (keys_rank < key_suffix_rank) { - return errors::InvalidArgument( - "Expected keys to have suffix ", - c->DebugString(key_shape_and_type.shape), - " but saw shape: ", c->DebugString(keys)); - } - for (int d = 0; d < key_suffix_rank; d++) { - // Ensure the suffix of keys match what's in the Table. 
- DimensionHandle dim = c->Dim(key_shape_and_type.shape, d); - TF_RETURN_IF_ERROR( - c->ReplaceDim(keys, keys_rank - key_suffix_rank + d, dim, &keys)); - } - std::vector keys_prefix_vec; - keys_prefix_vec.reserve(keys_rank - key_suffix_rank); - for (int d = 0; d < keys_rank - key_suffix_rank; ++d) { - keys_prefix_vec.push_back(c->Dim(keys, d)); - } - ShapeHandle keys_prefix = c->MakeShape(keys_prefix_vec); - TF_RETURN_IF_ERROR(c->Concatenate(keys_prefix, - value_shape_and_type.shape, - &output_shape_and_type->shape)); - } else { - output_shape_and_type->shape = c->UnknownShape(); - } - } else { - TF_RETURN_IF_ERROR(c->Concatenate(keys, value_shape_and_type.shape, - &output_shape_and_type->shape)); - } - } - return Status::OK(); -} - -Status HkvHashTableShape(InferenceContext* c, const ShapeHandle& key, - const ShapeHandle& value) { - c->set_output(0, c->Scalar()); - - ShapeHandle key_s; - TF_RETURN_IF_ERROR(c->WithRankAtMost(key, 1, &key_s)); - - DataType key_t; - TF_RETURN_IF_ERROR(c->GetAttr("key_dtype", &key_t)); - - DataType value_t; - TF_RETURN_IF_ERROR(c->GetAttr("value_dtype", &value_t)); - - c->set_output_handle_shapes_and_types( - 0, std::vector{{key_s, key_t}, {value, value_t}}); - - return Status::OK(); -} - -REGISTER_OP("TfraHkvHashTableFind") - .Input("table_handle: resource") - .Input("keys: Tin") - .Input("default_value: Tout") - .Output("values: Tout") - .Attr("Tin: type") - .Attr("Tout: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - - ShapeAndType value_shape_and_type; - TF_RETURN_IF_ERROR(ValidateTableResourceHandle( - c, - /*keys=*/c->input(1), - /*key_dtype_attr=*/"Tin", - /*value_dtype_attr=*/"Tout", - /*is_lookup=*/true, &value_shape_and_type)); - c->set_output(0, value_shape_and_type.shape); - - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableFindWithExists") - .Input("table_handle: resource") - .Input("keys: Tin") - .Input("default_value: Tout") - .Output("values: Tout") - .Output("exists: bool") - .Attr("Tin: type") - .Attr("Tout: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - - ShapeHandle keys = c->UnknownShapeOfRank(1); - ShapeAndType value_shape_and_type; - TF_RETURN_IF_ERROR(ValidateTableResourceHandle( - c, - /*keys=*/c->input(1), - /*key_dtype_attr=*/"Tin", - /*value_dtype_attr=*/"Tout", - /*is_lookup=*/true, &value_shape_and_type)); - c->set_output(0, value_shape_and_type.shape); - c->set_output(1, keys); - - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableInsert") - .Input("table_handle: resource") - .Input("keys: Tin") - .Input("values: Tout") - .Attr("Tin: type") - .Attr("Tout: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - - // TODO: Validate keys and values shape. - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableAccum") - .Input("table_handle: resource") - .Input("keys: key_dtype") - .Input("values_or_deltas: value_dtype") - .Input("exists: bool") - .Attr("key_dtype: type") - .Attr("value_dtype: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - - // TODO: Validate keys and values shape. 
- return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableRemove") - .Input("table_handle: resource") - .Input("keys: Tin") - .Attr("Tin: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &handle)); - - // TODO(turboale): Validate keys shape. - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableClear") - .Input("table_handle: resource") - .Attr("key_dtype: type") - .Attr("value_dtype: type"); - -REGISTER_OP("TfraHkvHashTableSize") - .Input("table_handle: resource") - .Output("size: int64") - .SetShapeFn(ScalarAndTwoElementVectorInputsAndScalarOutputs); - -REGISTER_OP("TfraHkvHashTableExport") - .Input("table_handle: resource") - .Output("keys: Tkeys") - .Output("values: Tvalues") - .Attr("Tkeys: type") - .Attr("Tvalues: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - ShapeHandle keys = c->UnknownShapeOfRank(1); - ShapeAndType value_shape_and_type; - TF_RETURN_IF_ERROR(ValidateTableResourceHandle( - c, - /*keys=*/keys, - /*key_dtype_attr=*/"Tkeys", - /*value_dtype_attr=*/"Tvalues", - /*is_lookup=*/false, &value_shape_and_type)); - c->set_output(0, keys); - c->set_output(1, value_shape_and_type.shape); - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableSaveToFileSystem") - .Input("table_handle: resource") - .Input("dirpath: string") - .Input("file_name: string") - .Attr("key_dtype: type") - .Attr("value_dtype: type") - .Attr("dirpath_env: string") - .Attr("append_to_file: bool") - .Attr("buffer_size: int >= 1"); - -REGISTER_OP("TfraHkvHashTableImport") - .Input("table_handle: resource") - .Input("keys: Tin") - .Input("values: Tout") - .Attr("Tin: type") - .Attr("Tout: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - - ShapeHandle keys; - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys)); - TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys)); - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableLoadFromFileSystem") - .Input("table_handle: resource") - .Input("dirpath: string") - .Input("file_name: string") - .Attr("key_dtype: type") - .Attr("value_dtype: type") - .Attr("dirpath_env: string") - .Attr("load_entire_dir: bool") - .Attr("buffer_size: int >= 1"); - -REGISTER_OP("TfraHkvHashTableOfTensors") - .Output("table_handle: resource") - .Attr("container: string = ''") - .Attr("shared_name: string = ''") - .Attr("use_node_name_sharing: bool = false") - .Attr("key_dtype: type") - .Attr("value_dtype: type") - .Attr("value_shape: shape = {}") - .Attr("init_capacity: int = 0") - .Attr("max_capacity: int = 0") - .SetIsStateful() - .SetShapeFn([](InferenceContext* c) { - PartialTensorShape value_p; - TF_RETURN_IF_ERROR(c->GetAttr("value_shape", &value_p)); - ShapeHandle value_s; - TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(value_p, &value_s)); - return HkvHashTableShape(c, /*key=*/c->Scalar(), /*value=*/value_s); - }); -} // namespace tensorflow diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc index 3803893c3..5911bb778 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc @@ 
-15,7 +15,7 @@ limitations under the License. #if GOOGLE_CUDA #include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op_gpu.h" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h" #include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/utils.h" #define EIGEN_USE_GPU @@ -35,10 +35,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/path.h" #include "tensorflow/core/util/env_var.h" -#include "tensorflow/stream_executor/stream.h" - #include "tensorflow/core/util/gpu_device_functions.h" #include "tensorflow/core/util/gpu_kernel_helper.h" +#include "tensorflow/stream_executor/stream.h" namespace tensorflow { @@ -47,7 +46,7 @@ using GPUDevice = Eigen::GpuDevice; namespace recommenders_addons { namespace lookup { -constexpr size_t kDefaultGpuInitCapacity = 1024; +constexpr size_t kDefaultGpuInitCapacity = 1024 * 1024; using tensorflow::OpKernelContext; using tensorflow::lookup::LookupInterface; @@ -55,6 +54,7 @@ using tensorflow::lookup::LookupInterface; template class HkvHashTableOfTensorsGpu final : public LookupInterface { private: + std::unique_ptr allocator_ptr_; public: HkvHashTableOfTensorsGpu(OpKernelContext* ctx, OpKernel* kernel) { @@ -70,56 +70,78 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { int64 init_capacity_i64 = 0; int64 max_capacity_i64 = 0; - OP_REQUIRES_OK(ctx, GetNodeAttr(kernel->def(), "init_capacity", &init_capacity_i64)); - OP_REQUIRES_OK(ctx, GetNodeAttr(kernel->def(), "max_capacity", &max_capacity_i64)); + int64 max_hbm_for_vectors_i64 = 0; + OP_REQUIRES_OK( + ctx, GetNodeAttr(kernel->def(), "init_capacity", &init_capacity_i64)); + OP_REQUIRES_OK( + ctx, GetNodeAttr(kernel->def(), "max_capacity", &max_capacity_i64)); + OP_REQUIRES_OK(ctx, GetNodeAttr(kernel->def(), "max_hbm_for_vectors", + &max_hbm_for_vectors_i64)); + OP_REQUIRES( + ctx, (max_hbm_for_vectors_i64 >= 0), + errors::InvalidArgument("params max_hbm_for_vectors less than 0")); + options.init_capacity = static_cast(init_capacity_i64); options.max_capacity = static_cast(max_capacity_i64); + options.max_hbm_for_vectors = static_cast(max_hbm_for_vectors_i64); if (options.max_capacity == 0) { - char* env_max_capacity_str = std::getenv("TFRA_GPU_HASHTABLE_UPLIMIT_SIZE"); - if (env_max_capacity_str) { - options.max_capacity = static_cast(std::atoll(env_max_capacity_str)); - LOG(WARNING) << "GPU table max capacity was not set in attribute, get " - << options.max_capacity << " from env TFRA_GPU_HASHTABLE_UPLIMIT_SIZE."; - } else { - throw std::runtime_error("max_capaicty=0 and TFRA_GPU_HASHTABLE_UPLIMIT_SIZE not set is not valid."); - } + char* env_max_capacity_str = + std::getenv("TFRA_GPU_HASHTABLE_UPLIMIT_SIZE"); + OP_REQUIRES(ctx, (env_max_capacity_str != nullptr), + errors::InvalidArgument( + "max_capaicty=0 and TFRA_GPU_HASHTABLE_UPLIMIT_SIZE not " + "set is not valid.")); + options.max_capacity = + static_cast(std::atoll(env_max_capacity_str)); + LOG(WARNING) << "GPU table max capacity was not set in attribute, get " + << options.max_capacity + << " from env TFRA_GPU_HASHTABLE_UPLIMIT_SIZE."; } if (options.init_capacity == 0) { options.init_capacity = kDefaultGpuInitCapacity; - LOG(WARNING) << "GPU table init capacity was not set in attribute, use default" - << kDefaultGpuInitCapacity; + LOG(WARNING) + << "GPU table 
init capacity was not set in attribute, use default" + << kDefaultGpuInitCapacity; } if (options.max_capacity < options.init_capacity) { - LOG(WARNING) << "GPU table max_capacity < init_capacity, (" << options.max_capacity - << "/" << options.init_capacity << "). Reset to " << options.init_capacity; + LOG(WARNING) << "GPU table max_capacity < init_capacity, (" + << options.max_capacity << "/" << options.init_capacity + << "). Reset to " << options.init_capacity; options.max_capacity = options.init_capacity; } if (table_) { return; } - this->CreateTable(options, &table_); + allocator_ptr_ = std::make_unique(ctx); + OP_REQUIRES_OK(ctx, + this->CreateTable(options, allocator_ptr_.get(), &table_)); OP_REQUIRES(ctx, (table_ != nullptr), errors::InvalidArgument("HashTable on GPU is created failed!")); - LOG(INFO) << "GPU table max capacity was created on max_capacity: " - << options.max_capacity << ", and init capacity: " - << options.init_capacity + << options.max_capacity + << ", and init capacity: " << options.init_capacity << " with K=" << std::type_index(typeid(K)).name() << ", V=" << std::type_index(typeid(V)).name(); } ~HkvHashTableOfTensorsGpu() { + mutex_lock l(mu_); + if (table_) { + delete table_; + table_ = nullptr; + } } - void CreateTable(gpu::TableWrapperInitOptions& options, gpu::TableWrapper** pptable) { - gpu::CreateTableImpl(pptable, options, runtime_dim_); + Status CreateTable(gpu::TableWrapperInitOptions& options, + nv::merlin::BaseAllocator* allocator, + gpu::TableWrapper** pptable) { + return gpu::CreateTableImpl(pptable, options, allocator, runtime_dim_); } size_t size() const override { tf_shared_lock l(mu_); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); size_t retv = table_->get_size(stream); @@ -132,7 +154,8 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { tf_shared_lock l(mu_); auto stream = ctx->eigen_device().stream(); int64 hret = static_cast(table_->get_size(stream)); - CUDA_CHECK(cudaMemcpyAsync(s, &hret, sizeof(int64), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(s, &hret, sizeof(int64), cudaMemcpyHostToDevice, + stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -154,18 +177,23 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { is_full_default ? default_value.shape().dim_size(0) : 1; CUDA_CHECK(cudaMallocAsync(&d_status, sizeof(bool) * len, stream)); CUDA_CHECK(cudaMemsetAsync(d_status, 0, sizeof(bool) * len, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); { tf_shared_lock l(mu_); - table_->get((const K*)d_keys.tensor_data().data(), - (V*)(value->tensor_data().data()), - d_status, len, - (V*)(default_value.tensor_data().data()), - stream, is_full_default); - CUDA_CHECK(cudaStreamSynchronize(stream)); + try { + table_->get((const K*)d_keys.tensor_data().data(), + (V*)(value->tensor_data().data()), d_status, len, + (V*)(default_value.tensor_data().data()), stream, + is_full_default); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaFreeAsync(d_status, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); } @@ -187,14 +215,19 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { is_full_default ? 
default_value.shape().dim_size(0) : 1; { tf_shared_lock l(mu_); - table_->get((const K*)d_keys.tensor_data().data(), - (V*)(value->tensor_data().data()), - (bool*)exists->tensor_data().data(), len, - (V*)(default_value.tensor_data().data()), - stream, is_full_default); + try { + table_->get((const K*)d_keys.tensor_data().data(), + (V*)(value->tensor_data().data()), + (bool*)exists->tensor_data().data(), len, + (V*)(default_value.tensor_data().data()), stream, + is_full_default); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaStreamSynchronize(stream)); } + return Status::OK(); } @@ -204,10 +237,13 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { auto stream = ctx->eigen_device().stream(); { mutex_lock l(mu_); - table_->upsert((const K*)keys.tensor_data().data(), - (const V*)(values.tensor_data().data()), - len, stream); - }; + try { + table_->upsert((const K*)keys.tensor_data().data(), + (const V*)(values.tensor_data().data()), len, stream); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } + } CUDA_CHECK(cudaStreamSynchronize(stream)); return Status::OK(); @@ -219,11 +255,14 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { auto stream = ctx->eigen_device().stream(); { mutex_lock l(mu_); - table_->accum( - (const K*)keys.tensor_data().data(), - (const V*)(values_or_deltas.tensor_data().data()), - (const bool*)exists.tensor_data().data(), len, stream); - }; + try { + table_->accum((const K*)keys.tensor_data().data(), + (const V*)(values_or_deltas.tensor_data().data()), + (const bool*)exists.tensor_data().data(), len, stream); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } + } CUDA_CHECK(cudaStreamSynchronize(stream)); return Status::OK(); @@ -236,16 +275,23 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { if (len > 0) { CUDA_CHECK(cudaMallocAsync((void**)&d_keys, sizeof(K) * len, stream)); - CUDA_CHECK(cudaMemsetAsync((void*)&d_keys, 0, sizeof(K) * len, stream)); - CUDA_CHECK(cudaMemcpyAsync((void*)d_keys, (void*)keys.tensor_data().data(), - sizeof(K) * len, cudaMemcpyDefault, stream)); + CUDA_CHECK(cudaMemsetAsync((void*)d_keys, 0, sizeof(K) * len, stream)); + CUDA_CHECK(cudaMemcpyAsync((void*)d_keys, + (void*)keys.tensor_data().data(), + sizeof(K) * len, cudaMemcpyDefault, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); { mutex_lock l(mu_); - table_->remove((const K*)d_keys, len, stream); + try { + table_->remove((const K*)d_keys, len, stream); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaFreeAsync(d_keys, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); } @@ -253,7 +299,11 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { auto stream = ctx->eigen_device().stream(); { mutex_lock l(mu_); - table_->clear(stream); + try { + table_->clear(stream); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaStreamSynchronize(stream)); return Status::OK(); @@ -264,29 +314,47 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { size_t len = keys.flat().size(); K* d_keys; V* d_values; - auto stream = ctx->eigen_device().stream(); if (len > 0) { - CUDA_CHECK(cudaMallocAsync((void**)&d_keys, sizeof(K) * len, stream)); - 
CUDA_CHECK(cudaMemsetAsync((void*)&d_keys, 0, sizeof(K) * len, stream)); - CUDA_CHECK( - cudaMallocAsync((void**)&d_values, sizeof(V) * runtime_dim_ * len, stream)); - CUDA_CHECK( - cudaMemsetAsync((void*)&d_values, 0, sizeof(V) * runtime_dim_ * len, stream)); - CUDA_CHECK(cudaMemcpyAsync((void*)d_keys, (void*)keys.tensor_data().data(), - sizeof(K) * len, cudaMemcpyDefault, stream)); - CUDA_CHECK(cudaMemcpyAsync((void*)d_values, (void*)values.tensor_data().data(), - sizeof(V) * runtime_dim_ * len, cudaMemcpyDefault, stream)); + auto stream = ctx->eigen_device().stream(); + cudaPointerAttributes keys_attr; + CUDA_CHECK(cudaPointerGetAttributes(&keys_attr, + (void*)keys.tensor_data().data())); + if (keys_attr.type != cudaMemoryTypeDevice) { + CUDA_CHECK(cudaMallocManaged((void**)&d_keys, sizeof(K) * len)); + CUDA_CHECK(cudaMemcpy((void*)d_keys, (void*)keys.tensor_data().data(), + sizeof(K) * len, cudaMemcpyDefault)); + } else { + d_keys = (K*)keys.tensor_data().data(); + } + cudaPointerAttributes values_attr; + CUDA_CHECK(cudaPointerGetAttributes(&values_attr, + (void*)values.tensor_data().data())); + if (values_attr.type != cudaMemoryTypeDevice) { + CUDA_CHECK(cudaMallocManaged((void**)&d_values, + sizeof(V) * runtime_dim_ * len)); + CUDA_CHECK( + cudaMemcpy((void*)d_values, (void*)values.tensor_data().data(), + sizeof(V) * runtime_dim_ * len, cudaMemcpyDefault)); + } else { + d_values = (V*)values.tensor_data().data(); + } { mutex_lock l(mu_); - table_->clear(stream); - table_->upsert((const K*)d_keys, - (const V*)d_values, len, stream); + try { + table_->clear(stream); + table_->upsert((const K*)d_keys, (const V*)d_values, len, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } + } + if (keys_attr.type != cudaMemoryTypeDevice) { + CUDA_CHECK(cudaFree(d_keys)); + } + if (values_attr.type != cudaMemoryTypeDevice) { + CUDA_CHECK(cudaFree(d_values)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFreeAsync(d_keys, stream)); - CUDA_CHECK(cudaFreeAsync(d_values, stream)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); return Status::OK(); } @@ -306,15 +374,15 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { tf_shared_lock l(mu_); len = table_->get_capacity(); size = (int64)table_->get_size(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaMallocAsync(&d_dump_counter, sizeof(size_t), stream)); CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); AllocatorAttributes attr; - //attr.set_gpu_compatible(true); - //attr.set_nic_compatible(true); + // attr.set_gpu_compatible(true); + // attr.set_nic_compatible(true); attr.set_on_host(false); TF_RETURN_IF_ERROR( @@ -323,9 +391,14 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { "values", TensorShape({size, (int64)runtime_dim_}), &values, attr)); if (size) { tf_shared_lock l(mu_); - table_->dump((K*)keys->flat().data(), - (V*)(values->matrix().data()), offset, - len, d_dump_counter, stream); + try { + table_->dump((K*)keys->flat().data(), + (V*)(values->matrix().data()), offset, len, + d_dump_counter, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaFreeAsync(d_dump_counter, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -349,15 +422,15 @@ class 
HkvHashTableOfTensorsGpu final : public LookupInterface { tf_shared_lock l(mu_); len = table_->get_capacity(); size = (int64)table_->get_size(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaMallocAsync(&d_dump_counter, sizeof(size_t), stream)); CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); AllocatorAttributes attr; - //attr.set_gpu_compatible(true); - //attr.set_nic_compatible(true); + // attr.set_gpu_compatible(true); + // attr.set_nic_compatible(true); attr.set_on_host(false); TF_RETURN_IF_ERROR( @@ -368,11 +441,15 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { ctx->allocate_output("metas", TensorShape({(size)}), &metas, attr)); if (size) { tf_shared_lock l(mu_); - table_->dump_with_metas((K*)keys->flat().data(), - (V*)(values->matrix().data()), - (uint64_t*)(metas->flat().data()), - offset, len, d_dump_counter, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + try { + table_->dump_with_metas((K*)keys->flat().data(), + (V*)(values->matrix().data()), + (uint64_t*)(metas->flat().data()), offset, + len, d_dump_counter, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaFreeAsync(d_dump_counter, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -381,10 +458,10 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { Status ExportKeysAndMetas(OpKernelContext* ctx, size_t split_size) { tf_shared_lock l(mu_); - size_t span_len = 0; + // size_t span_len = 0; int64 size = 0; - const size_t offset = 0; + // const size_t offset = 0; Tensor* keys = nullptr; Tensor* metas = nullptr; @@ -403,10 +480,13 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { ctx->allocate_output("metas", TensorShape({(size)}), &metas, attr)); if (size) { - table_->dump_keys_and_metas((K*)keys->flat().data(), - (int64*)(metas->flat().data()), - static_cast(size), - split_size, stream); + try { + table_->dump_keys_and_metas( + (K*)keys->flat().data(), (int64*)(metas->flat().data()), + static_cast(size), split_size, stream); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } } CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -414,39 +494,77 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { } Status ExportValuesToFile(OpKernelContext* ctx, const string filepath, - const size_t buffer_size) { + const size_t buffer_size, bool append_to_file) { auto stream = ctx->eigen_device().stream(); + FileSystem* fs; + const auto env = ctx->env(); + TF_RETURN_IF_ERROR(env->GetFileSystemForFile(filepath, &fs)); { tf_shared_lock l(mu_); - table_->dump_to_file(filepath, runtime_dim_, stream, buffer_size); + try { + table_->dump_to_file(fs, filepath, runtime_dim_, stream, buffer_size, + append_to_file); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); } - Status ImportValuesFromFile(OpKernelContext* ctx, const string filepath, - const size_t buffer_size) { + Status ImportValuesFromFile(OpKernelContext* ctx, const string& dirpath, + const std::string& file_name, + const size_t buffer_size, bool load_entire_dir) { auto stream = ctx->eigen_device().stream(); + FileSystem* fs; + const auto env = ctx->env(); + 
TF_RETURN_WITH_CONTEXT_IF_ERROR(env->GetFileSystemForFile(dirpath, &fs), + "Please make sure you have already " + "imported tensorflow_io before using " + "TFRA file system operation."); + const size_t value_dim = static_cast(value_shape_.dim_size(0)); + + std::vector all_filepath; + std::string filepath = io::JoinPath(dirpath, file_name); + + if (load_entire_dir) { + string separator = "_mht_"; + int separator_pos = file_name.rfind(separator); + string file_pattern = + io::JoinPath(dirpath, + file_name.substr(0, separator_pos + separator.size())) + + "*"; + TF_RETURN_IF_ERROR(fs->GetMatchingPaths(file_pattern, &all_filepath)); + // delete -keys/-values postfix + for (auto it = all_filepath.begin(); it != all_filepath.end(); ++it) { + int kv_separator_pos = it->rfind("-"); + *it = it->substr(0, kv_separator_pos); + } + // remove duplicate elements + sort(all_filepath.begin(), all_filepath.end()); + all_filepath.erase(unique(all_filepath.begin(), all_filepath.end()), + all_filepath.end()); + } { mutex_lock l(mu_); - - string keyfile = filepath + ".keys"; - FILE* tmpfd = fopen(keyfile.c_str(), "rb"); - if (tmpfd == nullptr) { - return errors::NotFound("Failed to read key file", keyfile); + try { + table_->clear(stream); + if (load_entire_dir) { + for (const auto& path : all_filepath) { + CUDA_CHECK(cudaStreamSynchronize(stream)); + table_->load_from_file(fs, path, runtime_dim_, stream, buffer_size); + } + } else { + CUDA_CHECK(cudaStreamSynchronize(stream)); + table_->load_from_file(fs, filepath, runtime_dim_, stream, + buffer_size); + } + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); } - fseek(tmpfd, 0, SEEK_END); - long int filesize = ftell(tmpfd); - size_t size = static_cast(filesize) / sizeof(K); - fseek(tmpfd, 0, SEEK_SET); - fclose(tmpfd); - - table_->clear(stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - table_->load_from_file(filepath, size, runtime_dim_, stream, - buffer_size); } CUDA_CHECK(cudaStreamSynchronize(stream)); return Status::OK(); @@ -467,6 +585,7 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { } // namespace lookup // Table lookup op. Perform the lookup operation on the given table. +template class HashTableFindGpuOp : public OpKernel { public: explicit HashTableFindGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} @@ -475,6 +594,8 @@ class HashTableFindGpuOp : public OpKernel { lookup::LookupInterface* table; OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; // Input 0 could be a STRING_REF or a RESOURCE DataType expected_input_0 = DT_RESOURCE; @@ -495,13 +616,13 @@ class HashTableFindGpuOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output("values", output_shape, &out, attr)); - OP_REQUIRES_OK(ctx, table->Find(ctx, keys, out, default_values)); + OP_REQUIRES_OK(ctx, table_hkv->Find(ctx, keys, out, default_values)); } }; -REGISTER_KERNEL_BUILDER( - Name(PREFIX_OP_NAME(HkvHashTableFind)).Device(DEVICE_GPU), - HashTableFindGpuOp); +// REGISTER_KERNEL_BUILDER( +// Name(PREFIX_OP_NAME(HkvHashTableFind)).Device(DEVICE_GPU), +// HashTableFindGpuOp); // Table lookup op. Perform the lookup operation on the given table. 
@@ -542,7 +663,7 @@ class HashTableFindWithExistsGpuOp : public OpKernel { ctx->allocate_output("exists", keys.shape(), &exists, attr)); OP_REQUIRES_OK(ctx, table_hkv->FindWithExists(ctx, keys, values, - default_values, exists)); + default_values, exists)); } }; @@ -596,12 +717,12 @@ class HashTableAccumGpuOp : public OpKernel { const Tensor& exists = ctx->input(3); OP_REQUIRES_OK( ctx, table->CheckKeyAndValueTensorsForInsert(keys, values_or_deltas)); - OP_REQUIRES_OK(ctx, - table_hkv->Accum(ctx, keys, values_or_deltas, exists)); + OP_REQUIRES_OK(ctx, table_hkv->Accum(ctx, keys, values_or_deltas, exists)); } }; // Table remove op. +// template class HashTableRemoveGpuOp : public OpKernel { public: explicit HashTableRemoveGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} @@ -642,6 +763,7 @@ class HashTableClearGpuOp : public OpKernel { }; // Op that returns the size of the given table. +template class HashTableSizeGpuOp : public OpKernel { public: explicit HashTableSizeGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} @@ -660,12 +782,15 @@ class HashTableSizeGpuOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output("size", TensorShape({}), &out, attr)); - int64* p_size = (int64*)out->flat().data(); table_hkv->size_i64(ctx, p_size); } }; +// REGISTER_KERNEL_BUILDER( +// Name(PREFIX_OP_NAME(HkvHashTableSize)).Device(DEVICE_GPU), +// HashTableSizeGpuOp); + // Op that outputs tensors of all keys and all values. class HashTableExportGpuOp : public OpKernel { public: @@ -697,8 +822,7 @@ class HashTableExportWithMetasGpuOp : public OpKernel { core::ScopedUnref unref_me(table); lookup::HkvHashTableOfTensorsGpu* table_hkv = (lookup::HkvHashTableOfTensorsGpu*)table; - OP_REQUIRES_OK( - ctx, table_hkv->ExportValuesWithMetas(ctx)); + OP_REQUIRES_OK(ctx, table_hkv->ExportValuesWithMetas(ctx)); } }; @@ -716,39 +840,12 @@ class HashTableExportKeysAndMetasGpuOp : public OpKernel { core::ScopedUnref unref_me(table); lookup::HkvHashTableOfTensorsGpu* table_hkv = (lookup::HkvHashTableOfTensorsGpu*)table; - OP_REQUIRES_OK(ctx, table_hkv->ExportKeysAndMetas(ctx, static_cast(split_size_i64_))); - } - private: - int64 split_size_i64_; -}; - -template -class HashTableExportToFileGpuOp : public OpKernel { - public: - explicit HashTableExportToFileGpuOp(OpKernelConstruction* ctx) - : OpKernel(ctx) { - int64 signed_buffer_size = 0; - ctx->GetAttr("buffer_size", &signed_buffer_size); - buffer_size_ = static_cast(signed_buffer_size); - } - - void Compute(OpKernelContext* ctx) override { - lookup::LookupInterface* table; - OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); - core::ScopedUnref unref_me(table); - - const Tensor& ftensor = ctx->input(1); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ftensor.shape()), - errors::InvalidArgument("filepath must be scalar.")); - string filepath = string(ftensor.scalar()().data()); - lookup::HkvHashTableOfTensorsGpu* table_hkv = - (lookup::HkvHashTableOfTensorsGpu*)table; - OP_REQUIRES_OK( - ctx, table_hkv->ExportValuesToFile(ctx, filepath, buffer_size_)); + OP_REQUIRES_OK(ctx, table_hkv->ExportKeysAndMetas( + ctx, static_cast(split_size_i64_))); } private: - size_t buffer_size_; + int64 split_size_i64_; }; // Clear the table and insert data. @@ -777,14 +874,71 @@ REGISTER_KERNEL_BUILDER( Name(PREFIX_OP_NAME(HkvHashTableImport)).Device(DEVICE_GPU), HashTableImportGpuOp); +// Op that export all keys and values to FileSystem. 
+template +class HashTableSaveToFileSystemGpuOp : public OpKernel { + public: + explicit HashTableSaveToFileSystemGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dirpath_env", &dirpath_env_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("append_to_file", &append_to_file_)); + int64 signed_buffer_size = 0; + OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &signed_buffer_size)); + buffer_size_ = static_cast(signed_buffer_size); + } + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + string dirpath; + TF_CHECK_OK(ReadStringFromEnvVar(dirpath_env_, "NotFound", &dirpath)); + if (dirpath != "NotFound") { + LOG(INFO) << "Read TFRA key/value file directory path from the " + "environment variable " + << dirpath_env_ << " successfully. Saving directory path is " + << dirpath; + } else { + const Tensor& dir_tensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(dir_tensor.shape()), + errors::InvalidArgument("directory path must be scalar.")); + dirpath = string(dir_tensor.scalar()().data()); + } + + const Tensor& fname_tensor = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(fname_tensor.shape()), + errors::InvalidArgument("file name must be scalar.")); + string file_name = string(fname_tensor.scalar()().data()); + + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + LOG(INFO) << "c++ dirpath: " << dirpath << " filename: " << file_name; + std::string filepath = io::JoinPath(dirpath, file_name); + + // OP_REQUIRES_OK( + // ctx, fs->RecursivelyCreateDir(std::string(fs->Dirname(filepath)))); + + OP_REQUIRES_OK(ctx, table_hkv->ExportValuesToFile( + ctx, filepath, buffer_size_, append_to_file_)); + } + + private: + string dirpath_env_; + bool append_to_file_; + size_t buffer_size_; +}; + // Clear the table and insert data from FileSystem. template -class HashTableImportFromFileGpuOp : public OpKernel { +class HashTableLoadFromFileSystemGpuOp : public OpKernel { public: - explicit HashTableImportFromFileGpuOp(OpKernelConstruction* ctx) + explicit HashTableLoadFromFileSystemGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dirpath_env", &dirpath_env_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("load_entire_dir", &load_entire_dir_)); int64 signed_buffer_size = 0; - ctx->GetAttr("buffer_size", &signed_buffer_size); + OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &signed_buffer_size)); buffer_size_ = static_cast(signed_buffer_size); } @@ -793,85 +947,108 @@ class HashTableImportFromFileGpuOp : public OpKernel { OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); core::ScopedUnref unref_me(table); - const Tensor& ftensor = ctx->input(1); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ftensor.shape()), - errors::InvalidArgument("filepath must be scalar.")); - string filepath = string(ftensor.scalar()().data()); + string dirpath; + TF_CHECK_OK(ReadStringFromEnvVar(dirpath_env_, "NotFound", &dirpath)); + if (dirpath != "NotFound") { + LOG(INFO) << "Read TFRA key/value file directory path from the " + "environment variable " + << dirpath_env_ << " successfully. 
Saving directory path is " + << dirpath; + } else { + const Tensor& dir_tensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(dir_tensor.shape()), + errors::InvalidArgument("directory path must be scalar.")); + dirpath = string(dir_tensor.scalar()().data()); + } + + const Tensor& fname_tensor = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(fname_tensor.shape()), + errors::InvalidArgument("file name must be scalar.")); + string file_name = string(fname_tensor.scalar()().data()); + + LOG(INFO) << "c++ dirpath :" << dirpath << " filename: " << file_name; + lookup::HkvHashTableOfTensorsGpu* table_hkv = (lookup::HkvHashTableOfTensorsGpu*)table; OP_REQUIRES_OK( - ctx, table_hkv->ImportValuesFromFile(ctx, filepath, buffer_size_)); + ctx, table_hkv->ImportValuesFromFile(ctx, dirpath, file_name, + buffer_size_, load_entire_dir_)); } private: + string dirpath_env_; + bool load_entire_dir_; size_t buffer_size_; }; // Register the HkvHashTableOfTensors op. - #define REGISTER_KERNEL(key_dtype, value_dtype) \ REGISTER_KERNEL_BUILDER( \ - Name(PREFIX_OP_NAME(HkvHashTableOfTensors)) \ + Name(PREFIX_OP_NAME(HkvHashTableOfTensors)) \ .Device(DEVICE_GPU) \ .TypeConstraint("key_dtype") \ .TypeConstraint("value_dtype"), \ - HashTableGpuOp< \ - lookup::HkvHashTableOfTensorsGpu, \ - key_dtype, value_dtype>); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableClear)) \ + HashTableGpuOp, \ + key_dtype, value_dtype>); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableClear)) \ .Device(DEVICE_GPU) \ .TypeConstraint("key_dtype") \ .TypeConstraint("value_dtype"), \ HashTableClearGpuOp); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableSize)) \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableSize)) \ .Device(DEVICE_GPU) \ .TypeConstraint("key_dtype") \ .TypeConstraint("value_dtype"), \ HashTableSizeGpuOp); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableAccum)) \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableAccum)) \ .Device(DEVICE_GPU) \ .TypeConstraint("key_dtype") \ .TypeConstraint("value_dtype"), \ HashTableAccumGpuOp); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportWithMetas)) \ - .Device(DEVICE_GPU) \ - .TypeConstraint("key_dtype") \ - .TypeConstraint("value_dtype"), \ - HashTableExportWithMetasGpuOp); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportToFile)) \ - .Device(DEVICE_GPU) \ - .HostMemory("filepath") \ - .TypeConstraint("key_dtype") \ - .TypeConstraint("value_dtype"), \ - HashTableExportToFileGpuOp); \ REGISTER_KERNEL_BUILDER( \ - Name(PREFIX_OP_NAME(HkvHashTableImportFromFile)) \ + Name(PREFIX_OP_NAME(HkvHashTableExportWithMetas)) \ .Device(DEVICE_GPU) \ - .HostMemory("filepath") \ .TypeConstraint("key_dtype") \ .TypeConstraint("value_dtype"), \ - HashTableImportFromFileGpuOp); \ + HashTableExportWithMetasGpuOp); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableFind)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tin") \ + .TypeConstraint("Tout"), \ + HashTableFindGpuOp); \ REGISTER_KERNEL_BUILDER( \ - Name(PREFIX_OP_NAME(HkvHashTableFindWithExists)) \ + Name(PREFIX_OP_NAME(HkvHashTableFindWithExists)) \ .Device(DEVICE_GPU) \ .TypeConstraint("Tin") \ .TypeConstraint("Tout"), \ - HashTableFindWithExistsGpuOp); + HashTableFindWithExistsGpuOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableSaveToFileSystem)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableSaveToFileSystemGpuOp); \ + 
REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableLoadFromFileSystem)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableLoadFromFileSystemGpuOp); REGISTER_KERNEL(int64, float); -REGISTER_KERNEL(int64, Eigen::half); -REGISTER_KERNEL(int64, int64); -REGISTER_KERNEL(int64, int32); REGISTER_KERNEL(int64, int8); -REGISTER_KERNEL(int32, float); +REGISTER_KERNEL(int64, int32); +REGISTER_KERNEL(int64, int64); +REGISTER_KERNEL(int64, Eigen::half); #undef REGISTER_KERNEL -#define SINGLE_ATTR_REGISTER_KERNEL(key_dtype, value_type) \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportKeysAndMetas)) \ - .Device(DEVICE_GPU) \ - .TypeConstraint("Tkeys"), \ - HashTableExportKeysAndMetasGpuOp); +#define SINGLE_ATTR_REGISTER_KERNEL(key_dtype, value_type) \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableExportKeysAndMetas)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys"), \ + HashTableExportKeysAndMetasGpuOp); SINGLE_ATTR_REGISTER_KERNEL(int64, float); diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h index 0da759780..3e109962e 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h @@ -16,14 +16,20 @@ limitations under the License. #ifndef TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ #define TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ -#include #include #include -#include -#include #include + #include +#include +#include +#include +#include "merlin/allocator.cuh" +#include "merlin/types.cuh" +#include "merlin/utils.cuh" +#include "merlin_hashtable.cuh" +#include "merlin_localfile.hpp" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/lookup_interface.h" #include "tensorflow/core/framework/op_kernel.h" @@ -33,12 +39,10 @@ limitations under the License. 
#include "tensorflow/core/kernels/lookup_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/buffered_inputstream.h" +#include "tensorflow/core/lib/io/random_inputstream.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/thread_annotations.h" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_localfile.hpp" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh" namespace tensorflow { namespace recommenders_addons { @@ -50,9 +54,7 @@ class KVOnlyFile : public nv::merlin::BaseKVFile { public: KVOnlyFile() : keys_fp_(nullptr), values_fp_(nullptr) {} - ~KVOnlyFile() { - close(); - } + ~KVOnlyFile() { close(); } bool open(const std::string& keys_path, const std::string& values_path, const char* mode) { @@ -80,21 +82,23 @@ class KVOnlyFile : public nv::merlin::BaseKVFile { } } - size_t read(const size_t n, const size_t dim, K* keys, V* vectors, M* metas) override { + size_t read(const size_t n, const size_t dim, K* keys, V* vectors, + M* metas) override { size_t nread_keys = fread(keys, sizeof(K), static_cast(n), keys_fp_); size_t nread_vecs = fread(vectors, sizeof(V) * dim, static_cast(n), values_fp_); if (nread_keys != nread_vecs) { - LOG(INFO) << "Partially read failed. " << nread_keys << " kv pairs by KVOnlyFile."; + LOG(INFO) << "Partially read failed. " << nread_keys + << " kv pairs by KVOnlyFile."; return 0; } LOG(INFO) << "Partially read " << nread_keys << " kv pairs by KVOnlyFile."; return nread_keys; } - size_t write(const size_t n, const size_t dim, const K* keys, const V* vectors, - const M* metas) override { + size_t write(const size_t n, const size_t dim, const K* keys, + const V* vectors, const M* metas) override { size_t nwritten_keys = fwrite(keys, sizeof(K), static_cast(n), keys_fp_); size_t nwritten_vecs = @@ -102,7 +106,8 @@ class KVOnlyFile : public nv::merlin::BaseKVFile { if (nwritten_keys != nwritten_vecs) { return 0; } - LOG(INFO) << "Partially write " << nwritten_keys << " kv pairs by KVOnlyFile."; + LOG(INFO) << "Partially write " << nwritten_keys + << " kv pairs by KVOnlyFile."; return nwritten_keys; } @@ -111,9 +116,157 @@ class KVOnlyFile : public nv::merlin::BaseKVFile { FILE* values_fp_; }; +template +class RandomKVFile : public nv::merlin::BaseKVFile { + public: + RandomKVFile(FileSystem* fs, const std::string& filepath, size_t value_dim, + size_t buffer_size, bool append_to_file = false) + : fs_(fs), + filepath_(filepath), + value_dim_(value_dim), + buffer_size_(buffer_size), + append_to_file_(append_to_file) {} + + ~RandomKVFile() {} + + Status open(const std::string& key_filepath, + const std::string& value_filepath, const std::string& mode) { + key_buffer_byte_size_ = buffer_size_ * sizeof(K); + const size_t value_len = sizeof(V) * value_dim_; + value_buffer_byte_size_ = buffer_size_ * value_len; + + if ("rb" == mode) { + TF_RETURN_IF_ERROR(fs_->FileExists(key_filepath)); + TF_RETURN_IF_ERROR(fs_->NewRandomAccessFile(key_filepath, &key_file_)); + key_input_stream_ = + std::make_unique(key_file_.get()); + key_reader_ = std::make_unique( + key_input_stream_.get(), key_buffer_byte_size_ * 2); + + TF_RETURN_IF_ERROR(fs_->FileExists(value_filepath)); + TF_RETURN_IF_ERROR( + 
fs_->NewRandomAccessFile(value_filepath, &value_file_)); + value_input_stream_ = + std::make_unique(value_file_.get()); + value_reader_ = std::make_unique( + value_input_stream_.get(), value_buffer_byte_size_ * 2); + + uint64 key_file_size = 0; + TF_RETURN_IF_ERROR(fs_->GetFileSize(key_filepath, &key_file_size)); + size_t key_size = key_file_size / sizeof(K); + + uint64 value_file_size = 0; + TF_RETURN_IF_ERROR(fs_->GetFileSize(value_filepath, &value_file_size)); + size_t value_size = value_file_size / value_len; + + if (key_size != value_size) { + return errors::Unavailable( + "the keys number in file " + key_filepath + + " is not equal to the value vectors number in file " + + value_filepath + "."); + } + } else if ("wb" == mode) { + std::string key_tmpfilepath(key_filepath + ".tmp"); + std::string value_tmpfilepath(value_filepath + ".tmp"); + + bool has_atomic_move = false; + auto has_atomic_move_ret = + fs_->HasAtomicMove(filepath_, &has_atomic_move); + bool need_tmp_file = + (has_atomic_move == false) || (has_atomic_move_ret != Status::OK()); + + if (!need_tmp_file) { + key_tmpfilepath = key_filepath; + value_tmpfilepath = value_filepath; + } + TF_RETURN_IF_ERROR( + fs_->RecursivelyCreateDir(std::string(fs_->Dirname(filepath_)))); + + if (append_to_file_) { + TF_RETURN_IF_ERROR( + fs_->NewAppendableFile(key_tmpfilepath, &key_writer_)); + TF_RETURN_IF_ERROR( + fs_->NewAppendableFile(value_tmpfilepath, &value_writer_)); + } else { + TF_RETURN_IF_ERROR(fs_->NewWritableFile(key_tmpfilepath, &key_writer_)); + TF_RETURN_IF_ERROR( + fs_->NewWritableFile(value_tmpfilepath, &value_writer_)); + } + } + return Status::OK(); + } + + void close() { + if (key_writer_) { + key_writer_->Flush(); + } + if (value_writer_) { + value_writer_->Flush(); + } + } + + size_t read(const size_t n, const size_t dim, K* keys, V* vectors, + M* metas) override { + size_t key_read_byte = n * sizeof(K); + size_t value_read_byte = n * sizeof(V) * dim; + + key_buffer_.resize(key_read_byte); + value_buffer_.resize(value_read_byte); + + key_reader_->ReadNBytes(key_read_byte, &key_buffer_); + value_reader_->ReadNBytes(value_read_byte, &value_buffer_); + + memcpy((char*)keys, key_buffer_.data(), key_buffer_.size()); + memcpy((char*)vectors, value_buffer_.data(), value_buffer_.size()); + + size_t nread_keys = key_buffer_.size() / sizeof(K); + return nread_keys; + } + + size_t write(const size_t n, const size_t dim, const K* keys, + const V* vectors, const M* metas) override { + size_t key_write_byte = n * sizeof(K); + size_t value_write_byte = n * sizeof(V) * value_dim_; + std::vector key_buffer_vector(key_buffer_byte_size_); + char* key_buffer = key_buffer_vector.data(); + std::vector value_buffer_vector(value_buffer_byte_size_); + char* value_buffer = value_buffer_vector.data(); + + memcpy(key_buffer, (void*)keys, key_write_byte); + memcpy(value_buffer, (void*)vectors, value_write_byte); + + key_writer_->Append(StringPiece(key_buffer, key_write_byte)); + value_writer_->Append(StringPiece(value_buffer, value_write_byte)); + + return n; + } + + private: + size_t value_dim_; + FileSystem* fs_ = nullptr; + std::string filepath_; + size_t buffer_size_; + size_t key_buffer_byte_size_; + size_t value_buffer_byte_size_; + tstring key_buffer_; + tstring value_buffer_; + bool append_to_file_ = false; + + std::unique_ptr key_writer_ = nullptr; + std::unique_ptr value_writer_ = nullptr; + + std::unique_ptr key_file_ = nullptr; + std::unique_ptr value_file_ = nullptr; + std::unique_ptr key_input_stream_ = nullptr; + std::unique_ptr 
value_input_stream_ = nullptr; + std::unique_ptr key_reader_ = nullptr; + std::unique_ptr value_reader_ = nullptr; +}; + // template to avoid multidef in compile time only. template -__global__ void gpu_u64_to_i64_kernel(const uint64_t* u64, int64* i64, size_t len) { +__global__ void gpu_u64_to_i64_kernel(const uint64_t* u64, int64* i64, + size_t len) { size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid < len) { i64[tid] = static_cast(u64[tid]); @@ -129,10 +282,12 @@ __global__ void broadcast_kernel(T* data, T val, size_t n) { } template -void gpu_cast_u64_to_i64(const uint64_t* u64, int64* i64, size_t len, cudaStream_t stream) { +void gpu_cast_u64_to_i64(const uint64_t* u64, int64* i64, size_t len, + cudaStream_t stream) { size_t block_size = nv::merlin::SAFE_GET_BLOCK_SIZE(1024); size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size); - gpu_u64_to_i64_kernel<<>>(u64, i64, len); + gpu_u64_to_i64_kernel + <<>>(u64, i64, len); } using GPUDevice = Eigen::ThreadPoolDevice; @@ -140,32 +295,159 @@ using GPUDevice = Eigen::ThreadPoolDevice; struct TableWrapperInitOptions { size_t max_capacity; size_t init_capacity; + size_t max_hbm_for_vectors; + size_t max_bucket_size; + float max_load_factor; + int block_size; + int io_block_size; +}; + +template +__global__ void gpu_fill_default_values(V* d_vals, V* d_def_val, size_t len, + size_t dim) { + int threadId = blockIdx.x * blockDim.x + threadIdx.x; + if (threadId < len) { +#pragma unroll + for (int i = 0; i < dim; i++) { + d_vals[threadId * dim + i] = d_def_val[i]; + } + } +} + +class TFOrDefaultAllocator : public nv::merlin::BaseAllocator { + private: + using NMMemType = nv::merlin::MemoryType; + // tensorflow::Allocator* tf_host_allocator_ = nullptr; + tensorflow::Allocator* tf_device_allocator_ = nullptr; + std::unique_ptr default_allocator_ = nullptr; + bool use_default_allocator_ = false; + // bool tf_async_allocator_stream_set_ = false; + static constexpr size_t kAllocatorAlignment = 8; + + public: + TFOrDefaultAllocator() : use_default_allocator_(true) { + default_allocator_ = std::make_unique(); + } + + TFOrDefaultAllocator(OpKernelContext* ctx) { + if (ctx) { + tensorflow::AllocatorAttributes tf_alloc_attrs; + tf_device_allocator_ = ctx->get_allocator(tf_alloc_attrs); + } else { + use_default_allocator_ = true; + default_allocator_ = std::make_unique(); + } + } + + ~TFOrDefaultAllocator() override {} + + void alloc(const NMMemType type, void** ptr, size_t size, + unsigned int pinned_flags = cudaHostAllocDefault) override { + if (!use_default_allocator_) { + switch (type) { + case NMMemType::Device: + *ptr = tf_device_allocator_->AllocateRaw(kAllocatorAlignment, size); + if (nullptr == *ptr) { + throw std::runtime_error( + "Failed to allocator gpu memory, please adjust param 'max_hbm' " + "smaller."); + } + break; + case NMMemType::Pinned: + CUDA_CHECK(cudaMallocHost(ptr, size, pinned_flags)); + break; + case NMMemType::Host: + *ptr = std::malloc(size); + break; + } + } else { + default_allocator_->alloc(type, ptr, size, pinned_flags); + } + } + + void alloc_async(const NMMemType type, void** ptr, size_t size, + cudaStream_t stream) override { + if (!use_default_allocator_) { + if (NMMemType::Device == type) { + *ptr = tf_device_allocator_->AllocateRaw(kAllocatorAlignment, size); + if (nullptr == *ptr) { + throw std::runtime_error( + "Failed to allocator gpu memory, please adjust param 'max_hbm' " + "smaller."); + } + } + } else { + default_allocator_->alloc_async(type, ptr, size, stream); + } + } + + void free(const 
NMMemType type, void* ptr) override { + if (!use_default_allocator_) { + switch (type) { + case NMMemType::Device: + tf_device_allocator_->DeallocateRaw(ptr); + break; + case NMMemType::Pinned: + CUDA_CHECK(cudaFreeHost(ptr)); + break; + case NMMemType::Host: + std::free(ptr); + break; + } + } else { + default_allocator_->free(type, ptr); + } + } + + void free_async(const NMMemType type, void* ptr, + cudaStream_t stream) override { + if (!use_default_allocator_) { + if (NMMemType::Device == type) { + tf_device_allocator_->DeallocateRaw(ptr); + } + } else { + default_allocator_->free_async(type, ptr, stream); + } + } }; template class TableWrapper { private: - //using M = uint64_t; + // using M = uint64_t; using Table = nv::merlin::HashTable; + nv::merlin::HashTableOptions mkv_options_; public: TableWrapper(TableWrapperInitOptions& init_options, size_t dim) { max_capacity_ = init_options.max_capacity; dim_ = dim; - nv::merlin::HashTableOptions mkv_options; - mkv_options.init_capacity = std::min(init_options.init_capacity, max_capacity_); - mkv_options.max_capacity = max_capacity_; + // nv::merlin::HashTableOptions mkv_options_; + mkv_options_.init_capacity = + std::min(init_options.init_capacity, max_capacity_); + mkv_options_.max_capacity = max_capacity_; // Since currently GPU nodes are not compatible to fast // pcie connections for D2H non-continous wirte, so just // use pure hbm mode now. - mkv_options.max_hbm_for_vectors = std::numeric_limits::max(); - mkv_options.max_load_factor = 0.63; - mkv_options.block_size = nv::merlin::SAFE_GET_BLOCK_SIZE(1024); - mkv_options.dim = dim; - mkv_options.evict_strategy = nv::merlin::EvictStrategy::kCustomized; - block_size_ = mkv_options.block_size; + // mkv_options_.max_hbm_for_vectors = std::numeric_limits::max(); + mkv_options_.max_hbm_for_vectors = init_options.max_hbm_for_vectors; + mkv_options_.max_load_factor = 0.5; + mkv_options_.block_size = nv::merlin::SAFE_GET_BLOCK_SIZE(128); + mkv_options_.dim = dim; + // mkv_options_.evict_strategy = nv::merlin::EvictStrategy::kCustomized; + mkv_options_.evict_strategy = nv::merlin::EvictStrategy::kLru; + + block_size_ = mkv_options_.block_size; table_ = new Table(); - table_->init(mkv_options); + } + + Status init(nv::merlin::BaseAllocator* allocator) { + try { + table_->init(mkv_options_, allocator); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } + return Status::OK(); } ~TableWrapper() { delete table_; } @@ -173,61 +455,79 @@ class TableWrapper { void upsert(const K* d_keys, const V* d_vals, size_t len, cudaStream_t stream) { uint64_t t0 = (uint64_t)time(NULL); - uint64_t* timestamp_metas = nullptr; - CUDA_CHECK(cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); - CUDA_CHECK(cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + // uint64_t* timestamp_metas = nullptr; + // CUDA_CHECK( + // cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); + // CUDA_CHECK( + // cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + // CUDA_CHECK(cudaStreamSynchronize(stream)); + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); - broadcast_kernel<<>>(timestamp_metas, t0, len); + // broadcast_kernel + // <<>>(timestamp_metas, t0, len); - table_->insert_or_assign(len, d_keys, d_vals, /*d_metas=*/timestamp_metas, stream); - CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); + table_->insert_or_assign(len, d_keys, d_vals, /*d_metas=*/nullptr, stream); + // 
CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void accum(const K* d_keys, const V* d_vals_or_deltas, - const bool* d_exists, size_t len, cudaStream_t stream) { + void accum(const K* d_keys, const V* d_vals_or_deltas, const bool* d_exists, + size_t len, cudaStream_t stream) { uint64_t t0 = (uint64_t)time(NULL); - uint64_t* timestamp_metas = nullptr; - CUDA_CHECK(cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); - CUDA_CHECK(cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + // uint64_t* timestamp_metas = nullptr; + // CUDA_CHECK( + // cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); + // CUDA_CHECK( + // cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + // CUDA_CHECK(cudaStreamSynchronize(stream)); + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); - broadcast_kernel<<>>(timestamp_metas, t0, len); - table_->accum_or_assign(len, d_keys, d_vals_or_deltas, d_exists, /*d_metas=*/timestamp_metas, stream); - CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)) + // broadcast_kernel + // <<>>(timestamp_metas, t0, len); + table_->accum_or_assign(len, d_keys, d_vals_or_deltas, d_exists, + /*d_metas=*/nullptr, stream); + // CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void dump(K* d_key, V* d_val, const size_t offset, - const size_t search_length, size_t* d_dump_counter, - cudaStream_t stream) const { - table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, /*d_metas=*/nullptr, stream); + void dump(K* d_key, V* d_val, const size_t offset, const size_t search_length, + size_t* d_dump_counter, cudaStream_t stream) const { + table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, + /*d_metas=*/nullptr, stream); } - void dump_with_metas(K* d_key, V* d_val, uint64_t* d_metas, const size_t offset, - const size_t search_length, size_t* d_dump_counter, - cudaStream_t stream) const { - table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, d_metas, stream); + void dump_with_metas(K* d_key, V* d_val, uint64_t* d_metas, + const size_t offset, const size_t search_length, + size_t* d_dump_counter, cudaStream_t stream) const { + table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, + d_metas, stream); } - void dump_keys_and_metas(K* keys, int64* metas, size_t len, - size_t split_len, cudaStream_t stream) const { + void dump_keys_and_metas(K* keys, int64* metas, size_t len, size_t split_len, + cudaStream_t stream) const { V* values_buf = nullptr; size_t offset = 0; size_t real_offset = 0; size_t skip = split_len; uint64_t* metas_u64 = reinterpret_cast(metas); size_t span_len = table_->capacity(); - CUDA_CHECK(cudaMallocAsync(&values_buf, sizeof(V) * dim_ * split_len, stream)); - CUDA_CHECK(cudaMemsetAsync(values_buf, 0, sizeof(V) * dim_ * split_len, stream)); + CUDA_CHECK( + cudaMallocAsync(&values_buf, sizeof(V) * dim_ * split_len, stream)); + CUDA_CHECK( + cudaMemsetAsync(values_buf, 0, sizeof(V) * dim_ * split_len, stream)); for (; offset < span_len; offset += split_len) { if (offset + skip > span_len) { skip = span_len - offset; } // TODO: overlap the loop - size_t h_dump_counter = table_->export_batch(skip, offset, keys + real_offset, values_buf, metas_u64 + real_offset, stream); + size_t h_dump_counter = + table_->export_batch(skip, offset, keys + real_offset, values_buf, + metas_u64 + 
real_offset, stream); CudaCheckError(); if (h_dump_counter > 0) { - gpu_cast_u64_to_i64(metas_u64 + real_offset, metas + real_offset, h_dump_counter, stream); + gpu_cast_u64_to_i64(metas_u64 + real_offset, metas + real_offset, + h_dump_counter, stream); real_offset += h_dump_counter; } CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -237,80 +537,131 @@ class TableWrapper { CUDA_CHECK(cudaStreamSynchronize(stream)); } - void dump_to_file(const string filepath, size_t dim, - cudaStream_t stream, - const size_t buffer_size) const { + // TODO (LinGeLin) support metas + bool is_valid_metas(const std::string& keyfile, + const std::string& metafile) const { + return false; + } + + void dump_to_file(FileSystem* fs, const string filepath, size_t dim, + cudaStream_t stream, const size_t buffer_size, + bool append_to_file) const { LOG(INFO) << "dump_to_file, filepath: " << filepath << ", dim: " << dim << ", stream: " << stream << ", buffer_size: " << buffer_size; - std::unique_ptr> wfile; - string keyfile = ; - string valuefile = ; - string metafile = ; - - wfile.reset(new TimestampV1CompatFile); - bool open_ok = wfile->open(keyfile, valuefile, metafile, "wb"); - if (!open_ok) { - std::string error_msg = "Failed to dump to file to " + keyfile + ", " + valuefile + ", " + metafile; + + std::unique_ptr> wfile; + + string keyfile = filepath + "-keys"; + string valuefile = filepath + "-values"; + string metafile = filepath + "-metas"; + bool has_metas = false; + Status status = Status::OK(); + + if (is_valid_metas(keyfile, metafile)) { + wfile.reset(new nv::merlin::LocalKVFile); + bool open_ok = reinterpret_cast*>( + wfile.get()) + ->open(keyfile, valuefile, metafile, "wb"); + has_metas = true; + if (!open_ok) { + std::string error_msg = "Failed to dump to file to " + keyfile + ", " + + valuefile + ", " + metafile; + throw std::runtime_error(error_msg); + } + } else { + wfile.reset(new RandomKVFile( + fs, filepath, dim, buffer_size, append_to_file)); + status = reinterpret_cast*>(wfile.get()) + ->open(keyfile, valuefile, "wb"); + } + if (!status.ok()) { + std::string error_msg = "Failed to dump to file to " + keyfile + ", " + + valuefile + ", " + metafile + " " + + status.ToString(); throw std::runtime_error(error_msg); } size_t n_saved = table_->save(wfile.get(), buffer_size, stream); - LOG(INFO) << "[op] Save " << n_saved << " pairs into keyfile: " - << keyfile << ", and valuefile: " << valuefile - << ", and metafile: " << metafile; + if (has_metas) { + LOG(INFO) << "[op] Save " << n_saved << " pairs from keyfile: " << keyfile + << ", and valuefile: " << valuefile << ", and metafile" + << metafile; + } else { + LOG(INFO) << "[op] Save " << n_saved << " pairs from keyfile: " << keyfile + << ", and valuefile: " << valuefile; + } CUDA_CHECK(cudaStreamSynchronize(stream)); - wfile->close(); + if (has_metas) { + reinterpret_cast*>(wfile.get()) + ->close(); + } else { + reinterpret_cast*>(wfile.get())->close(); + } } - void load_from_file(const string filepath, - size_t key_num, size_t dim, cudaStream_t stream, - const size_t buffer_size) { + void load_from_file(FileSystem* fs, const string filepath, size_t dim, + cudaStream_t stream, const size_t buffer_size) { std::unique_ptr> rfile; - string keyfile = ; - string valuefile = ; - string metafile = ; - //rfile.reset(new TimestampV1CompatFile); + string keyfile = filepath + "-keys"; + string valuefile = filepath + "-values"; + string metafile = filepath + "-metas"; bool has_metas = false; - bool open_ok = false; + Status status = Status::OK(); if 
(is_valid_metas(keyfile, metafile)) { - rfile.reset(new TimestampV1CompatFile); - open_ok = reinterpret_cast*>(rfile.get())->open(keyfile, valuefile, metafile, "rb"); + rfile.reset(new nv::merlin::LocalKVFile); + bool open_ok = reinterpret_cast*>( + rfile.get()) + ->open(keyfile, valuefile, metafile, "rb"); has_metas = true; + if (!open_ok) { + std::string error_msg = "Failed to load from file " + keyfile + ", " + + valuefile + ", " + metafile; + throw std::runtime_error(error_msg); + } } else { - rfile.reset(new KVOnlyFile); - open_ok = reinterpret_cast*>(rfile.get())->open(keyfile, valuefile, "rb"); + rfile.reset( + new RandomKVFile(fs, filepath, dim, buffer_size)); + status = reinterpret_cast*>(rfile.get()) + ->open(keyfile, valuefile, "rb"); } - if (!open_ok) { - std::string error_msg = "Failed to load from file to " + keyfile + ", " + valuefile + ", " + metafile; - throw std::runtime_error("Failed to "); + if (!status.ok()) { + std::string error_msg = "Failed to load from file " + keyfile + ", " + + valuefile + ", " + metafile + " " + + status.ToString(); + throw std::runtime_error(error_msg); } size_t n_loaded = table_->load(rfile.get(), buffer_size, stream); if (has_metas) { - LOG(INFO) << "[op] Load " << n_loaded << " pairs into keyfile: " - << keyfile << ", and valuefile: " << valuefile - << ", and metafile" << metafile; + LOG(INFO) << "[op] Load " << n_loaded + << " pairs from keyfile: " << keyfile + << ", and valuefile: " << valuefile << ", and metafile" + << metafile; } else { - LOG(INFO) << "[op] Load " << n_loaded << " pairs into keyfile: " - << keyfile << ", and valuefile: " << valuefile; + LOG(INFO) << "[op] Load " << n_loaded + << " pairs from keyfile: " << keyfile + << ", and valuefile: " << valuefile; } CUDA_CHECK(cudaStreamSynchronize(stream)); if (has_metas) { - reinterpret_cast*>(rfile.get())->close(); + reinterpret_cast*>(rfile.get()) + ->close(); } else { - reinterpret_cast*>(rfile.get())->close(); + reinterpret_cast*>(rfile.get())->close(); } } - void get(const K* d_keys, V* d_vals, bool* d_status, size_t len, - V* d_def_val, cudaStream_t stream, - bool is_full_size_default) const { + void get(const K* d_keys, V* d_vals, bool* d_status, size_t len, V* d_def_val, + cudaStream_t stream, bool is_full_size_default) const { if (is_full_size_default) { - CUDA_CHECK(cudaMemcpyAsync(d_vals, d_def_val, sizeof(V) * dim_ * len, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_vals, d_def_val, sizeof(V) * dim_ * len, + cudaMemcpyDeviceToDevice, stream)); } else { size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); - gpu_fill_default_values<<>>(d_vals, d_def_val, len, dim_); + gpu_fill_default_values + <<>>( + d_vals, d_def_val, len, dim_); } table_->find(len, d_keys, d_vals, d_status, /*d_metas=*/nullptr, stream); } @@ -326,9 +677,7 @@ class TableWrapper { CUDA_CHECK(cudaStreamSynchronize(stream)); } - size_t get_size(cudaStream_t stream) const { - return table_->size(stream); - } + size_t get_size(cudaStream_t stream) const { return table_->size(stream); } size_t get_capacity() const { return table_->capacity(); } @@ -347,9 +696,12 @@ class TableWrapper { }; template -void CreateTableImpl(TableWrapper** pptable, TableWrapperInitOptions& options, - size_t runtime_dim) { +Status CreateTableImpl(TableWrapper** pptable, + TableWrapperInitOptions& options, + nv::merlin::BaseAllocator* allocator, + size_t runtime_dim) { *pptable = new TableWrapper(options, runtime_dim); + return (*pptable)->init(allocator); } } // namespace gpu diff --git 
a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc index 47660f73c..10878c3d7 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc @@ -13,24 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h" + namespace tensorflow { namespace recommenders_addons { namespace lookup { namespace gpu { #define DEFINE_PURE_GPU_HASHTABLE(key_type, value_type) \ - template<> class TableWrapper - + template <> \ + class TableWrapper DEFINE_PURE_GPU_HASHTABLE(int64, float); +DEFINE_PURE_GPU_HASHTABLE(int64, int8); DEFINE_PURE_GPU_HASHTABLE(int64, int32); DEFINE_PURE_GPU_HASHTABLE(int64, int64); -DEFINE_PURE_GPU_HASHTABLE(int64, int64); +DEFINE_PURE_GPU_HASHTABLE(int64, Eigen::half); #undef DEFINE_PURE_GPU_HASHTABLE - } // namespace gpu } // namespace lookup } // namespace recommenders_addons diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD deleted file mode 100644 index b7d22e73b..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD +++ /dev/null @@ -1,42 +0,0 @@ -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configured") - -package(default_visibility = ["//visibility:public"]) - -load("//tensorflow_recommenders_addons:tensorflow_recommenders_addons.bzl", "custom_op_library", - "if_cuda_for_tf_serving", "tf_nvcc_binary") - -cc_library( - name = "merlin_kv", - hdrs = [ - "merlin/array_kernels.cuh", - "merlin/core_kernels.cuh", - "merlin/debug.hpp", - "merlin/flexible_buffer.cuh", - "merlin/memory_pool.cuh", - "merlin/group_lock.hpp", - #"merlin/initializers.cuh", - #"merlin/managed.cuh", - #"merlin/optimizers.cuh", - "merlin/types.cuh", - "merlin/utils.cuh", - "merlin_hashtable.cuh", - "merlin_localfile.hpp", - ], - deps = [ - "//tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils:cuda_utils", - ], -) - -tf_nvcc_binary( - name="merlin_kv_insert_and_evict_test", - srcs=[], - cuda_srcs=[ - "insert_and_evict_test.cu.cc", - "test_util.cu.h", - ], - cuda_deps=[ - ":merlin_kv", - ], - copts=[], - linkopts=[], -) diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh deleted file mode 100644 index cd11fb713..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh +++ /dev/null @@ -1,345 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http:///www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" - -#include -#include "cuda_runtime.h" -#include "thrust/device_vector.h" -#include "thrust/execution_policy.h" -#include "thrust/scan.h" -#include "thrust/count.h" -#include "types.cuh" -#include "utils.cuh" - -namespace nv { -namespace merlin { - -template -__global__ void keys_not_empty(const K* keys, bool* masks, size_t n) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < n) { - masks[tid] = keys[tid] != EMPTY_KEY; - } -} - -template -__global__ void gpu_cell_count(const bool* masks, bool target, - Tidx* offsets, size_t n, size_t* n_existed) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - bool is_existed = false; - if (tid < n) { - if (masks[tid] == target) { - is_existed = true; - } - } - unsigned int vote = g.ballot(is_existed); - int g_ones = __popc((int)vote); - if (rank == 0 && tid < n) { - offsets[tid / TILE_SIZE] = static_cast(g_ones); - atomicAdd(static_cast(n_existed), static_cast(g_ones)); - } -} - -template -__global__ void gpu_select_key_kernel(const bool* masks, bool target, size_t n, - const Tidx* offsets, const K* __restrict keys, - K* __restrict outkeys, Tidx* outoffsets) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - bool is_existed = false; - if (tid < n) { - if (masks[tid] == target) { - is_existed = true; - } - } - unsigned int vote = g.ballot(is_existed); - unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); - - if (tid < n) { - r_vote = r_vote >> (TILE_SIZE - rank - 1); - if (masks[tid] == target) { - int prefix_n = __popc(r_vote) - 1; - Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); - outkeys[bias] = keys[tid]; - outoffsets[bias] = static_cast(tid); - } - } -} - -template -__global__ void gpu_select_kv_kernel(const bool* masks, bool target, size_t n, - const Tidx* offsets, - const K* __restrict keys, - V* __restrict values, - K* __restrict outkeys, - V* __restrict outvalues, - const size_t dim) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - bool is_existed = false; - if (tid < n) { - if (masks[tid] == target) { - is_existed = true; - } - } - unsigned int vote = g.ballot(is_existed); - unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); - - if (tid < n) { - r_vote = r_vote >> (TILE_SIZE - rank - 1); - if (masks[tid] == target) { - int prefix_n = __popc(r_vote) - 1; - Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); - outkeys[bias] = keys[tid]; - for (size_t i=0;i -__global__ void gpu_select_kvm_kernel(const bool* masks, size_t n, - const Tidx* offsets, K* __restrict keys, - V* __restrict values, M* __restrict metas, - const size_t dim) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - bool is_existed = false; 
- if (tid < n) { - if (masks[tid]) { - is_existed = true; - } - } - unsigned int vote = g.ballot(is_existed); - unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); - K empty_key = (K)EMPTY_KEY; - - if (tid < n) { - r_vote = r_vote >> (TILE_SIZE - rank - 1); - if (masks[tid]) { - int prefix_n = __popc(r_vote) - 1; - Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); - - if (bias == tid) return; - - K target_key = 0; - AtomicKey* atomic_key = reinterpret_cast*>(keys) + bias; - while (target_key != empty_key) { - //target_key = atomicCAS(keys + bias, empty_key, keys[tid]); - target_key = empty_key; - atomic_key->compare_exchange_weak(target_key, keys[tid], - cuda::std::memory_order_relaxed, - cuda::std::memory_order_relaxed); - } - if (metas) metas[bias] = metas[tid]; - for (size_t j = 0; j < dim; j++) { - values[dim * bias + j] = values[dim * tid + j]; - } - //atomicExch(keys + tid, empty_key); - atomic_key = reinterpret_cast*>(keys) + tid; - atomic_key->store(empty_key, cuda::std::memory_order_relaxed); - } - } -} - -template -__global__ void gpu_select_kvm_kernel_v2(size_t n, - K* __restrict keys, - V* __restrict values, - M* __restrict metas, - K* __restrict tmp_keys, - V* __restrict tmp_values, - M* __restrict tmp_metas, - size_t* cnt, - const size_t dim) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < n) { - size_t offset = atomicAdd(cnt, 1llu); - tmp_keys[offset] = keys[tid]; - for (size_t j = 0; j < dim; j++) { - tmp_values[offset * dim + j] = values[tid * dim + j]; - } - if (metas) { - tmp_metas[offset] = metas[tid]; - } - } -} - -template -void gpu_pick_kvm_inplace(size_t grid_size, size_t block_size, const bool* masks, - bool target, size_t n, size_t* n_evicted, Tidx* offsets, - K* __restrict keys, V* __restrict values, - M* __restrict metas, size_t dim, cudaStream_t stream) { - size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; - gpu_cell_count - <<>>(masks, target, offsets, n, n_evicted); -#if THRUST_VERSION >= 101600 - auto policy = thrust::cuda::par_nosync.on(stream); -#else - auto policy = thrust::cuda::par.on(stream); -#endif - thrust::device_ptr d_src(offsets); - thrust::device_ptr d_dest(offsets); - thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); - if (target) { - gpu_select_kvm_kernel - <<>>(masks, n, offsets, - keys, values, metas, dim); - } else { - throw std::runtime_error("Not used"); - //gpu_select_kvm_kernel_reverse - // <<>>(masks, n, offsets, - // keys, values, metas, dim); - } -} - -template -size_t gpu_pick_kvm_v2(size_t grid_size, size_t block_size, - bool target, size_t n, size_t* n_evicted, - K* __restrict keys, V* __restrict values, - M* __restrict metas, size_t dim, cudaStream_t stream) { -#if THRUST_VERSION >= 101600 - auto policy = thrust::cuda::par_nosync.on(stream); -#else - auto policy = thrust::cuda::par.on(stream); -#endif - thrust::device_ptr d_src(keys); - int empty_cnt = thrust::count(policy, d_src, d_src + n, (K)EMPTY_KEY); - size_t h_cnt = n - static_cast(empty_cnt); - if (h_cnt == 0) { - return 0; - } - K* tmp_keys = nullptr; - V* tmp_values = nullptr; - M* tmp_metas = nullptr; - if (target) { - CUDA_CHECK(cudaMallocAsync(&tmp_keys, h_cnt * sizeof(K), stream)); - CUDA_CHECK(cudaMemsetAsync(tmp_keys, 0, h_cnt * sizeof(K), stream)); - CUDA_CHECK(cudaMallocAsync(&tmp_values, h_cnt * dim * sizeof(V), stream)); - CUDA_CHECK(cudaMemsetAsync(tmp_values, 0, h_cnt * dim * sizeof(V), stream)); - if (metas) { - CUDA_CHECK(cudaMallocAsync(&tmp_metas, h_cnt * sizeof(M), stream)); - 
CUDA_CHECK(cudaMemsetAsync(tmp_metas, 0, h_cnt * sizeof(M), stream)); - } - gpu_select_kvm_kernel_v2 - <<>>(n, - keys, values, metas, tmp_keys, tmp_values, tmp_metas, n_evicted, dim); - CUDA_CHECK(cudaMemcpyAsync(keys, tmp_keys, h_cnt * sizeof(K), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(values, tmp_values, h_cnt * dim * sizeof(V), cudaMemcpyDeviceToDevice, stream)); - if(metas) { - CUDA_CHECK(cudaMemcpyAsync(metas, tmp_metas, h_cnt * sizeof(M), cudaMemcpyDeviceToDevice, stream)); - } - CUDA_CHECK(cudaFreeAsync(tmp_keys, stream)); - CUDA_CHECK(cudaFreeAsync(tmp_values, stream)); - if (tmp_metas) { - CUDA_CHECK(cudaFreeAsync(tmp_metas, stream)); - } - CUDA_CHECK(cudaStreamSynchronize(stream)); - } else { - throw std::runtime_error("Not used"); - //gpu_select_kvm_kernel_reverse - // <<>>(masks, n, offsets, - // keys, values, metas, dim); - } - return h_cnt; -} - -template -void gpu_pick_kvm_inplace_wrap(const bool* masks, bool target, - size_t n, size_t* n_evicted, - K* __restrict keys, V* __restrict values, - M* __restrict metas, size_t dim, cudaStream_t stream) { - size_t block_size = 256; - size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); - size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; - int64_t* offsets = nullptr; - CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); - gpu_pick_kvm_inplace(grid_size, block_size, - masks, target, n, n_evicted, offsets, keys, values, metas, dim, stream); - CUDA_CHECK(cudaFreeAsync(offsets, stream)); -} - -template -void gpu_pick_keys(const bool* masks, bool target, size_t n, size_t* n_evicted, - const K* __restrict keys, K* __restrict outkeys, - int64_t* outoffsets, cudaStream_t stream) { - size_t block_size = 256; - size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); - size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; - int64_t* offsets = nullptr; - CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); - CUDA_CHECK(cudaMemsetAsync(offsets, 0, sizeof(int64_t) * n_offsets, stream)); - - gpu_cell_count - <<>>(masks, target, offsets, n, n_evicted); -#if THRUST_VERSION >= 101600 - auto policy = thrust::cuda::par_nosync.on(stream); -#else - auto policy = thrust::cuda::par.on(stream); -#endif - thrust::device_ptr d_src(offsets); - thrust::device_ptr d_dest(offsets); - thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); - gpu_select_key_kernel - <<>>(masks, target, n, offsets, - keys, outkeys, outoffsets); - CUDA_CHECK(cudaFreeAsync(offsets, stream)); -} - -template -void gpu_pick_kvs(const bool* masks, bool target, size_t n, size_t* n_evicted, - const K* __restrict keys, - V* __restrict values, - K* __restrict outkeys, - V* __restrict outvalues, - size_t dim, - cudaStream_t stream) { - size_t block_size = 256; - size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); - size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; - int64_t* offsets = nullptr; - CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); - CUDA_CHECK(cudaMemsetAsync(offsets, 0, sizeof(int64_t) * n_offsets, stream)); - - gpu_cell_count - <<>>(masks, target, offsets, n, n_evicted); -#if THRUST_VERSION >= 101600 - auto policy = thrust::cuda::par_nosync.on(stream); -#else - auto policy = thrust::cuda::par.on(stream); -#endif - thrust::device_ptr d_src(offsets); - thrust::device_ptr d_dest(offsets); - thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); - gpu_select_kv_kernel - <<>>(masks, target, n, offsets, - keys, values, outkeys, outvalues, dim); - 
CUDA_CHECK(cudaFreeAsync(offsets, stream)); -} -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh deleted file mode 100644 index 43504dd68..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh +++ /dev/null @@ -1,2510 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http:///www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "types.cuh" -#include "utils.cuh" - -using namespace cooperative_groups; -namespace cg = cooperative_groups; - -namespace nv { -namespace merlin { - -/* For improving performance consideration, allocating up to 64 table structures - * in constant memory is supported. To close this function, please set - * `TableOption::use_constant_memory` to `false`. - */ -constexpr int MAX_CONSTANT_TABLE = 64; -static std::mutex constant_table_mutex; -static uint64_t constant_table_flag = 0; - -__constant__ char - c_table_[sizeof(Table) * MAX_CONSTANT_TABLE]; - -template -int allocate_constant_table() { - std::lock_guard guard(constant_table_mutex); - if (constant_table_flag == std::numeric_limits::max()) return -1; - int table_index = 0; - while (constant_table_flag & (1l << table_index)) { - table_index++; - } - - constant_table_flag = constant_table_flag | (1l << table_index); - - return table_index; -} - -template -void release_constant_table(int table_index) { - std::lock_guard guard(constant_table_mutex); - if (table_index < 0 || table_index >= MAX_CONSTANT_TABLE) return; - constant_table_flag = constant_table_flag & (~(1l << table_index)); -} - -template -__global__ void create_locks(M* __restrict mutex, const size_t start, - const size_t end) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (start + tid < end) { - new (mutex + start + tid) M(); - } -} - -template -__global__ void release_locks(M* __restrict mutex, const size_t start, - const size_t end) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (start + tid < end) { - (mutex + start + tid)->~M(); - } -} - -template -__global__ void create_atomic_keys(Bucket* __restrict buckets, - const size_t start, const size_t end, - const size_t bucket_max_size) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (start + tid < end) { - for (size_t i = 0; i < bucket_max_size; i++) - new (buckets[start + tid].keys(i)) - AtomicKey{static_cast(EMPTY_KEY)}; - } -} - -template -__global__ void create_atomic_metas(Bucket* __restrict buckets, - const size_t start, const size_t end, - const size_t bucket_max_size) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (start + tid < end) { - for (size_t i = 0; i < bucket_max_size; i++) { - new (buckets[start + tid].metas(i)) - AtomicMeta{static_cast(EMPTY_META)}; - } - new (&(buckets[start + 
tid].cur_meta)) - AtomicMeta{static_cast(EMPTY_META)}; - new (&(buckets[start + tid].min_meta)) - AtomicMeta{static_cast(EMPTY_META)}; - new (&(buckets[start + tid].min_pos)) AtomicPos{1}; - } -} - -/* Initialize the buckets with index from start to end. */ -template -void initialize_buckets(Table** table, const size_t start, - const size_t end) { - /* As testing results show us, when the number of buckets is greater than - * the 4 million the performance will drop significantly, we believe the - * to many pinned memory allocation causes this issue, so we change the - * strategy to allocate some memory slices whose size is not greater than - * 64GB, and put the buckets pointer point to the slices. - */ - MERLIN_CHECK(start < end, - "initialize_buckets, start should be less than end!"); - size_t buckets_num = end - start; - const size_t total_size_of_vectors = - buckets_num * (*table)->bucket_max_size * sizeof(V) * (*table)->dim; - const size_t num_of_memory_slices = - 1 + (total_size_of_vectors - 1) / (*table)->bytes_per_slice; - size_t num_of_buckets_in_one_slice = - (*table)->bytes_per_slice / - ((*table)->bucket_max_size * sizeof(V) * (*table)->dim); - size_t num_of_allocated_buckets = 0; - - realloc_managed( - &((*table)->slices), (*table)->num_of_memory_slices * sizeof(V*), - ((*table)->num_of_memory_slices + num_of_memory_slices) * sizeof(V*)); - - for (size_t i = (*table)->num_of_memory_slices; - i < (*table)->num_of_memory_slices + num_of_memory_slices; i++) { - if (i == (*table)->num_of_memory_slices + num_of_memory_slices - 1) { - num_of_buckets_in_one_slice = buckets_num - num_of_allocated_buckets; - } - size_t slice_real_size = num_of_buckets_in_one_slice * - (*table)->bucket_max_size * sizeof(V) * - (*table)->dim; - if ((*table)->remaining_hbm_for_vectors >= slice_real_size) { - CUDA_CHECK(cudaMalloc(&((*table)->slices[i]), slice_real_size)); - (*table)->remaining_hbm_for_vectors -= slice_real_size; - } else { - (*table)->is_pure_hbm = false; - CUDA_CHECK( - cudaMallocHost(&((*table)->slices[i]), slice_real_size, - cudaHostAllocMapped | cudaHostAllocWriteCombined)); - } - for (int j = 0; j < num_of_buckets_in_one_slice; j++) { - (*table)->buckets[start + num_of_allocated_buckets + j].vectors = - (*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim; - } - num_of_allocated_buckets += num_of_buckets_in_one_slice; - } - - (*table)->num_of_memory_slices += num_of_memory_slices; - for (int i = start; i < end; i++) { - CUDA_CHECK(cudaMalloc(&((*table)->buckets[i].keys_), - (*table)->bucket_max_size * sizeof(AtomicKey))); - CUDA_CHECK(cudaMalloc(&((*table)->buckets[i].metas_), - (*table)->bucket_max_size * sizeof(AtomicMeta))); - } - - { - const size_t block_size = 512; - const size_t N = end - start + 1; - const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); - create_locks<<>>((*table)->locks, start, end); - } - - { - const size_t block_size = 512; - const size_t N = end - start + 1; - const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); - create_atomic_keys<<>>( - (*table)->buckets, start, end, (*table)->bucket_max_size); - } - - { - const size_t block_size = 512; - const size_t N = end - start + 1; - const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); - create_atomic_metas<<>>( - (*table)->buckets, start, end, (*table)->bucket_max_size); - } - CudaCheckError(); -} - -template -size_t get_slice_size(Table** table) { - const size_t min_slice_size = - (*table)->bucket_max_size * sizeof(V) * (*table)->dim; - const size_t max_table_size = (*table)->max_size * 
sizeof(V) * (*table)->dim; - size_t slice_size = 0; - - if (max_table_size >= GB(16)) { - slice_size = GB(2); - } else if (max_table_size >= GB(2)) { - slice_size = MB(128); - } else if (max_table_size >= MB(128)) { - slice_size = MB(16); - } else if (max_table_size >= MB(16)) { - slice_size = MB(1); - } else { - slice_size = min_slice_size; - } - - return std::max(min_slice_size, slice_size); -} - -/* Initialize a Table struct. - - K: The key type - V: The value type which should be static array type and C++ class - with customized construct is not supported. - M: The meta type, the meta will be used to store the timestamp - or occurrence frequency or any thing for eviction. - DIM: Vector dimension. -*/ -template -void create_table(Table** table, const size_t dim, - const size_t init_size = 134217728, - const size_t max_size = std::numeric_limits::max(), - const size_t max_hbm_for_vectors = 0, - const size_t bucket_max_size = 128, - const size_t tile_size = 32, const bool primary = true) { - CUDA_CHECK(cudaMallocManaged((void**)table, sizeof(Table))); - CUDA_CHECK(cudaMemset(*table, 0, sizeof(Table))); - (*table)->dim = dim; - (*table)->bucket_max_size = bucket_max_size; - (*table)->max_size = std::max(init_size, max_size); - (*table)->tile_size = tile_size; - (*table)->is_pure_hbm = true; - (*table)->bytes_per_slice = get_slice_size(table); - - // The bucket number will be the minimum needed for saving memory if no - // rehash. - if ((init_size * 2) > (*table)->max_size) { - (*table)->buckets_num = - 1 + (((*table)->max_size - 1) / (*table)->bucket_max_size); - } else { - (*table)->buckets_num = 1; - while ((*table)->buckets_num * (*table)->bucket_max_size < init_size) { - (*table)->buckets_num *= 2; - } - } - - (*table)->capacity = (*table)->buckets_num * (*table)->bucket_max_size; - (*table)->max_hbm_for_vectors = max_hbm_for_vectors; - (*table)->remaining_hbm_for_vectors = max_hbm_for_vectors; - (*table)->primary = primary; - - CUDA_CHECK(cudaMalloc((void**)&((*table)->locks), - (*table)->buckets_num * sizeof(Mutex))); - CUDA_CHECK( - cudaMemset((*table)->locks, 0, (*table)->buckets_num * sizeof(Mutex))); - - CUDA_CHECK(cudaMalloc((void**)&((*table)->buckets_size), - (*table)->buckets_num * sizeof(int))); - CUDA_CHECK(cudaMemset((*table)->buckets_size, 0, - (*table)->buckets_num * sizeof(int))); - - CUDA_CHECK( - cudaMallocManaged((void**)&((*table)->buckets), - (*table)->buckets_num * sizeof(Bucket))); - CUDA_CHECK(cudaMemset((*table)->buckets, 0, - (*table)->buckets_num * sizeof(Bucket))); - - initialize_buckets(table, 0, (*table)->buckets_num); - CudaCheckError(); -} - -/* Double the capacity on storage, must be followed by calling the - * rehash_kernel. */ -template -void double_capacity(Table** table) { - realloc(&((*table)->locks), (*table)->buckets_num * sizeof(Mutex), - (*table)->buckets_num * sizeof(Mutex) * 2); - realloc(&((*table)->buckets_size), (*table)->buckets_num * sizeof(int), - (*table)->buckets_num * sizeof(int) * 2); - - realloc_managed*>( - &((*table)->buckets), (*table)->buckets_num * sizeof(Bucket), - (*table)->buckets_num * sizeof(Bucket) * 2); - - initialize_buckets(table, (*table)->buckets_num, - (*table)->buckets_num * 2); - - (*table)->capacity *= 2; - (*table)->buckets_num *= 2; -} - -/* free all of the resource of a Table. 
*/ -template -void destroy_table(Table** table) { - for (int i = 0; i < (*table)->buckets_num; i++) { - CUDA_CHECK(cudaFree((*table)->buckets[i].keys_)); - CUDA_CHECK(cudaFree((*table)->buckets[i].metas_)); - } - - for (int i = 0; i < (*table)->num_of_memory_slices; i++) { - if (is_on_device((*table)->slices[i])) { - CUDA_CHECK(cudaFree((*table)->slices[i])); - } else { - CUDA_CHECK(cudaFreeHost((*table)->slices[i])); - } - } - { - const size_t block_size = 512; - const size_t N = (*table)->buckets_num; - const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); - release_locks - <<>>((*table)->locks, 0, (*table)->buckets_num); - } - CUDA_CHECK(cudaFree((*table)->slices)); - CUDA_CHECK(cudaFree((*table)->buckets_size)); - CUDA_CHECK(cudaFree((*table)->buckets)); - CUDA_CHECK(cudaFree((*table)->locks)); - CUDA_CHECK(cudaFree(*table)); - CUDA_CHECK(cudaDeviceSynchronize()); - CudaCheckError(); -} - -template -__forceinline__ __device__ void defragmentation_for_rehash( - Bucket* __restrict bucket, uint32_t remove_pos, - const size_t bucket_max_size, const size_t buckets_num, const size_t dim) { - uint32_t key_idx; - size_t global_idx = 0; - size_t start_idx = 0; - K find_key; - K hashed_key; - - uint32_t empty_pos = remove_pos; - - int i = 1; - while (i < bucket_max_size) { - key_idx = (remove_pos + i) & (bucket_max_size - 1); - find_key = (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); - if (find_key == static_cast(EMPTY_KEY)) { - break; - } - hashed_key = Murmur3HashDevice(find_key); - global_idx = hashed_key % (buckets_num * bucket_max_size); - start_idx = global_idx % bucket_max_size; - - if ((start_idx <= empty_pos && empty_pos < key_idx) || - (key_idx < start_idx && start_idx <= empty_pos) || - (empty_pos <= key_idx && key_idx < start_idx)) { - const K key = - (*(bucket->keys(key_idx))).load(cuda::std::memory_order_relaxed); - (*(bucket->keys(empty_pos))).store(key, cuda::std::memory_order_relaxed); - const M meta = - (*(bucket->metas(key_idx))).load(cuda::std::memory_order_relaxed); - (*(bucket->metas(empty_pos))) - .store(meta, cuda::std::memory_order_relaxed); - for (int j = 0; j < dim; j++) { - bucket->vectors[empty_pos * dim + j] = - bucket->vectors[key_idx * dim + j]; - } - (*(bucket->keys(key_idx))) - .store(static_cast(EMPTY_KEY), cuda::std::memory_order_relaxed); - empty_pos = key_idx; - remove_pos = key_idx; - i = 1; - } else { - i++; - } - } -} - -template -__forceinline__ __device__ void refresh_bucket_meta( - cg::thread_block_tile g, Bucket* bucket, - const size_t bucket_max_size) { - M min_val = MAX_META; - int min_pos = 0; - - for (int i = g.thread_rank(); i < bucket_max_size; i += TILE_SIZE) { - const K key = (bucket->keys(i))->load(cuda::std::memory_order_relaxed); - if (key == static_cast(EMPTY_KEY) || - key == static_cast(RECLAIM_KEY)) { - continue; - } - const M meta = bucket->metas(i)->load(cuda::std::memory_order_relaxed); - if (meta < min_val) { - min_pos = i; - min_val = meta; - } - } - M global_min_val = cg::reduce(g, min_val, cg::less()); - if (min_val == global_min_val) { - bucket->min_pos.store(min_pos, cuda::std::memory_order_relaxed); - bucket->min_meta.store(min_val, cuda::std::memory_order_relaxed); - } -} - -template -__device__ __forceinline__ void copy_vector( - cg::thread_block_tile const& g, const V* src, V* dst, - const size_t dim) { - for (auto i = g.thread_rank(); i < dim; i += g.size()) { - dst[i] = src[i]; - } - - // cuda::barrier bar; - // init(&bar, 1); - // cuda::memcpy_async(g, dst, src, dim * sizeof(V), bar); - // - // 
bar.arrive_and_wait(); -} - -/* Write the N data from src to each address in *dst by using CPU threads, - * usually called by upsert kernel. - * - * @note: In some machines with AMD CPUs, the `write_kernel` has low performance - * thru PCI-E, so we try to use the `memcpy` on CPU threads for writing work to - * reach better performance. - */ -template -void write_by_cpu(V** __restrict dst, const V* __restrict src, - const int* __restrict offset, size_t dim, int N, - int n_worker = 16) { - std::vector thds; - if (n_worker < 1) n_worker = 1; - - auto functor = [dim](V** __restrict dst, const V* __restrict src, - const int* __restrict offset, int handled_size, - int trunk_size) -> void { - for (int i = handled_size; i < handled_size + trunk_size; i++) { - if (dst[i] != nullptr) { - memcpy(dst[i], src + offset[i] * dim, sizeof(V) * dim); - } - } - }; - - int32_t trunk_size_floor = N / n_worker; - int32_t trunk_size_remain = N % n_worker; - int32_t n_worker_used = trunk_size_floor == 0 ? trunk_size_remain : n_worker; - - size_t handled_size = 0; - for (int i = 0; i < n_worker_used; i++) { - int32_t cur_trunk_size = trunk_size_floor; - if (trunk_size_remain != 0) { - cur_trunk_size += 1; - trunk_size_remain--; - } - thds.push_back( - std::thread(functor, dst, src, offset, handled_size, cur_trunk_size)); - handled_size += cur_trunk_size; - } - - for (int i = 0; i < n_worker_used; i++) { - thds[i].join(); - } -} - -template -__forceinline__ __device__ void move_key_to_new_bucket( - cg::thread_block_tile g, int rank, const K& key, const M& meta, - const V* __restrict vector, Bucket* __restrict new_bucket, - const size_t new_bkt_idx, const size_t new_start_idx, - int* __restrict buckets_size, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim) { - uint32_t key_pos; - unsigned empty_vote; - int local_size; - int src_lane; - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - size_t key_offset = - (new_start_idx + tile_offset + rank) & (bucket_max_size - 1); - const K current_key = - (*(new_bucket->keys(key_offset))).load(cuda::std::memory_order_relaxed); - empty_vote = g.ballot(current_key == static_cast(EMPTY_KEY)); - if (empty_vote) { - src_lane = __ffs(empty_vote) - 1; - key_pos = - (new_start_idx + tile_offset + src_lane) & (bucket_max_size - 1); - local_size = buckets_size[new_bkt_idx]; - if (rank == src_lane) { - new_bucket->keys(key_pos)->store(key, cuda::std::memory_order_relaxed); - new_bucket->metas(key_pos)->store(meta, - cuda::std::memory_order_relaxed); - atomicAdd(&(buckets_size[new_bkt_idx]), 1); - } - local_size = g.shfl(local_size, src_lane); - if (local_size >= bucket_max_size) { - refresh_bucket_meta(g, new_bucket, bucket_max_size); - } - copy_vector(g, vector, new_bucket->vectors + key_pos * dim, - dim); - break; - } - } -} - -template -__global__ void rehash_kernel_for_fast_mode( - const Table* __restrict table, size_t N) { - Bucket* buckets = table->buckets; - int* __restrict buckets_size = table->buckets_size; - const size_t bucket_max_size = table->bucket_max_size; - const size_t buckets_num = table->buckets_num; - const size_t dim = table->dim; - - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - size_t global_idx; - uint32_t start_idx = 0; - K target_key = 0; - M target_meta = 0; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - uint32_t bkt_idx = t / TILE_SIZE; - Bucket* bucket = (buckets + bkt_idx); 
- - lock(g, table->locks[bkt_idx]); - uint32_t key_idx = 0; - while (key_idx < bucket_max_size) { - key_idx = g.shfl(key_idx, 0); - target_key = - (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); - target_meta = - bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed); - if (target_key != static_cast(EMPTY_KEY) && - target_key != static_cast(RECLAIM_KEY)) { - K hashed_key = Murmur3HashDevice(target_key); - global_idx = hashed_key % (buckets_num * bucket_max_size); - uint32_t new_bkt_idx = global_idx / bucket_max_size; - if (new_bkt_idx != bkt_idx) { - start_idx = global_idx % bucket_max_size; - move_key_to_new_bucket( - g, rank, target_key, target_meta, - (bucket->vectors + key_idx * dim), buckets + new_bkt_idx, - new_bkt_idx, start_idx, buckets_size, bucket_max_size, - buckets_num, table->dim); - if (rank == 0) { - (bucket->keys(key_idx)) - ->store(static_cast(EMPTY_KEY), - cuda::std::memory_order_relaxed); - atomicSub(&(buckets_size[bkt_idx]), 1); - defragmentation_for_rehash( - bucket, key_idx, bucket_max_size, buckets_num / 2, dim); - key_idx = 0; - } - } else { - key_idx++; - } - } else { - key_idx++; - } - } - unlock(g, table->locks[bkt_idx]); - } -} - -/* Write the N data from src to each address in *dst, - usually called by upsert kernel. - - `src`: A continuous memory pointer with Vector - which can be HBM. - `dst`: A pointer of pointer to V which should be on HBM, - but each value (a pointer of V) could point to a - memory on HBM or HMEM. - `N`: Number of vectors that need to be written. -*/ -template -__global__ void write_kernel(const V* __restrict src, V** __restrict dst, - const int* __restrict src_offset, const size_t dim, - const size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int dim_index = t % dim; - - if (dst[vec_index] != nullptr) { - if (src_offset != nullptr) { - dst[vec_index][dim_index] = - src[src_offset[vec_index] * dim + dim_index]; - } else { - dst[vec_index][dim_index] = src[vec_index * dim + dim_index]; - } - } - } -} - -/* Write the values of delta_or_val into the table. If the key[i] is already in - the table indicted be @exists[i], a @delta_or_val[i] will be added to the the - existing value. if the key not exists, the value @val_or_delta[i] will be - assigned to the address @dst[i]. - - `delta_or_val`: will be treated as val and accumlating should be executed. - `dst`: A pointer of pointer to V which should be on HBM, - but each value (a pointer of V) could point to a - memory on HBM or HMEM. - `existed`: If the keys existed before this kernel is executed. - `status`: The existence status for each key when the kernel is being - executed. - - `N`: number of vectors needed to be writen. 
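A small host-side sketch (not from this diff) of the property `double_capacity()` and `rehash_kernel_for_fast_mode` rely on: because a slot index is `hash % (buckets_num * bucket_max_size)`, doubling `buckets_num` sends every key either to its old bucket or to `old_bucket + old_buckets_num`, never anywhere else. `std::hash` stands in for `Murmur3HashDevice` here and the sizes are arbitrary.

```cpp
// Illustrative sketch only: verifies the bucket-index invariant used by the
// rehash kernel above. std::hash is a stand-in for Murmur3HashDevice.
#include <cassert>
#include <cstdint>
#include <functional>

int main() {
  const size_t bucket_max_size = 128;
  const size_t old_buckets_num = 1024;            // before double_capacity()
  const size_t new_buckets_num = old_buckets_num * 2;

  for (uint64_t key = 0; key < 100000; ++key) {
    const uint64_t h = std::hash<uint64_t>{}(key);
    const size_t old_bkt =
        (h % (old_buckets_num * bucket_max_size)) / bucket_max_size;
    const size_t new_bkt =
        (h % (new_buckets_num * bucket_max_size)) / bucket_max_size;
    // A key either stays put or moves exactly old_buckets_num buckets ahead,
    // which is why the kernel only needs to test new_bkt_idx != bkt_idx.
    assert(new_bkt == old_bkt || new_bkt == old_bkt + old_buckets_num);
  }
}
```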
-*/ -template -__global__ void write_with_accum_kernel(const V* __restrict delta_or_val, - V** __restrict dst, - const bool* __restrict existed, - const bool* __restrict status, - const int* __restrict src_offset, - const size_t dim, size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int dim_index = t % dim; - - if (dst[vec_index] != nullptr && - existed[src_offset[vec_index]] == status[src_offset[vec_index]]) { - if (status[src_offset[vec_index]]) { - dst[vec_index][dim_index] += - delta_or_val[src_offset[vec_index] * dim + dim_index]; - } else { - dst[vec_index][dim_index] = - delta_or_val[src_offset[vec_index] * dim + dim_index]; - } - } - } -} - -/* Add a @delta[i] to the the value saved in the address @dst[i]. - - `delta`: a delta value which should be add to. - `dst`: A pointer of pointer to V which should be on HBM, - but each value (a pointer of V) could point to a - memory on HBM or HMEM. - `N`: number of vectors needed to be writen. -*/ -template -__global__ void write_with_accum_kernel(const V* __restrict delta, - V** __restrict dst, - const int* __restrict src_offset, - const size_t dim, size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int dim_index = t % dim; - - if (dst[vec_index] != nullptr) { - dst[vec_index][dim_index] += - delta[src_offset[vec_index] * dim + dim_index]; - } - } -} - -/* Read the N data from src to each address in *dst, - usually called by upsert kernel. - - `src`: A pointer of pointer of V which should be on HBM, - but each value (a pointer of V) could point to a - memory on HBM or HMEM. - `dst`: A continue memory pointer with Vector - which should be HBM. - `mask`: One for each `dst`. If true, reading from src, - or false reading from default_val. - `default_val`: Default value with shape (1, DIM) or (N, DIM) - `N`: The number of vectors needed to be read. - 'full_size_default': - If true, the d_def_val will be treated as - a full size default value which shape must be (N, DIM). -*/ -template -__global__ void read_kernel(const V* const* __restrict src, V* __restrict dst, - const bool* mask, const int* __restrict dst_offset, - const size_t dim, size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int dim_index = t % dim; - int real_dst_offset = - dst_offset != nullptr ? dst_offset[vec_index] : vec_index; - - /// Copy selected values and fill in default value for all others. - if (mask[real_dst_offset] && src[vec_index] != nullptr) { - dst[real_dst_offset * dim + dim_index] = src[vec_index][dim_index]; - } - } -} - -/* Read the N data from src to each address in *dst, - * usually called by upsert kernel. - * - * `src`: A pointer of pointer of V which should be on HBM, - * but each value (a pointer of V) could point to a - * memory on HBM or HMEM. - * `dst`: A continue memory pointer with Vector - * which should be HBM. - * `N`: Number of vectors needed to be read. - */ -template -__global__ void read_kernel(const V** __restrict src, V* __restrict dst, - const int* __restrict dst_offset, const size_t dim, - const size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int real_dst_offset = - dst_offset != nullptr ? 
dst_offset[vec_index] : vec_index; - int dim_index = t % dim; - if (src[vec_index] != nullptr) { - dst[real_dst_offset * dim + dim_index] = src[vec_index * dim + dim_index]; - } - } -} - -template -__device__ __forceinline__ unsigned find_in_bucket( - cg::thread_block_tile g, Bucket* bucket, - const K& find_key, uint32_t& tile_offset, const uint32_t& start_idx, - const size_t& bucket_max_size) { - uint32_t key_pos = 0; - -#pragma unroll - for (tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = - (start_idx + tile_offset + g.thread_rank()) & (bucket_max_size - 1); - auto const current_key = - bucket->keys(key_pos)->load(cuda::std::memory_order_relaxed); - auto const found_vote = g.ballot(find_key == current_key); - if (found_vote) { - return found_vote; - } - - if (g.any(current_key == static_cast(EMPTY_KEY))) { - return 0; - } - } - return 0; -} - -template -__device__ __forceinline__ OccupyResult find_without_lock( - cg::thread_block_tile g, Bucket* __restrict__ bucket, - const K desired_key, const size_t start_idx, int& key_pos, int& src_lane, - const size_t bucket_max_size) { - K expected_key = static_cast(EMPTY_KEY); - - AtomicKey* current_key; - - unsigned vote = 0; - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - current_key = bucket->keys(key_pos); - - expected_key = current_key->load(cuda::std::memory_order_relaxed); - vote = g.ballot(desired_key == expected_key); - if (vote) { - src_lane = __ffs(vote) - 1; - key_pos = g.shfl(key_pos, src_lane); - return OccupyResult::DUPLICATE; - } - vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); - if (vote) break; - } - return OccupyResult::CONTINUE; -} - -template -__device__ __inline__ OccupyResult find_and_lock_when_vacant( - cg::thread_block_tile g, Bucket* __restrict__ bucket, - const K desired_key, const M desired_meta, K& evicted_key, - const size_t start_idx, int& key_pos, int& src_lane, - const size_t bucket_max_size) { - K expected_key = static_cast(EMPTY_KEY); - - AtomicKey* current_key; - AtomicMeta* current_meta; - - K local_min_meta_key = static_cast(EMPTY_KEY); - - M local_min_meta_val = MAX_META; - M temp_min_meta_val = MAX_META; - int local_min_meta_pos = -1; - - unsigned vote = 0; - bool result = false; - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - current_key = bucket->keys(key_pos); - - // Step 1: try find and lock the desired_key. - do { - expected_key = desired_key; - result = current_key->compare_exchange_strong( - expected_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - vote = g.ballot(result); - if (vote) { - src_lane = __ffs(vote) - 1; - key_pos = g.shfl(key_pos, src_lane); - return OccupyResult::DUPLICATE; - } - vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); - if (vote) break; - vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); - } while (vote != 0); - - // Step 2: (TBD)try find empty location. 
- while (vote) { - src_lane = __ffs(vote) - 1; - if (src_lane == g.thread_rank()) { - expected_key = static_cast(EMPTY_KEY); - result = current_key->compare_exchange_strong( - expected_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - } - result = g.shfl(result, src_lane); - if (result) { - key_pos = g.shfl(key_pos, src_lane); - return OccupyResult::OCCUPIED_EMPTY; - } - vote -= ((unsigned(0x1)) << src_lane); - } - } - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - current_meta = bucket->metas(key_pos); - - // Step 4: record min meta location. - temp_min_meta_val = current_meta->load(cuda::std::memory_order_relaxed); - if (temp_min_meta_val < local_min_meta_val) { - expected_key = - bucket->keys(key_pos)->load(cuda::std::memory_order_relaxed); - if (expected_key != static_cast(LOCKED_KEY) && - expected_key != static_cast(EMPTY_KEY)) { - local_min_meta_key = expected_key; - local_min_meta_val = temp_min_meta_val; - local_min_meta_pos = key_pos; - } - } - } - // Step 5: insert by evicting some one. - const M global_min_meta_val = - cg::reduce(g, local_min_meta_val, cg::less()); - if (desired_meta < global_min_meta_val) { - return OccupyResult::REFUSED; - } - vote = g.ballot(local_min_meta_val <= global_min_meta_val); - if (vote) { - src_lane = __ffs(vote) - 1; - result = false; - if (src_lane == g.thread_rank()) { - // TBD: Here can be compare_exchange_weak. Do benchmark. - current_key = bucket->keys(local_min_meta_pos); - current_meta = bucket->metas(local_min_meta_pos); - evicted_key = local_min_meta_key; - result = current_key->compare_exchange_strong( - local_min_meta_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - - // Need to recover when fail. - if (result && (current_meta->load(cuda::std::memory_order_relaxed) > - global_min_meta_val)) { - current_key->store(local_min_meta_key, cuda::std::memory_order_relaxed); - result = false; - } - } - result = g.shfl(result, src_lane); - if (result) { - // Not every `evicted_key` is correct expect the `src_lane` thread. - key_pos = g.shfl(local_min_meta_pos, src_lane); - return (evicted_key == static_cast(RECLAIM_KEY)) - ? OccupyResult::OCCUPIED_RECLAIMED - : OccupyResult::EVICT; - } - } - return OccupyResult::CONTINUE; -} - -template -__device__ __forceinline__ OccupyResult find_and_lock_when_full( - cg::thread_block_tile g, Bucket* __restrict__ bucket, - const K desired_key, const M desired_meta, K& evicted_key, - const size_t start_idx, int& key_pos, int& src_lane, - const size_t bucket_max_size) { - K expected_key = static_cast(EMPTY_KEY); - - AtomicKey* current_key; - AtomicMeta* current_meta; - - K local_min_meta_key = static_cast(EMPTY_KEY); - - M local_min_meta_val = MAX_META; - M temp_min_meta_val = MAX_META; - int local_min_meta_pos = -1; - - unsigned vote = 0; - bool result = false; - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - current_key = bucket->keys(key_pos); - - // Step 1: try find and lock the desired_key. 
- do { - expected_key = desired_key; - result = current_key->compare_exchange_strong( - expected_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - vote = g.ballot(result); - if (vote) { - src_lane = __ffs(vote) - 1; - key_pos = g.shfl(key_pos, src_lane); - return OccupyResult::DUPLICATE; - } - vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); - } while (vote != 0); - } - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - // Step 2: record min meta location. - temp_min_meta_val = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - if (temp_min_meta_val < local_min_meta_val) { - while ((expected_key = bucket->keys(key_pos)->load( - cuda::std::memory_order_relaxed)) == - static_cast(LOCKED_KEY)) - ; - local_min_meta_key = expected_key; - local_min_meta_val = temp_min_meta_val; - local_min_meta_pos = key_pos; - } - } - - // Step 3: insert by evicting some one. - const M global_min_meta_val = - cg::reduce(g, local_min_meta_val, cg::less()); - if (desired_meta < global_min_meta_val) { - return OccupyResult::REFUSED; - } - vote = g.ballot(local_min_meta_val <= global_min_meta_val); - if (vote) { - src_lane = __ffs(vote) - 1; - result = false; - if (src_lane == g.thread_rank()) { - // TBD: Here can be compare_exchange_weak. Do benchmark. - current_key = bucket->keys(local_min_meta_pos); - current_meta = bucket->metas(local_min_meta_pos); - evicted_key = local_min_meta_key; - result = current_key->compare_exchange_strong( - local_min_meta_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - - // Need to recover when fail. - if (result && (current_meta->load(cuda::std::memory_order_relaxed) > - global_min_meta_val)) { - current_key->store(local_min_meta_key, cuda::std::memory_order_relaxed); - result = false; - } - } - result = g.shfl(result, src_lane); - if (result) { - // Not every `evicted_key` is correct expect the `src_lane` thread. - key_pos = g.shfl(local_min_meta_pos, src_lane); - return (evicted_key == static_cast(RECLAIM_KEY)) - ? OccupyResult::OCCUPIED_RECLAIMED - : OccupyResult::EVICT; - } - } - return OccupyResult::CONTINUE; -} - -template -__device__ __forceinline__ OccupyResult find_and_lock_for_update( - cg::thread_block_tile g, Bucket* __restrict__ bucket, - const K desired_key, const size_t start_idx, int& key_pos, int& src_lane, - const size_t bucket_max_size) { - K expected_key = static_cast(EMPTY_KEY); - - AtomicKey* current_key; - - unsigned vote = 0; - bool result = false; - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - current_key = bucket->keys(key_pos); - - // Step 1: try find and lock the desired_key. 
- do { - expected_key = desired_key; - result = current_key->compare_exchange_strong( - expected_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - vote = g.ballot(result); - if (vote) { - src_lane = __ffs(vote) - 1; - key_pos = g.shfl(key_pos, src_lane); - return OccupyResult::DUPLICATE; - } - vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); - if (vote) return OccupyResult::REFUSED; - vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); - } while (vote != 0); - } - return OccupyResult::REFUSED; -} - -template -__forceinline__ __device__ Bucket* get_key_position( - Bucket* __restrict buckets, const K key, size_t& bkt_idx, - size_t& start_idx, const size_t buckets_num, const size_t bucket_max_size) { - const uint32_t hashed_key = Murmur3HashDevice(key); - const size_t global_idx = hashed_key % (buckets_num * bucket_max_size); - bkt_idx = global_idx / bucket_max_size; - start_idx = global_idx % bucket_max_size; - return buckets + bkt_idx; -} - -template -__forceinline__ __device__ void update_meta(Bucket* __restrict bucket, - const int key_pos, - const M* __restrict metas, - const int key_idx) { - if (metas == nullptr) { - M cur_meta = - bucket->cur_meta.fetch_add(1, cuda::std::memory_order_relaxed) + 1; - bucket->metas(key_pos)->store(cur_meta, cuda::std::memory_order_relaxed); - } else { - bucket->metas(key_pos)->store(metas[key_idx], - cuda::std::memory_order_relaxed); - } - return; -} - -template -__global__ void upsert_kernel_with_io_core( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - const V* __restrict values, const M* __restrict metas, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K insert_key = keys[key_idx]; - - if (IS_RESERVED_KEY(insert_key)) continue; - - const M insert_meta = - metas != nullptr ? 
metas[key_idx] : static_cast(MAX_META); - const V* insert_value = values + key_idx * dim; - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, insert_key, bkt_idx, start_idx, - buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_full( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } - - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - copy_vector(g, insert_value, bucket->vectors + key_pos * dim, - dim); - if (g.thread_rank() == src_lane) { - update_meta(bucket, key_pos, metas, key_idx); - (bucket->keys(key_pos)) - ->store(insert_key, cuda::std::memory_order_relaxed); - } - } -} - -template -__global__ void upsert_and_evict_kernel_with_io_core( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - const V* __restrict values, const M* __restrict metas, - K* __restrict evicted_keys, V* __restrict evicted_values, - M* __restrict evicted_metas, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - const size_t key_idx = t / TILE_SIZE; - - const K insert_key = keys[key_idx]; - - if (IS_RESERVED_KEY(insert_key)) continue; - - const M insert_meta = - metas != nullptr ? 
metas[key_idx] : static_cast(MAX_META); - const V* insert_value = values + key_idx * dim; - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, insert_key, bkt_idx, start_idx, - buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_full( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) { - copy_vector(g, insert_value, evicted_values + key_idx * dim, - dim); - continue; - } - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - if (occupy_result == OccupyResult::EVICT) { - if (g.thread_rank() == src_lane) { - evicted_keys[key_idx] = evicted_key; - } - if (metas != nullptr) { - evicted_metas[key_idx] = metas[key_idx]; - } - copy_vector(g, bucket->vectors + key_pos * dim, - evicted_values + key_idx * dim, dim); - } - - copy_vector(g, insert_value, bucket->vectors + key_pos * dim, - dim); - if (g.thread_rank() == src_lane) { - update_meta(bucket, key_pos, metas, key_idx); - (bucket->keys(key_pos)) - ->store(insert_key, cuda::std::memory_order_relaxed); - } - } -} - -template -struct SelectUpsertKernelWithIO { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* __restrict keys, - const V* __restrict values, - const M* __restrict metas) { - if (load_factor <= 0.5) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - upsert_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - - } else if (load_factor <= 0.875) { - const unsigned int tile_size = 8; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - upsert_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } else { - const unsigned int tile_size = 32; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - upsert_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } - return; - } -}; - -template -struct SelectUpsertAndEvictKernelWithIO { - static void execute_kernel( - const float& load_factor, const int& block_size, - const size_t bucket_max_size, const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, const K* __restrict keys, - const V* __restrict values, const M* __restrict metas, - K* __restrict evicted_keys, V* __restrict evicted_values, - M* __restrict evicted_metas) { - if (load_factor <= 0.5) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = 
SAFE_GET_GRID_SIZE(N, block_size); - upsert_and_evict_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, - evicted_keys, evicted_values, evicted_metas, N); - - } else if (load_factor <= 0.875) { - const unsigned int tile_size = 8; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - upsert_and_evict_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, - evicted_keys, evicted_values, evicted_metas, N); - - } else { - const unsigned int tile_size = 32; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - upsert_and_evict_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, - evicted_keys, evicted_values, evicted_metas, N); - } - return; - } -}; - -/* Upsert with the end-user specified meta. - */ -template -__global__ void upsert_kernel(const Table* __restrict table, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - const K* __restrict keys, V** __restrict vectors, - const M* __restrict metas, - int* __restrict src_offset, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K insert_key = keys[key_idx]; - if (IS_RESERVED_KEY(insert_key)) continue; - - const M insert_meta = - metas != nullptr ? metas[key_idx] : static_cast(MAX_META); - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, insert_key, bkt_idx, start_idx, - buckets_num, bucket_max_size); - - if (src_offset != nullptr && g.thread_rank() == 0) { - *(src_offset + key_idx) = key_idx; - } - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_vacant( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } - - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - if (g.thread_rank() == src_lane) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - update_meta(bucket, key_pos, metas, key_idx); - (bucket->keys(key_pos)) - ->store(insert_key, cuda::std::memory_order_relaxed); - } - } -} - -/* Accum kernel with customized metas. 
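The `SelectUpsertKernelWithIO` / `SelectUpsertAndEvictKernelWithIO` helpers above pick a cooperative-group tile size from the current load factor before launching. A condensed host-side sketch of that dispatch policy; the struct and function names are mine, and `SAFE_GET_GRID_SIZE` is assumed to be essentially a guarded ceiling division.

```cpp
// Illustrative sketch only: the load-factor -> tile-size policy used by the
// Select*KernelWithIO helpers above.
#include <cstddef>

struct LaunchPlan {
  unsigned tile_size;  // lanes cooperating on one key
  size_t grid_size;    // blocks to launch
  size_t n_threads;    // N = n * tile_size, the value passed to the kernel
};

inline LaunchPlan plan_upsert(float load_factor, size_t n, int block_size) {
  // Sparse tables get small tiles (more keys in flight per block); near-full
  // tables scan whole buckets, so a full 32-lane tile is used.
  unsigned tile =
      (load_factor <= 0.5f) ? 4u : (load_factor <= 0.875f) ? 8u : 32u;
  size_t N = n * tile;
  size_t grid = (N + block_size - 1) / block_size;  // assumed SAFE_GET_GRID_SIZE
  return {tile, grid, N};
}
```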
- */ -template -__global__ void accum_kernel( - const Table* __restrict table, const K* __restrict keys, - V** __restrict vectors, const M* __restrict metas, - const bool* __restrict existed, Bucket* __restrict buckets, - int* __restrict buckets_size, const size_t bucket_max_size, - const size_t buckets_num, int* __restrict src_offset, - bool* __restrict status, size_t N) { - const size_t dim = table->dim; - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int key_pos = -1; - int local_size = 0; - bool local_found = false; - unsigned found_or_empty_vote = 0; - - size_t key_idx = t / TILE_SIZE; - K insert_key = *(keys + key_idx); - - if (IS_RESERVED_KEY(insert_key)) continue; - - K hashed_key = Murmur3HashDevice(insert_key); - size_t global_idx = hashed_key % (buckets_num * bucket_max_size); - size_t bkt_idx = global_idx / bucket_max_size; - size_t start_idx = global_idx % bucket_max_size; - - int src_lane = -1; - - Bucket* bucket = buckets + bkt_idx; - lock(g, table->locks[bkt_idx]); - if (rank == 0 && src_offset != nullptr) { - *(src_offset + key_idx) = key_idx; - } - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - size_t key_offset = - (start_idx + tile_offset + rank) & (bucket_max_size - 1); - K current_key = - bucket->keys(key_offset)->load(cuda::std::memory_order_relaxed); - found_or_empty_vote = g.ballot(current_key == static_cast(EMPTY_KEY) || - insert_key == current_key); - if (found_or_empty_vote) { - src_lane = __ffs(found_or_empty_vote) - 1; - key_pos = (start_idx + tile_offset + src_lane) & (bucket_max_size - 1); - local_size = buckets_size[bkt_idx]; - if (rank == src_lane) { - if (current_key == insert_key) { - local_found = true; - *(status + key_idx) = local_found; - } - if (local_found == existed[key_idx]) { - (bucket->keys(key_pos)) - ->store(insert_key, cuda::std::memory_order_relaxed); - if (!local_found) { - buckets_size[bkt_idx]++; - local_size++; - } - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - update_meta(bucket, key_pos, metas, key_idx); - } - } - local_size = g.shfl(local_size, src_lane); - if (local_size >= bucket_max_size) { - refresh_bucket_meta(g, bucket, bucket_max_size); - } - break; - } - } - if (!found_or_empty_vote) { - if (rank == (bucket->min_pos % TILE_SIZE)) { - key_pos = bucket->min_pos; - (bucket->keys(key_pos)) - ->store(insert_key, cuda::std::memory_order_relaxed); - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - update_meta(bucket, key_pos, metas, key_idx); - } - refresh_bucket_meta(g, bucket, bucket_max_size); - } - unlock(g, table->locks[bkt_idx]); - } -} - -/* lookup with IO operation. This kernel is - * usually used for the pure HBM mode for better performance. 
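The `existed`/`status` pair recorded by `accum_kernel` above is later consumed by `write_with_accum_kernel`: a value is only touched when the key's actual presence matches the caller's expectation, and it is either accumulated or assigned depending on that presence. A small host-side emulation of that gating (dim collapsed to 1, arrays and numbers hypothetical):

```cpp
// Illustrative sketch only: host emulation of the existed/status gating used
// by accum_kernel and write_with_accum_kernel above.
#include <cstdio>

int main() {
  const int n = 3;
  bool  existed[n] = {true,  false, true };  // caller's expectation per key
  bool  status[n]  = {true,  false, false};  // what the table actually found
  float table[n]   = {10.f,  0.f,   7.f  };  // value slot behind each key
  float delta[n]   = {1.f,   5.f,   9.f  };

  for (int i = 0; i < n; ++i) {
    if (existed[i] != status[i]) continue;   // expectation mismatch: skip key
    if (status[i]) table[i] += delta[i];     // key existed: accumulate delta
    else           table[i]  = delta[i];     // key was new: assign the value
  }
  std::printf("%g %g %g\n", table[0], table[1], table[2]);  // 11 5 7
}
```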
- */ -template -__global__ void lookup_kernel_with_io(const Table* __restrict table, - const size_t bucket_max_size, - const size_t buckets_num, - const size_t dim, - const K* __restrict keys, - V* __restrict values, M* __restrict metas, - bool* __restrict found, size_t N) { - int* buckets_size = table->buckets_size; - Bucket* buckets = table->buckets; - - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_idx = t / TILE_SIZE; - - const K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; - - V* find_value = values + key_idx * dim; - - int key_pos = -1; - int src_lane = -1; - size_t bkt_idx = 0; - size_t start_idx = 0; - - Bucket* bucket = get_key_position( - buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); - - const int bucket_size = buckets_size[bkt_idx]; - if (bucket_size >= bucket_max_size) { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - } - - OccupyResult occupy_result{OccupyResult::INITIAL}; - occupy_result = find_without_lock( - g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); - - if (occupy_result == OccupyResult::DUPLICATE) { - copy_vector(g, bucket->vectors + key_pos * dim, find_value, - dim); - if (rank == src_lane) { - if (metas != nullptr) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - if (found != nullptr) { - *(found + key_idx) = true; - } - } - } - } -} - -template -struct SelectLookupKernelWithIO { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* __restrict keys, V* __restrict values, - M* __restrict metas, bool* __restrict found) { - if (load_factor <= 0.75) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - lookup_kernel_with_io - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } else { - const unsigned int tile_size = 16; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - lookup_kernel_with_io - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } - return; - } -}; - -/* lookup kernel. 
- */ -template -__global__ void lookup_kernel(const Table* __restrict table, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - const K* __restrict keys, V** __restrict values, - M* __restrict metas, bool* __restrict found, - int* __restrict dst_offset, size_t N) { - int* buckets_size = table->buckets_size; - Bucket* buckets = table->buckets; - - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_idx = t / TILE_SIZE; - - const K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; - - int key_pos = -1; - int src_lane = -1; - size_t bkt_idx = 0; - size_t start_idx = 0; - - Bucket* bucket = get_key_position( - buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); - - const int bucket_size = buckets_size[bkt_idx]; - if (bucket_size >= bucket_max_size) { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - } - - if (dst_offset != nullptr && rank == 0) { - *(dst_offset + key_idx) = key_idx; - } - - OccupyResult occupy_result{OccupyResult::INITIAL}; - occupy_result = find_without_lock( - g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); - - if (occupy_result == OccupyResult::DUPLICATE) { - if (rank == src_lane) { - *(values + key_idx) = (bucket->vectors + key_pos * dim); - if (metas != nullptr) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - if (found != nullptr) { - *(found + key_idx) = true; - } - } - } else { - if (rank == 0) { - *(values + key_idx) = nullptr; - } - } - } -} - -/* lookup with IO operation. This kernel is - * usually used for the pure HBM mode for better performance. - */ -template -__global__ void lookup_ptr_kernel(const Table* __restrict table, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - const K* __restrict keys, - V** __restrict values, M* __restrict metas, - bool* __restrict found, size_t N) { - int* buckets_size = table->buckets_size; - Bucket* buckets = table->buckets; - - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_idx = t / TILE_SIZE; - - const K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; - - int key_pos = -1; - int src_lane = -1; - size_t bkt_idx = 0; - size_t start_idx = 0; - - Bucket* bucket = get_key_position( - buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); - - const int bucket_size = buckets_size[bkt_idx]; - if (bucket_size >= bucket_max_size) { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - } - - OccupyResult occupy_result{OccupyResult::INITIAL}; - occupy_result = find_without_lock( - g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); - - if (occupy_result == OccupyResult::DUPLICATE) { - if (rank == src_lane) { - values[key_idx] = bucket->vectors + key_pos * dim; - if (metas != nullptr) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - if (found != nullptr) { - *(found + key_idx) = true; - } - } - } - } -} - -template -struct SelectLookupPtrKernel { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* 
__restrict keys, V** __restrict values, - M* __restrict metas, bool* __restrict found) { - if (load_factor <= 0.75) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - lookup_ptr_kernel - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } else { - const unsigned int tile_size = 16; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - lookup_ptr_kernel - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } - return; - } -}; - -/* Clear all key-value in the table. */ -template -__global__ void clear_kernel(Table* __restrict table, size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - const size_t bucket_max_size = table->bucket_max_size; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int key_idx = t % bucket_max_size; - int bkt_idx = t / bucket_max_size; - Bucket* bucket = &(table->buckets[bkt_idx]); - - (bucket->keys(key_idx)) - ->store(static_cast(EMPTY_KEY), cuda::std::memory_order_relaxed); - if (key_idx == 0) { - table->buckets_size[bkt_idx] = 0; - } - } -} - -/* Remove specified keys. */ -template -__global__ void remove_kernel(const Table* __restrict table, - const K* __restrict keys, - Bucket* __restrict buckets, - int* __restrict buckets_size, - const size_t bucket_max_size, - const size_t buckets_num, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_idx = t / TILE_SIZE; - K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; - - int key_pos = -1; - - size_t bkt_idx = 0; - size_t start_idx = 0; - uint32_t tile_offset = 0; - - Bucket* bucket = get_key_position( - buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); - - unsigned found_vote = 0; -#pragma unroll - for (tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + rank) & (bucket_max_size - 1); - - const K current_key = - (bucket->keys(key_pos))->load(cuda::std::memory_order_relaxed); - - found_vote = g.ballot(find_key == current_key); - if (found_vote) { - break; - } - - if (g.any(current_key == static_cast(EMPTY_KEY))) { - break; - } - } - - if (found_vote) { - const int src_lane = __ffs(found_vote) - 1; - - if (g.thread_rank() == src_lane) { - const int key_pos = - (start_idx + tile_offset + src_lane) & (bucket_max_size - 1); - (bucket->keys(key_pos)) - ->store(static_cast(RECLAIM_KEY), - cuda::std::memory_order_relaxed); - (bucket->metas(key_pos)) - ->store(static_cast(EMPTY_META), - cuda::std::memory_order_relaxed); - atomicSub(&buckets_size[bkt_idx], 1); - } - break; - } - } -} - -/* Remove specified keys which match the Predict. 
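The remove overload that follows erases keys matching a caller-supplied predicate; its call site, `pred(current_key, current_meta, pattern, threshold)`, suggests `EraseIfPredictInternal` is a device predicate over (key, meta, pattern, threshold). A hedged sketch of one possible predicate, evicting entries whose meta (e.g., a timestamp or frequency) falls below the threshold; the exact typedef is not shown in this hunk.

```cpp
// Illustrative sketch only: one possible predicate matching the call shape
// pred(key, meta, pattern, threshold) used by the erase-if kernel below.
template <class K, class M>
__device__ __forceinline__ bool erase_if_stale(const K& /*key*/, const M& meta,
                                               const K& /*pattern*/,
                                               const M& threshold) {
  // Treat meta as a timestamp/frequency: drop entries below the threshold,
  // regardless of the key pattern.
  return meta < threshold;
}
```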
*/ -template -__global__ void remove_kernel(const Table* __restrict table, - const EraseIfPredictInternal pred, - const K pattern, const M threshold, - size_t* __restrict count, - Bucket* __restrict buckets, - int* __restrict buckets_size, - const size_t bucket_max_size, - const size_t buckets_num, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - uint32_t bkt_idx = t; - uint32_t key_pos = 0; - - Bucket* bucket = buckets + bkt_idx; - - K current_key = 0; - M current_meta = 0; - uint32_t key_offset = 0; - while (key_offset < bucket_max_size) { - current_key = - bucket->keys(key_offset)->load(cuda::std::memory_order_relaxed); - current_meta = - bucket->metas(key_offset)->load(cuda::std::memory_order_relaxed); - if (!IS_RESERVED_KEY(current_key)) { - if (pred(current_key, current_meta, pattern, threshold)) { - atomicAdd(count, 1); - key_pos = key_offset; - (bucket->keys(key_pos)) - ->store(static_cast(RECLAIM_KEY), - cuda::std::memory_order_relaxed); - (bucket->metas(key_pos)) - ->store(static_cast(EMPTY_META), - cuda::std::memory_order_relaxed); - atomicSub(&buckets_size[bkt_idx], 1); - } else { - key_offset++; - } - } else { - key_offset++; - } - } - } -} - -/* Dump with meta. */ -template -inline std::tuple dump_kernel_shared_memory_size( - const size_t available_shared_memory) { - const size_t block_size{std::min( - available_shared_memory / 2 / sizeof(KVM), UINT64_C(1024))}; - MERLIN_CHECK( - block_size > 0, - "[HierarchicalKV] block_size <= 0, the K-V-M size may be too large!"); - - return {block_size * sizeof(KVM), block_size}; -} - -template -__global__ void dump_kernel(const Table* __restrict table, K* d_key, - V* __restrict d_val, M* __restrict d_meta, - const size_t offset, const size_t search_length, - size_t* d_dump_counter) { - extern __shared__ unsigned char s[]; - KVM* const block_tuples{reinterpret_cast*>(s)}; - - const size_t bucket_max_size{table->bucket_max_size}; - const size_t dim{table->dim}; - - __shared__ size_t block_acc; - __shared__ size_t global_acc; - - const size_t tid{blockIdx.x * blockDim.x + threadIdx.x}; - - if (threadIdx.x == 0) { - block_acc = 0; - } - __syncthreads(); - - if (tid < search_length) { - Bucket* const bucket{ - &table->buckets[(tid + offset) / bucket_max_size]}; - - const int key_idx{static_cast((tid + offset) % bucket_max_size)}; - const K key{(bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed)}; - - if (!IS_RESERVED_KEY(key)) { - size_t local_index{atomicAdd(&block_acc, 1)}; - block_tuples[local_index] = { - key, &bucket->vectors[key_idx * dim], - bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed)}; - } - } - __syncthreads(); - - if (threadIdx.x == 0) { - global_acc = atomicAdd(d_dump_counter, block_acc); - } - __syncthreads(); - - if (threadIdx.x < block_acc) { - const KVM& tuple{block_tuples[threadIdx.x]}; - - const size_t j{global_acc + threadIdx.x}; - d_key[j] = tuple.key; - for (int i{0}; i < dim; ++i) { - d_val[j * dim + i] = tuple.value[i]; - } - if (d_meta != nullptr) { - d_meta[j] = tuple.meta; - } - } -} - -/* Dump with meta. 
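`dump_kernel_shared_memory_size` above bounds the block size by how many tuples fit in half of the available shared memory. A host-side sketch of turning that into a launch configuration; the `KVM` layout `{K key; V* value; M meta;}` is assumed from the kernel's usage, and the shared-memory figure is hypothetical.

```cpp
// Illustrative sketch only: compute a dump launch configuration the same way
// dump_kernel_shared_memory_size above does. KVM layout is assumed.
#include <algorithm>
#include <cstdint>
#include <cstdio>

template <class K, class V, class M>
struct KVM { K key; V* value; M meta; };

int main() {
  const size_t available_shared_memory = 48 << 10;  // e.g. 48 KB per block
  using Tuple = KVM<uint64_t, float, uint64_t>;
  const size_t block_size =
      std::min(available_shared_memory / 2 / sizeof(Tuple), size_t{1024});
  const size_t shared_bytes = block_size * sizeof(Tuple);
  const size_t search_length = 1 << 20;              // slots to scan
  const size_t grid_size = (search_length + block_size - 1) / block_size;
  std::printf("block=%zu shared=%zuB grid=%zu\n",
              block_size, shared_bytes, grid_size);
}
```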
*/ -template class PredFunctor> -__global__ void dump_kernel(const Table* __restrict table, - const K pattern, const M threshold, K* d_key, - V* __restrict d_val, M* __restrict d_meta, - const size_t offset, const size_t search_length, - size_t* d_dump_counter) { - extern __shared__ unsigned char s[]; - const size_t bucket_max_size = table->bucket_max_size; - const size_t dim = table->dim; - K* smem = (K*)s; - K* block_result_key = smem; - V* block_result_val = (V*)&(smem[blockDim.x]); - M* block_result_meta = (M*)&(block_result_val[blockDim.x * dim]); - __shared__ size_t block_acc; - __shared__ size_t global_acc; - PredFunctor fn; - - const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (threadIdx.x == 0) { - block_acc = 0; - } - __syncthreads(); - - if (tid < search_length) { - int bkt_idx = (tid + offset) / bucket_max_size; - int key_idx = (tid + offset) % bucket_max_size; - Bucket* bucket = &(table->buckets[bkt_idx]); - - const K key = - (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); - M meta = bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed); - - if (key != static_cast(EMPTY_KEY) && - fn(key, meta, pattern, threshold)) { - size_t local_index = atomicAdd(&block_acc, 1); - block_result_key[local_index] = key; - for (int i = 0; i < dim; i++) { - atomicExch(&(block_result_val[local_index * dim + i]), - bucket->vectors[key_idx * dim + i]); - } - if (d_meta != nullptr) { - block_result_meta[local_index] = meta; - } - } - } - __syncthreads(); - - if (threadIdx.x == 0) { - global_acc = atomicAdd(d_dump_counter, block_acc); - } - __syncthreads(); - - if (threadIdx.x < block_acc) { - d_key[global_acc + threadIdx.x] = block_result_key[threadIdx.x]; - for (int i = 0; i < dim; i++) { - d_val[(global_acc + threadIdx.x) * dim + i] = - block_result_val[threadIdx.x * dim + i]; - } - if (d_meta != nullptr) { - d_meta[global_acc + threadIdx.x] = block_result_meta[threadIdx.x]; - } - } -} - -/* If founds[i] = true, read data from corresponding address of - * table_value_addrs and write to param_values; if founds[i] = false, write data - * from param_values to corresponding address of table_value_addrs. usually - * called by find_or_insert kernel. - */ -template -void read_or_write_by_cpu(V** __restrict table_value_addrs, - V* __restrict param_values, - const int* __restrict offset, const bool* founds, - size_t dim, int N, int n_worker = 16) { - std::vector thds; - if (n_worker < 1) n_worker = 1; - - auto functor = [founds, dim](V** __restrict table_value_addrs, - V* __restrict param_values, - const int* __restrict offset, int handled_size, - int trunk_size) -> void { - for (int i = handled_size; i < handled_size + trunk_size; i++) { - if (table_value_addrs[i] != nullptr) { - if (founds[offset[i]]) { - memcpy(param_values + offset[i] * dim, table_value_addrs[i], - sizeof(V) * dim); - } else { - memcpy(table_value_addrs[i], param_values + offset[i] * dim, - sizeof(V) * dim); - } - } - } - }; - - int32_t trunk_size_floor = N / n_worker; - int32_t trunk_size_remain = N % n_worker; - int32_t n_worker_used = trunk_size_floor == 0 ? 
trunk_size_remain : n_worker; - - size_t handled_size = 0; - for (int i = 0; i < n_worker_used; i++) { - int32_t cur_trunk_size = trunk_size_floor; - if (trunk_size_remain != 0) { - cur_trunk_size += 1; - trunk_size_remain--; - } - thds.push_back(std::thread(functor, table_value_addrs, param_values, offset, - handled_size, cur_trunk_size)); - handled_size += cur_trunk_size; - } - - for (int i = 0; i < n_worker_used; i++) { - thds[i].join(); - } -} - -/* - * find or insert with IO operation. This kernel is - * usually used for the pure HBM mode for better performance. - */ -template -__global__ void find_or_insert_kernel_with_io( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - V* __restrict values, M* __restrict metas, const size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - const size_t key_idx = t / TILE_SIZE; - - const K find_or_insert_key = keys[key_idx]; - - if (IS_RESERVED_KEY(find_or_insert_key)) continue; - - const M find_or_insert_meta = - metas != nullptr ? metas[key_idx] : static_cast(MAX_META); - V* find_or_insert_value = values + key_idx * dim; - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, find_or_insert_key, bkt_idx, - start_idx, buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_full( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } - - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - if (occupy_result == OccupyResult::DUPLICATE) { - copy_vector(g, bucket->vectors + key_pos * dim, - find_or_insert_value, dim); - if (metas != nullptr && g.thread_rank() == src_lane) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - } else { - copy_vector(g, find_or_insert_value, - bucket->vectors + key_pos * dim, dim); - if (g.thread_rank() == src_lane) { - update_meta(bucket, key_pos, metas, key_idx); - } - } - - if (g.thread_rank() == src_lane) { - (bucket->keys(key_pos)) - ->store(find_or_insert_key, cuda::std::memory_order_relaxed); - } - } -} - -template -struct SelectFindOrInsertKernelWithIO { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* __restrict keys, V* __restrict values, - M* __restrict metas) { - if (load_factor <= 0.75) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = 
SAFE_GET_GRID_SIZE(N, block_size); - find_or_insert_kernel_with_io - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } else { - const unsigned int tile_size = 32; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - find_or_insert_kernel_with_io - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } - return; - } -}; - -/* find or insert with the end-user specified meta. - */ -template -__global__ void find_or_insert_kernel( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - V** __restrict vectors, M* __restrict metas, bool* __restrict found, - int* __restrict keys_index, const size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K find_or_insert_key = keys[key_idx]; - - if (IS_RESERVED_KEY(find_or_insert_key)) continue; - - const M find_or_insert_meta = - metas != nullptr ? metas[key_idx] : static_cast(MAX_META); - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, find_or_insert_key, bkt_idx, - start_idx, buckets_num, bucket_max_size); - - if (g.thread_rank() == 0) { - *(keys_index + key_idx) = key_idx; - } - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_full( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } - - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - if (occupy_result == OccupyResult::DUPLICATE) { - if (g.thread_rank() == src_lane) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - - if (found != nullptr) { - *(found + key_idx) = true; - } - - if (metas != nullptr) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - } - } else { - if (g.thread_rank() == src_lane) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - update_meta(bucket, key_pos, metas, key_idx); - } - } - - if (g.thread_rank() == src_lane) { - (bucket->keys(key_pos)) - ->store(find_or_insert_key, cuda::std::memory_order_relaxed); - } - } -} - -/* find or insert with the end-user specified meta. 
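`find_or_insert_kernel` above only resolves a per-key value address and a `found` flag; the actual value traffic happens afterwards in `read_or_write_by_cpu` / `read_or_write_kernel`. A minimal host-side sketch of that second phase (plain pointers, no CUDA, values hypothetical): found keys are read back into the parameter buffer, new keys are written into the table.

```cpp
// Illustrative sketch only: the read-or-write phase that follows
// find_or_insert_kernel, emulated on host memory (dim = 2).
#include <cstdio>

int main() {
  const int n = 2, dim = 2;
  float slot0[dim] = {1.f, 2.f};             // value already in the table
  float slot1[dim] = {0.f, 0.f};             // freshly inserted, still empty
  float* table_value_addrs[n] = {slot0, slot1};
  bool   founds[n] = {true, false};
  float  param_values[n * dim] = {9.f, 9.f, 3.f, 4.f};

  for (int i = 0; i < n; ++i) {
    for (int d = 0; d < dim; ++d) {
      if (founds[i])  // found: copy table -> parameters (a lookup)
        param_values[i * dim + d] = table_value_addrs[i][d];
      else            // not found: copy parameters -> table (an insert)
        table_value_addrs[i][d] = param_values[i * dim + d];
    }
  }
  std::printf("param[0]={%g,%g} slot1={%g,%g}\n",
              param_values[0], param_values[1], slot1[0], slot1[1]);  // {1,2} {3,4}
}
```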
- */ -template -__global__ void find_ptr_or_insert_kernel( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - V** __restrict vectors, M* __restrict metas, bool* __restrict found, - const size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K find_or_insert_key = keys[key_idx]; - - if (IS_RESERVED_KEY(find_or_insert_key)) continue; - - const M find_or_insert_meta = - metas != nullptr ? metas[key_idx] : static_cast(MAX_META); - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, find_or_insert_key, bkt_idx, - start_idx, buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_full( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } - - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - if (occupy_result == OccupyResult::DUPLICATE) { - if (g.thread_rank() == src_lane) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - *(found + key_idx) = true; - if (metas != nullptr) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - } - } else { - if (g.thread_rank() == src_lane) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - *(found + key_idx) = false; - update_meta(bucket, key_pos, metas, key_idx); - } - } - - if (g.thread_rank() == src_lane) { - (bucket->keys(key_pos)) - ->store(find_or_insert_key, cuda::std::memory_order_relaxed); - } - } -} - -template -struct SelectFindOrInsertPtrKernel { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* __restrict keys, V** __restrict values, - M* __restrict metas, bool* __restrict found) { - if (load_factor <= 0.5) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - find_ptr_or_insert_kernel - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } else if (load_factor <= 0.875) { - const unsigned int tile_size = 8; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - find_ptr_or_insert_kernel - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } else { - const unsigned int tile_size = 32; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, 
block_size); - find_ptr_or_insert_kernel - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } - return; - } -}; - -/* Read the data from address of table_value_addrs to corresponding position - in param_value if mask[i] is true, otherwise write data to table_value_addrs - form param_value, - usually called by find_or_insert kernel. - - `table_value_addrs`: A pointer of pointer of V which should be on HBM, - but each value (a pointer of V) could point to a - memory on HBM or HMEM. - `param_value`: A continue memory pointer with Vector - which should be HBM. - `mask`: One for each `param_value`. If true, reading from table_value_addrs, - or false writing table_value_addrs from param_value. - `param_key_index`: N values from address of table_value_addrs are mapped to - param_values according to param_key_index. - `dim`: the dim of value. - `N`: The number of vectors needed to be read. -*/ -template -__global__ void read_or_write_kernel(V** __restrict table_value_addrs, - V* __restrict param_values, - const bool* mask, - const int* __restrict param_key_index, - const size_t dim, const size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int dim_index = t % dim; - int real_key_index = - param_key_index != nullptr ? param_key_index[vec_index] : vec_index; - - /// if found, read the value form table, otherwise write it - if (table_value_addrs[vec_index] != nullptr) { - /// find - if (mask[real_key_index]) { - param_values[real_key_index * dim + dim_index] = - table_value_addrs[vec_index][dim_index]; - } - /// insert - else { - table_value_addrs[vec_index][dim_index] = - param_values[real_key_index * dim + dim_index]; - } - } - } -} - -/* - * update with IO operation. This kernel is - * usually used for the pure HBM mode for better performance. 
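The gather/scatter rule documented for read_or_write_kernel can be restated as a plain CPU loop. The following reference sketch (illustrative names, no CUDA) mirrors the kernel's per-element indexing:

#include <cstddef>
#include <vector>

// CPU reference for the read-or-write rule above: when the key was found
// (mask true) the value is copied table -> params; otherwise the new value
// is copied params -> table. A null table address means no slot was
// assigned, so the element is skipped. Names are illustrative.
template <typename V>
void read_or_write_reference(const std::vector<V*>& table_value_addrs,
                             std::vector<V>& param_values,
                             const std::vector<bool>& mask,
                             const std::vector<int>& param_key_index,
                             std::size_t dim) {
  for (std::size_t vec = 0; vec < table_value_addrs.size(); ++vec) {
    if (table_value_addrs[vec] == nullptr) continue;
    const int key = param_key_index.empty() ? static_cast<int>(vec)
                                            : param_key_index[vec];
    for (std::size_t d = 0; d < dim; ++d) {
      if (mask[key]) {
        param_values[key * dim + d] = table_value_addrs[vec][d];  // find
      } else {
        table_value_addrs[vec][d] = param_values[key * dim + d];  // insert
      }
    }
  }
}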
- */ -template -__global__ void update_kernel_with_io( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - const V* __restrict values, const M* __restrict metas, const size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K update_key = keys[key_idx]; - - if (IS_RESERVED_KEY(update_key)) continue; - - const V* update_value = values + key_idx * dim; - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - - Bucket* bucket = - get_key_position(table->buckets, update_key, bkt_idx, start_idx, - buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - - if (bucket_size >= bucket_max_size) { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - } - occupy_result = find_and_lock_for_update( - g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size); - - occupy_result = g.shfl(occupy_result, src_lane); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if (occupy_result == OccupyResult::DUPLICATE) { - copy_vector(g, update_value, - bucket->vectors + key_pos * dim, dim); - if (src_lane == g.thread_rank()) { - update_meta(bucket, key_pos, metas, key_idx); - } - } - - if (g.thread_rank() == src_lane) { - (bucket->keys(key_pos)) - ->store(update_key, cuda::std::memory_order_relaxed); - } - } -} - -template -struct SelectUpdateKernelWithIO { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* __restrict keys, - const V* __restrict values, - const M* __restrict metas) { - if (load_factor <= 0.75) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - update_kernel_with_io - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } else { - const unsigned int tile_size = 32; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - update_kernel_with_io - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } - return; - } -}; - -template -__global__ void update_kernel(const Table* __restrict table, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - const K* __restrict keys, V** __restrict vectors, - const M* __restrict metas, - int* __restrict src_offset, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K update_key = keys[key_idx]; - - if (IS_RESERVED_KEY(update_key)) continue; - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - - Bucket* bucket = - get_key_position(table->buckets, update_key, bkt_idx, start_idx, - buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - *(src_offset + key_idx) = key_idx; - - if (bucket_size >= bucket_max_size) { - start_idx = (start_idx / TILE_SIZE) * 
TILE_SIZE; - } - occupy_result = find_and_lock_for_update( - g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size); - - occupy_result = g.shfl(occupy_result, src_lane); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if (g.thread_rank() == src_lane) { - if (occupy_result == OccupyResult::DUPLICATE) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - update_meta(bucket, key_pos, metas, key_idx); - } else { - *(vectors + key_idx) = nullptr; - } - } - - if (g.thread_rank() == src_lane) { - (bucket->keys(key_pos)) - ->store(update_key, cuda::std::memory_order_relaxed); - } - } -} - -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp deleted file mode 100644 index 2d9ebfe8b..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" - -namespace nv { -namespace merlin { - -class CudaException : public std::runtime_error { - public: - CudaException(const std::string& what) : runtime_error(what) {} -}; - -inline void cuda_check_(cudaError_t val, const char* file, int line) { - if (val != cudaSuccess) { - std::ostringstream os; - os << file << ':' << line << ": CUDA error " << cudaGetErrorName(val) - << " (#" << val << "): " << cudaGetErrorString(val); - throw CudaException(os.str()); - } -} - -//#ifdef CUDA_CHECK -//#error Unexpected redfinition of CUDA_CHECK! Something is wrong. -//#endif - -#ifndef CUDA_CHECK -#define CUDA_CHECK(val) \ - do { \ - nv::merlin::cuda_check_((val), __FILE__, __LINE__); \ - } while (0) -#endif // CUDA_CHECK - -class MerlinException : public std::runtime_error { - public: - MerlinException(const std::string& what) : runtime_error(what) {} -}; - -template -inline void merlin_check_(bool cond, const Msg& msg, const char* file, - int line) { - if (!cond) { - std::ostringstream os; - os << file << ':' << line << ": HierarchicalKV error " << msg; - throw MerlinException(os.str()); - } -} - -#ifdef MERLIN_CHECK -#error Unexpected redfinition of MERLIN_CHECK! Something is wrong. 
-#endif - -#define MERLIN_CHECK(cond, msg) \ - do { \ - nv::merlin::merlin_check_((cond), (msg), __FILE__, __LINE__); \ - } while (0) - -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh deleted file mode 100644 index d01dda6d1..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include "utils.cuh" - -using std::cerr; -using std::endl; - -namespace nv { -namespace merlin { - -template -class FlexPinnedBuffer { - public: - FlexPinnedBuffer(const size_t size = 1) : ptr_(nullptr) { - if (!ptr_) { - size_ = size; - CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size_)); - } - } - ~FlexPinnedBuffer() { - try { - if (!ptr_) CUDA_CHECK(cudaFreeHost(ptr_)); - } catch (const nv::merlin::CudaException& e) { - cerr << "[HierarchicalKV] Failed to free FlexPinnedBuffer!" << endl; - } - } - - __inline__ T* alloc_or_reuse(const size_t size = 0) { - if (size > size_) { - CUDA_CHECK(cudaFreeHost(ptr_)); - size_ = size; - CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size_)); - } - return ptr_; - } - - private: - T* ptr_; - size_t size_; -}; - -} // namespace merlin -} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp deleted file mode 100644 index 2cfdc6ed3..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http:///www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Implementing a group mutex and relative lock guard for better E2E performance: - * - Allow multiple writers (like `insert_or_assign` `assign` `insert_and_evict` etc.) - * The CUDA kernels guarantee the data consistency in this situation. - * - Allow multiple readers (like `find` 'size` etc.) - * - Not allow readers and writers to run concurrently - * - The `write_read_lock` is used for special APIs (like `reserve` `erase` `clear` etc.) 
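A usage sketch for the guards declared in this header (assuming it is reachable as merlin/group_lock.hpp): read-style APIs take a reader_shared_lock, concurrency-safe write APIs take a writer_shared_lock, and exclusive operations such as reserve/erase/clear take a write_read_lock. Class and method names below are illustrative:

// Sketch only: assumes the group_shared_mutex and lock guards defined in
// this header are visible.
#include "merlin/group_lock.hpp"

class ToyTable {
 public:
  void find_like_api() {
    // Readers run concurrently with other readers.
    nv::merlin::reader_shared_lock guard(mutex_);
    // ... launch read-only kernels ...
  }

  void insert_like_api() {
    // Writers run concurrently with other writers, never with readers;
    // the CUDA kernels guarantee data consistency among writers.
    nv::merlin::writer_shared_lock guard(mutex_);
    // ... launch update kernels ...
  }

  void reserve_like_api() {
    // Exclusive: blocks both readers and writers (reserve/erase/clear).
    nv::merlin::write_read_lock guard(mutex_);
    // ... rehash or clear ...
  }

 private:
  nv::merlin::group_shared_mutex mutex_;
};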
- */ -#include -#include -#include -#include -#include - -namespace nv { -namespace merlin { - -class group_shared_mutex { - public: - group_shared_mutex(const group_shared_mutex&) = delete; - group_shared_mutex& operator=(const group_shared_mutex&) = delete; - - group_shared_mutex() noexcept - : writer_count_(0), reader_count_(0), unique_flag_(false) {} - - void lock_read() { - for (;;) { - while (writer_count_.load(std::memory_order_acquire)) { - } - reader_count_.fetch_add(1, std::memory_order_acq_rel); - if (writer_count_.load(std::memory_order_acquire) == 0) { - break; - } - reader_count_.fetch_sub(1, std::memory_order_acq_rel); - } - } - - void unlock_read() { reader_count_.fetch_sub(1, std::memory_order_release); } - - void lock_write() { - for (;;) { - while (reader_count_.load(std::memory_order_acquire)) { - } - writer_count_.fetch_add(1, std::memory_order_acq_rel); - if (reader_count_.load(std::memory_order_acquire) == 0) { - break; - } - writer_count_.fetch_sub(1, std::memory_order_acq_rel); - } - } - - void unlock_write() { writer_count_.fetch_sub(1, std::memory_order_release); } - - void lock_write_read() { - /* Lock unique flag */ - bool expected = false; - while (!unique_flag_.compare_exchange_weak(expected, true, - std::memory_order_acq_rel)) { - expected = false; - } - - /* Ban writer */ - for (;;) { - while (writer_count_.load(std::memory_order_acquire)) { - } - reader_count_.fetch_add(1, std::memory_order_acq_rel); - if (writer_count_.load(std::memory_order_acquire) == 0) { - break; - } - reader_count_.fetch_sub(1, std::memory_order_acq_rel); - } - - /* Ban reader */ - for (;;) { - while (reader_count_.load(std::memory_order_acquire) > 1) { - } - writer_count_.fetch_add(1, std::memory_order_acq_rel); - if (reader_count_.load(std::memory_order_acquire) == 1) { - break; - } - writer_count_.fetch_sub(1, std::memory_order_acq_rel); - } - } - - void unlock_write_read() noexcept { - reader_count_.fetch_sub(1, std::memory_order_release); - writer_count_.fetch_sub(1, std::memory_order_release); - unique_flag_.store(false, std::memory_order_release); - } - - int writer_count() noexcept { - return writer_count_.load(std::memory_order_relaxed); - } - - int reader_count() noexcept { - return reader_count_.load(std::memory_order_relaxed); - } - - private: - std::atomic writer_count_; - std::atomic reader_count_; - std::atomic unique_flag_; -}; - -class reader_shared_lock { - public: - reader_shared_lock(const reader_shared_lock&) = delete; - reader_shared_lock(reader_shared_lock&&) = delete; - - reader_shared_lock& operator=(const reader_shared_lock&) = delete; - reader_shared_lock& operator=(reader_shared_lock&&) = delete; - - explicit reader_shared_lock(group_shared_mutex& mutex) : mutex_(&mutex) { - mutex_->lock_read(); - owns_ = true; - } - - explicit reader_shared_lock(group_shared_mutex& mutex, std::defer_lock_t) - : mutex_(&mutex), owns_(false) {} - - ~reader_shared_lock() { - if (owns_) { - mutex_->unlock_read(); - } - } - - void lock() noexcept { - if (!owns_) { - mutex_->lock_read(); - owns_ = true; - } - } - - bool owns_lock() const noexcept { return owns_; } - - private: - group_shared_mutex* const mutex_; - bool owns_; -}; - -class writer_shared_lock { - public: - writer_shared_lock(const writer_shared_lock&) = delete; - writer_shared_lock(writer_shared_lock&&) = delete; - - writer_shared_lock& operator=(const writer_shared_lock&) = delete; - writer_shared_lock& operator=(writer_shared_lock&&) = delete; - - explicit writer_shared_lock(group_shared_mutex& mutex) : 
mutex_(&mutex) { - mutex_->lock_write(); - owns_ = true; - } - - explicit writer_shared_lock(group_shared_mutex& mutex, std::defer_lock_t) - : mutex_(&mutex), owns_(false) {} - - ~writer_shared_lock() { - if (owns_) { - mutex_->unlock_write(); - } - } - - void lock() noexcept { - if (!owns_) { - mutex_->lock_write(); - owns_ = true; - } - } - - bool owns_lock() const noexcept { return owns_; } - - private: - group_shared_mutex* const mutex_; - bool owns_; -}; - -class write_read_lock { - public: - write_read_lock(const write_read_lock&) = delete; - write_read_lock(write_read_lock&&) = delete; - - write_read_lock& operator=(const write_read_lock&) = delete; - write_read_lock& operator=(write_read_lock&&) = delete; - - explicit write_read_lock(group_shared_mutex& mutex) : mutex_(&mutex) { - mutex_->lock_write_read(); - owns_ = true; - } - - explicit write_read_lock(group_shared_mutex& mutex, std::defer_lock_t) noexcept - : mutex_(&mutex), owns_(false) {} - - ~write_read_lock() { - if (owns_) { - mutex_->unlock_write_read(); - } - } - - void lock() { - assert(!owns_ && "[write_read_lock] trying to lock twice!"); - mutex_->lock_write_read(); - owns_ = true; - } - - bool owns_lock() const noexcept { return owns_; } - - private: - group_shared_mutex* const mutex_; - bool owns_; -}; - -} // namespace merlin -} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh deleted file mode 100644 index 6df875688..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include "curand_philox4x32_x.h" -#include "types.cuh" -#include "utils.cuh" - -namespace nv { -namespace merlin { -namespace initializers { - -inline void cuda_rand_check_(curandStatus_t val, const char* file, int line) { - if (val != CURAND_STATUS_SUCCESS) { - throw CudaException(std::string(file) + ":" + std::to_string(line) + - ": CURAND error " + std::to_string(val)); - } -} - -#define CURAND_CHECK(val) \ - { nv::merlin::initializers::cuda_rand_check_((val), __FILE__, __LINE__); } - -template -void zeros(T* d_data, const size_t len, cudaStream_t stream) { - CUDA_CHECK(cudaMemsetAsync(d_data, 0, len, stream)); -} - -template -void random_normal(T* d_data, const size_t len, cudaStream_t stream, - const T mean = 0.0, const T stddev = 0.05, - const unsigned long long seed = 2022ULL) { - curandGenerator_t generator; - CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); - CURAND_CHECK(curandGenerateNormal(generator, d_data, len, mean, stddev)); -} - -template -__global__ void adjust_max_min(T* d_data, const T minval, const T maxval, - const size_t N) { - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < N) { - d_data[tid] = - d_data[tid] * (maxval - minval) + (0.5 * (maxval + minval) - 0.5); - } -} - -template -void random_uniform(T* d_data, const size_t len, cudaStream_t stream, - const T minval = 0.0, const T maxval = 1.0, - const unsigned long long seed = 2022ULL) { - curandGenerator_t generator; - - CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); - - int N = len; - int block_size = 256; - int grid_size = (N + block_size - 1) / block_size; - CURAND_CHECK(curandGenerateUniform(generator, d_data, N)); - adjust_max_min - <<>>(d_data, minval, maxval, N); -} - -template -__global__ void init_states(curandStatePhilox4_32_10_t* states, - const unsigned long long seed, const size_t N) { - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < N) { - curand_init(seed, tid, 0, &states[tid]); - } -} - -template -__global__ void make_truncated_normal(T* d_data, - curandStatePhilox4_32_10_t* states, - const size_t N) { - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < N) { - constexpr T truncated_val = T(2.0); - while (fabsf(d_data[tid]) > truncated_val) { - d_data[tid] = curand_normal(&states[tid]); - } - } -} - -template -void truncated_normal(T* d_data, const size_t len, cudaStream_t stream, - const T minval = 0.0, const T maxval = 1.0, - const unsigned long long seed = 2022ULL) { - curandGenerator_t generator; - - CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); - - int N = len; - int block_size = 256; - int grid_size = (N + block_size - 1) / block_size; - curandStatePhilox4_32_10_t* d_states; - CUDA_CHECK(cudaMallocAsync(&d_states, N, stream)); - - init_states<<>>(d_states, seed, N); - - make_truncated_normal - <<>>(d_data, d_states, N); - - adjust_max_min - <<>>(d_data, minval, maxval, N); - - CUDA_CHECK(cudaFreeAsync(d_states, stream)); -} - -template -class Initializer { - public: - virtual ~Initializer() {} - virtual void initialize(T* data, size_t len, cudaStream_t stream) {} -}; - -template -class Zeros final : public Initializer { - public: - void initialize(T* data, const size_t len, cudaStream_t stream) override { - 
zeros(data, len, stream); - } -}; - -} // namespace initializers -} // namespace merlin -} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh deleted file mode 100644 index 271676a6e..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh +++ /dev/null @@ -1,619 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include "debug.hpp" - -namespace nv { -namespace merlin { - -/** - * Allocators are used by the memory pool (and maybe other classes) to create - * RAII complient containers for buffers allocated in different memory areas. - */ -template -struct AllocatorBase { - using type = T; - using sync_unique_ptr = std::unique_ptr; - using async_unique_ptr = std::unique_ptr>; - using shared_ptr = std::shared_ptr; - - inline static sync_unique_ptr make_unique(size_t n) { - return sync_unique_ptr(Allocator::alloc(n)); - } - - inline static async_unique_ptr make_unique(size_t n, cudaStream_t stream) { - return {Allocator::alloc(n, stream), - [stream](type* p) { Allocator::free(p); }}; - } - - inline static shared_ptr make_shared(size_t n, cudaStream_t stream = 0) { - return {Allocator::alloc(n, stream), - [stream](type* p) { Allocator::free(p, stream); }}; - } - - inline void operator()(type* ptr) { Allocator::free(ptr); } -}; - -/** - * Trivial fallback implementation using the standard C++ allocator. This mostly - * exists to ensure interface correctness, and as an illustration of what a - * proper allocator implementation should look like. - */ -template -struct StandardAllocator final : AllocatorBase> { - using type = typename AllocatorBase>::type; - - static constexpr const char* name{"StandardAllocator"}; - - inline static type* alloc(size_t n, cudaStream_t stream = 0) { - return new type[n]; - } - - inline static void free(type* ptr, cudaStream_t stream = 0) { delete[] ptr; } -}; - -/** - * Claim/release buffers in pinned host memory. - */ -template -struct HostAllocator final : AllocatorBase> { - using type = typename AllocatorBase>::type; - - static constexpr const char* name{"HostAllocator"}; - - inline static type* alloc(size_t n, cudaStream_t stream = 0) { - void* ptr; - CUDA_CHECK(cudaMallocHost(&ptr, sizeof(T) * n)); - return reinterpret_cast(ptr); - } - - inline static void free(type* ptr, cudaStream_t stream = 0) { - CUDA_CHECK(cudaFreeHost(ptr)); - } -}; - -/** - * Claim/release buffers in the active CUDA device. Will not test if the correct - * device was used, and throw if CUDA runtime API response is negative. 
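The allocators in this file are normally consumed through the RAII factories inherited from AllocatorBase rather than by calling alloc/free directly. A hedged sketch of that pattern, assuming the templated allocators take the element type as a parameter; buffer names and the stream argument are illustrative:

// Sketch only: buffers are released automatically when the smart pointers
// leave scope (cudaFreeHost for pinned host memory, cudaFree/cudaFreeAsync
// for device memory when a stream deleter was captured).
#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

void allocator_usage_sketch(std::size_t n, cudaStream_t stream) {
  // Pinned host staging buffer.
  auto h_keys = nv::merlin::HostAllocator<uint64_t>::make_unique(n);

  // Device buffer whose deleter captures the stream for deferred release.
  auto d_keys = nv::merlin::DeviceAllocator<uint64_t>::make_shared(n, stream);

  // ... fill h_keys.get(), then cudaMemcpyAsync(d_keys.get(), h_keys.get(),
  //     n * sizeof(uint64_t), cudaMemcpyHostToDevice, stream); ...
}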
- */ -template -struct DeviceAllocator final : AllocatorBase> { - using type = typename AllocatorBase>::type; - - static constexpr const char* name{"DeviceAllocator"}; - - inline static type* alloc(size_t n, cudaStream_t stream = 0) { - void* ptr; - cudaError_t res; - if (stream) { - res = cudaMallocAsync(&ptr, sizeof(T) * n, stream); - } else { - res = cudaMalloc(&ptr, sizeof(T) * n); - } - CUDA_CHECK(res); - return reinterpret_cast(ptr); - } - - inline static void free(type* ptr, cudaStream_t stream = 0) { - cudaError_t res; - if (stream) { - res = cudaFreeAsync(ptr, stream); - } else { - res = cudaFree(ptr); - } - CUDA_CHECK(res); - } -}; - -/** - * Helper structure to configure a memory pool. - */ -struct MemoryPoolOptions { - size_t max_stock{4}; ///< Amount of buffers to keep in reserve. - size_t max_pending{16}; ///< Maximum amount of awaitable buffers. If this - ///< limit is exceeded threads will start to block. -}; - -/** - * Forward declares required to make templated ostream overload work. - */ -template -class MemoryPool; - -template -std::ostream& operator<<(std::ostream&, const MemoryPool&); - -/** - * CUDA deferred execution aware memory pool implementation. As for every memory - * pool, the general idea is to have resuable buffers. All buffers have the same - * size. - * - * General behavior: - * - * This memory pool implementation attempts to avoid blocking before the fact, - * but also avoids relying on a background worker. - * - * Buffer borrow and return semantics tightly align with C++ RAII principles. - * That is, if a workspace is requested, any borrowed buffers will be returned - * automatically when leaving the scope. - * - * You can either borrow a single buffer, or a workspace (that is multiple - * buffers). We support dynamic and static workspaces. Static workspaces have - * the benefit that they will never require heap memory (no hidden allocations). - * - * - * Buffer borrowing: - * - * If buffers are requested, we take them from the stock, if available. If the - * stock is depleted, we check if any pending buffer has been used up by the GPU - * and adds them to the stock. If was also not successful, we allocate a new - * buffer. Buffers or workspaces (groups of buffers). - * - * When borrowing a buffer a streaming context can be specified. This context is - * relevant for allocation and during returns. It is assumed that the stream you - * provide as context will be the stream where you queue the workload. Not doing - * so may lead to undefined behavior. - * - * Buffer return: - * - * If no context is provided, we cannot make any assumptions regarding the usage - * one the device. So we sychronize the device first and then return the buffer - * to the stock. If a streaming context was provided, we queue an event and add - * the buffer to the `pending` pool. That means, the buffer has been - * reqlinquished by the CPU, but may still be used by the GPU. If no pending - * slot is available, we probe the currently pending buffers events for - * completion. Completed pending buffers are returned to the reserve. If so, we - * queue the buffer in the freed slot. If that was unsucessful (i.e., all - * currently pending buffers are still in use by the GPU), we have no choice but - * the free the buffer using the current stream. - * - * In either case, `max_reserve` represents the maxmum size of the stock. If - * returning a buffer would lead to the stock exeeding this quantity, the buffer - * is queued for destruction. 
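Putting the borrowing and return rules above together, a typical call site asks for a static workspace of same-sized buffers on a stream and relies on RAII to hand them back. A sketch under those assumptions; buffer names, sizes, and the char-typed device allocator are illustrative:

#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

void workspace_usage_sketch(cudaStream_t stream, std::size_t batch) {
  using Pool = nv::merlin::MemoryPool<nv::merlin::DeviceAllocator<char>>;

  nv::merlin::MemoryPoolOptions opts;  // keep default max_stock / max_pending
  Pool pool(opts);

  {
    // A static workspace of two same-sized device buffers, each able to hold
    // `batch` keys, bound to `stream` for allocation and deferred return.
    auto ws = pool.get_workspace<2>(batch * sizeof(uint64_t), stream);
    uint64_t* d_keys = ws.get<uint64_t*>(0);
    uint64_t* d_scratch = ws.get<uint64_t*>(1);
    // ... enqueue kernels on `stream` that use d_keys / d_scratch ...
  }  // RAII return: an event is recorded so the buffers stay "pending" until
     // the GPU work queued on `stream` has completed.
}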
- */ -template -class MemoryPool final { - public: - using pool_type = MemoryPool; - using alloc_type = typename Allocator::type; - template - class Workspace { - public: - inline Workspace() : pool_{nullptr}, buffer_size_{0}, stream_{0} {} - - inline Workspace(pool_type* pool, cudaStream_t stream) - : pool_{pool}, buffer_size_{0}, stream_{stream} {} - - Workspace(const Workspace&) = delete; - - Workspace& operator=(const Workspace&) = delete; - - inline Workspace(Workspace&& other) - : pool_{other.pool_}, - buffer_size_{other.buffer_size_}, - stream_{other.stream_}, - buffers_{std::move(other.buffers_)} {} - - inline Workspace& operator=(Workspace&& other) { - if (pool_) { - pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_); - } - pool_ = other.pool_; - buffer_size_ = other.buffer_size_; - stream_ = other.stream_; - buffers_ = std::move(other.buffers_); - other.pool_ = nullptr; - return *this; - } - - inline ~Workspace() { - if (pool_) { - pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_); - } - } - - template - constexpr void at(const size_t n, T* ptr) const { - *ptr = at(n); - } - - template - constexpr T at(const size_t n) const { - return reinterpret_cast(buffers_.at(n)); - } - - template - constexpr void get(const size_t n, T* ptr) const { - *ptr = get(n); - } - - template - constexpr T get(const size_t n) const { - return reinterpret_cast(buffers_[n]); - } - - constexpr alloc_type* operator[](const size_t n) const { - return buffers_[n]; - } - - protected: - pool_type* pool_; - size_t buffer_size_; - cudaStream_t stream_; - Container buffers_; - }; - - template - class StaticWorkspace final : public Workspace> { - public: - using base_type = Workspace>; - - friend class MemoryPool; - - inline StaticWorkspace() : base_type() {} - - StaticWorkspace(const StaticWorkspace&) = delete; - - StaticWorkspace& operator=(const StaticWorkspace&) = delete; - - inline StaticWorkspace(StaticWorkspace&& other) - : base_type(std::move(other)) {} - - inline StaticWorkspace& operator=(StaticWorkspace&& other) { - base_type::operator=(std::move(other)); - return *this; - } - - private: - inline StaticWorkspace(pool_type* pool, size_t requested_buffer_size, - cudaStream_t stream) - : base_type(pool, stream) { - auto& buffers{this->buffers_}; - this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(), - requested_buffer_size, stream); - } - }; - - class DynamicWorkspace final : public Workspace> { - public: - using base_type = Workspace>; - - friend class MemoryPool; - - inline DynamicWorkspace() : base_type() {} - - DynamicWorkspace(const DynamicWorkspace&) = delete; - - DynamicWorkspace& operator=(const DynamicWorkspace&) = delete; - - inline DynamicWorkspace(DynamicWorkspace&& other) - : base_type(std::move(other)) {} - - inline DynamicWorkspace& operator=(DynamicWorkspace&& other) { - base_type::operator=(std::move(other)); - return *this; - } - - private: - inline DynamicWorkspace(pool_type* pool, size_t n, - size_t requested_buffer_size, cudaStream_t stream) - : base_type(pool, stream) { - auto& buffers{this->buffers_}; - buffers.resize(n); - this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(), - requested_buffer_size, stream); - } - }; - - MemoryPool(const MemoryPoolOptions& options) : options_{options} { - // Create initial buffer stock. - stock_.reserve(options_.max_stock); - - // Create enough events, so we have one per potentially pending buffer. 
- ready_events_.resize(options_.max_pending); - for (auto& ready_event : ready_events_) { - CUDA_CHECK(cudaEventCreate(&ready_event)); - } - - // Preallocate pending. - pending_.reserve(options_.max_pending); - } - - ~MemoryPool() { - // Make sure all queued tasks are complete. - await_pending(); - - // Free event and buffer memory. - for (auto& ready_event : ready_events_) { - CUDA_CHECK(cudaEventDestroy(ready_event)); - } - - // Any remaining buffers need to be properly unallocated. - deplete_stock(); - } - - inline size_t buffer_size() const { return buffer_size_; } - - inline size_t max_batch_size(size_t max_item_size) const { - return buffer_size_ / max_item_size; - } - - template - inline size_t max_batch_size() const { - return max_batch_size(sizeof(T)); - } - - size_t current_stock() const { - std::lock_guard lock(mutex_); - return stock_.size(); - } - - size_t num_pending() const { - std::lock_guard lock(mutex_); - return pending_.size(); - } - - void await_pending(cudaStream_t stream = 0) { - std::lock_guard lock(mutex_); - while (!pending_.empty()) { - collect_pending_unsafe(stream); - if (pending_.empty()) { - break; - } - std::this_thread::yield(); - } - } - - void deplete_stock() { - std::lock_guard lock(mutex_); - for (auto& ptr : stock_) { - Allocator::free(ptr); - } - stock_.clear(); - } - - inline std::unique_ptr> - get_unique(size_t requested_buffer_size, cudaStream_t stream = 0) { - alloc_type* ptr; - const size_t allocation_size = - get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream); - return {ptr, [this, allocation_size, stream](alloc_type* p) { - put_raw(&p, (&p) + 1, allocation_size, stream); - }}; - } - - inline std::shared_ptr get_shared(size_t requested_buffer_size, - cudaStream_t stream = 0) { - alloc_type* ptr; - const size_t allocation_size = - get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream); - return {ptr, [this, allocation_size, stream](alloc_type* p) { - put_raw(&p, (&p) + 1, allocation_size, stream); - }}; - } - - template - inline StaticWorkspace get_workspace(size_t requested_buffer_size, - cudaStream_t stream = 0) { - return {this, requested_buffer_size, stream}; - } - - inline DynamicWorkspace get_workspace(size_t n, size_t requested_buffer_size, - cudaStream_t stream = 0) { - return {this, n, requested_buffer_size, stream}; - } - - friend std::ostream& operator<<(std::ostream&, const MemoryPool&); - - private: - inline void collect_pending_unsafe(cudaStream_t stream) { - auto it{std::remove_if( - pending_.begin(), pending_.end(), [this, stream](const auto& pending) { - const cudaError_t state{cudaEventQuery(std::get<2>(pending))}; - switch (state) { - case cudaSuccess: - // Stock buffers and destroy those that are no - // longer needed, but only if the allocation_size - // is still the same as the current buffer_size. - if (stock_.size() < options_.max_stock && - std::get<1>(pending) == buffer_size_) { - stock_.emplace_back(std::get<0>(pending)); - } else { - Allocator::free(std::get<0>(pending), stream); - } - ready_events_.emplace_back(std::get<2>(pending)); - return true; - case cudaErrorNotReady: - return false; - default: - CUDA_CHECK(state); - return false; - } - })}; - pending_.erase(it, pending_.end()); - } - - inline void clear_stock_unsafe(cudaStream_t stream) { - for (auto& ptr : stock_) { - Allocator::free(ptr, stream); - } - stock_.clear(); - } - - template - inline size_t get_raw(Iterator first, Iterator const last, - size_t requested_buffer_size, cudaStream_t stream) { - // Get pre-allocated buffers if stock available. 
- size_t allocation_size; - { - std::lock_guard lock(mutex_); - - // If requested_buffer_size is within current buffer_size margins can - // reuse current buffers. - if (requested_buffer_size <= buffer_size_) { - while (first != last) { - // If no buffers available, try to make some available. - if (stock_.empty()) { - collect_pending_unsafe(stream); - if (stock_.empty()) { - // No buffers available. - break; - } - } - - // Just take the next available buffer. - *first++ = stock_.back(); - stock_.pop_back(); - } - } else { - // Drop the stock because we need more memory and those buffers have - // become useless to that end. - clear_stock_unsafe(stream); - buffer_size_ = requested_buffer_size; - } - - allocation_size = buffer_size_; - } - - // Forge new buffers until request can be filled. - for (; first != last; ++first) { - *first = Allocator::alloc(allocation_size, stream); - } - - return allocation_size; - } - - template - inline void put_raw(Iterator first, Iterator const last, - size_t allocation_size, cudaStream_t stream) { - std::lock_guard lock(mutex_); - - // If allocation_size of the workspace differs from the current buffer_size - // (i.e., somebody else requested a larger buffer since the original request - // occured), the provided buffers are incompatible and have to be discarded. - if (allocation_size != buffer_size_) { - while (first != last) { - Allocator::free(*first++); - } - return; - } - - // If the workspace that borrowed a stream was moved out of the RAII scope - // where it was created, it could happen that the stream was destroyed when - // we return the buffer ownershup. This `cudaStreamQuery` will prevent that. - if (stream && cudaStreamQuery(stream) != cudaErrorInvalidResourceHandle) { - for (; first != last; ++first) { - // Avoid adding already deallocated buffers. - if (*first == nullptr) { - continue; - } - - // Spin lock if too many pending buffers (i.e., let CPU wait for GPU). - while (ready_events_.empty()) { - collect_pending_unsafe(stream); - if (!ready_events_.empty()) { - break; - } - std::this_thread::yield(); - } - - // Queue buffer. - cudaEvent_t ready_event{ready_events_.back()}; - ready_events_.pop_back(); - CUDA_CHECK(cudaEventRecord(ready_event, stream)); - pending_.emplace_back(*first, allocation_size, ready_event); - } - } else { - // Without stream context, we must force a hard sync with the GPU. - CUDA_CHECK(cudaDeviceSynchronize()); - - for (; first != last; ++first) { - // Avoid adding already deallocated buffers. - if (*first == nullptr) { - continue; - } - - // Stock buffers and destroy those that are no longer needed. - if (stock_.size() < options_.max_stock) { - stock_.emplace_back(*first); - } else { - Allocator::free(*first); - } - } - } - } - - const MemoryPoolOptions options_; - - mutable std::mutex mutex_; - size_t buffer_size_{1}; - std::vector stock_; - std::vector ready_events_; - - std::vector> pending_; -}; - -template -std::ostream& operator<<(std::ostream& os, const MemoryPool& pool) { - std::lock_guard lock(pool.mutex_); - - for (size_t i{0}; i < 80; ++i) { - os << '-'; - } - - // Current stock. - os << "\nStock =\n"; - for (size_t i{0}; i < pool.stock_.size(); ++i) { - os << "[ " << i << " ] buffer " << static_cast(pool.stock_[i]) - << ", size = " << pool.buffer_size_ << '\n'; - } - - // Pending buffers. 
- os << "\nPending =\n"; - for (size_t i{0}; i < pool.pending_.size(); ++i) { - os << "[ " << i - << " ] buffer = " << static_cast(std::get<0>(pool.pending_[i])) - << ", size = " << std::get<1>(pool.pending_[i]) << ", ready_event = " - << static_cast(std::get<2>(pool.pending_[i])) << '\n'; - } - - // Available ready events. - os << "\nReady Events =\n"; - for (size_t i{0}; i < pool.ready_events_.size(); ++i) { - os << "[ " << i << " ] " << static_cast(pool.ready_events_[i]) - << '\n'; - } - - for (size_t i{0}; i < 80; ++i) { - os << '-'; - } - - os << '\n'; - return os; -} - -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh deleted file mode 100644 index b3fc1cc70..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include "types.cuh" -#include "utils.cuh" - -namespace nv { -namespace merlin { -namespace optimizers { - -template -__global__ void adam_update_kernel(int len, float* weight, T* m, T* v, - const T* wgrad, float alpha_t, float beta1, - float beta2, float epsilon, float scaler) { - const int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - float gi = TypeConvertFunc::convert(wgrad[i]) / scaler; - float mi = - beta1 * TypeConvertFunc::convert(m[i]) + (1.f - beta1) * gi; - float vi = beta2 * TypeConvertFunc::convert(v[i]) + - (1.f - beta2) * gi * gi; - m[i] = TypeConvertFunc::convert(mi); - v[i] = TypeConvertFunc::convert(vi); - weight[i] -= alpha_t * mi / (sqrt(vi) + epsilon); - } -} - -template -__global__ void ada_grad_update_kernel(int len, float* weight, const T* wgrad, - T* sum, float lr, const float epsilon, - float scaler) { - const int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - float gi = TypeConvertFunc::convert(wgrad[i]) / scaler; - float accum_ = TypeConvertFunc::convert(__ldg(&sum[i])); - accum_ += gi * gi; - float std_ = epsilon + sqrtf(accum_); - weight[i] -= lr * gi / std_; - sum[i] = TypeConvertFunc::convert(accum_); - } -} - -template -__global__ void momentum_sgd_update_kernel(int len, float* weight, T* momentum, - const T* wgrad, float lr, - float momentum_factor, - float scaler) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx < len) { - float mv = - momentum_factor * TypeConvertFunc::convert(momentum[idx]) - - lr * TypeConvertFunc::convert(wgrad[idx]) / scaler; - momentum[idx] = TypeConvertFunc::convert(mv); - weight[idx] += mv; - } - return; -} - -} // namespace optimizers -} // namespace merlin -} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh 
b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh deleted file mode 100644 index 210acd4a5..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace nv { -namespace merlin { - -/** - * Shorthand for a Key-Value-Meta tuple. - */ -template -struct KVM { - K key; - V* value; - M meta; -}; - -constexpr uint64_t EMPTY_KEY = UINT64_C(0xFFFFFFFFFFFFFFFF); -constexpr uint64_t RECLAIM_KEY = UINT64_C(0xFFFFFFFFFFFFFFFE); -constexpr uint64_t VACANT_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFE); -constexpr uint64_t LOCKED_KEY = UINT64_C(0xFFFFFFFFFFFFFFFD); -constexpr uint64_t RESERVED_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFC); -constexpr uint64_t MAX_META = UINT64_C(0xFFFFFFFFFFFFFFFF); -constexpr uint64_t EMPTY_META = UINT64_C(0); - -#define IS_RESERVED_KEY(key) ((RESERVED_KEY_MASK & (key)) == RESERVED_KEY_MASK) -#define IS_VACANT_KEY(key) ((VACANT_KEY_MASK & (key)) == VACANT_KEY_MASK) - -template -using AtomicKey = cuda::atomic; - -template -using AtomicMeta = cuda::atomic; - -template -using AtomicPos = cuda::atomic; - -template -struct Bucket { - AtomicKey* keys_; - AtomicMeta* metas_; - V* vectors; // Pinned memory or HBM - - /* For upsert_kernel without user specified metas - recording the current meta, the cur_meta will - increment by 1 when a new inserting happens. */ - AtomicMeta cur_meta; - - /* min_meta and min_pos is for or upsert_kernel - with user specified meta. They record the minimum - meta and its pos in the bucket. */ - AtomicMeta min_meta; - AtomicPos min_pos; - - __forceinline__ __device__ AtomicKey* keys(int index) const { - return keys_ + index; - } - - __forceinline__ __device__ AtomicMeta* metas(int index) const { - return metas_ + index; - } -}; - -template -class Lock { - mutable cuda::atomic _lock; - - public: - __device__ Lock() : _lock{1} {} - - template - __forceinline__ __device__ void acquire(CG const& g, - unsigned long long lane = 0) const { - if (g.thread_rank() == lane) { - T expected = 1; - while (!_lock.compare_exchange_weak(expected, 2, - cuda::std::memory_order_acquire)) { - expected = 1; - } - } - g.sync(); - } - - template - __forceinline__ __device__ void release(CG const& g, - unsigned long long lane = 0) const { - g.sync(); - if (g.thread_rank() == lane) { - _lock.store(1, cuda::std::memory_order_release); - } - } -}; - -using Mutex = Lock; - -template -struct Table { - Bucket* buckets; - Mutex* locks; // mutex for write buckets - int* buckets_size; // size of each buckets. - V** slices; // Handles of the HBM/ HMEM slices. - size_t dim; // Dimension of the `vectors`. - size_t bytes_per_slice; // Size by byte of one slice. - size_t num_of_memory_slices; // Number of vectors memory slices. - size_t capacity = 134217728; // Initial capacity. 
- size_t max_size = - std::numeric_limits::max(); // Up limit of the table capacity. - size_t buckets_num; // Number of the buckets. - size_t bucket_max_size = 128; // Volume of each buckets. - size_t max_hbm_for_vectors = 0; // Max HBM allocated for vectors - size_t remaining_hbm_for_vectors = 0; // Remaining HBM allocated for vectors - bool is_pure_hbm = true; // unused - bool primary = true; // unused - int slots_offset = 0; // unused - int slots_number = 0; // unused - int device_id = 0; // Device id - int tile_size; -}; - -template -using EraseIfPredictInternal = - bool (*)(const K& key, ///< iterated key in table - M& meta, ///< iterated meta in table - const K& pattern, ///< input key from caller - const M& threshold ///< input meta from caller - ); - -/** - * An abstract class provides interface between the nv::merlin::HashTable - * and a file, which enables the table to save to the file or load from - * the file, by overriding the `read` and `write` method. - * - * @tparam K The data type of the key. - * @tparam V The data type of the vector's elements. - * The item data type should be a basic data type of C++/CUDA. - * @tparam M The data type for `meta`. - * The currently supported data type is only `uint64_t`. - * - */ -template -class BaseKVFile { - public: - virtual ~BaseKVFile() {} - - /** - * Read from file and fill into the keys, values, and metas buffer. - * When calling save/load method from table, it can assume that the - * received buffer of keys, vectors, and metas are automatically - * pre-allocated. - * - * @param n The number of KV pairs expect to read. `int64_t` was used - * here to adapt to various filesytem and formats. - * @param dim The dimension of the `vectors`. - * @param keys The pointer to received buffer for keys. - * @param vectors The pointer to received buffer for vectors. - * @param metas The pointer to received buffer for metas. - * - * @return Number of KV pairs have been successfully read. - */ - virtual size_t read(const size_t n, const size_t dim, K* keys, V* vectors, - M* metas) = 0; - - /** - * Write keys, values, metas from table to the file. It defines - * an abstract method to get batch of KV pairs and write them into - * file. - * - * @param n The number of KV pairs to be written. `int64_t` was used - * here to adapt to various filesytem and formats. - * @param dim The dimension of the `vectors`. - * @param keys The keys will be written to file. - * @param vectors The vectors of values will be written to file. - * @param metas The metas will be written to file. - * - * @return Number of KV pairs have been successfully written. - */ - virtual size_t write(const size_t n, const size_t dim, const K* keys, - const V* vectors, const M* metas) = 0; -}; - -enum class OccupyResult { - INITIAL, ///< Initial status - CONTINUE, ///< Insert did not succeed, continue trying to insert - OCCUPIED_EMPTY, ///< New pair inserted successfully - OCCUPIED_RECLAIMED, - DUPLICATE, ///< Insert did not succeed, key is already present - EVICT, ///< Insert succeeded by evicting one key with minimum meta. - REFUSED, ///< Insert did not succeed, insert meta is too low. -}; - -enum class OverrideResult { - INITIAL, ///< Initial status - CONTINUE, ///< Override did not succeed, continue trying to override - SUCCESS, ///< Override successfully - REFUSED, ///< Override is refused. 
-}; - -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh deleted file mode 100644 index bd60f93fc..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include "cuda_fp16.h" -#include "cuda_runtime_api.h" -#include "debug.hpp" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" - -using namespace cooperative_groups; -namespace cg = cooperative_groups; - -/* -__inline__ __device__ uint64_t atomicCAS(uint64_t* address, uint64_t compare, - uint64_t val) { - return (uint64_t)atomicCAS((unsigned long long*)address, - (unsigned long long)compare, - (unsigned long long)val); -} - -__inline__ __device__ int64_t atomicCAS(int64_t* address, int64_t compare, - int64_t val) { - return (int64_t)atomicCAS((unsigned long long*)address, - (unsigned long long)compare, - (unsigned long long)val); -} -*/ - -__inline__ __device__ uint64_t atomicExch(uint64_t* address, uint64_t val) { - return (uint64_t)atomicExch((unsigned long long*)address, - (unsigned long long)val); -} - -__inline__ __device__ int64_t atomicExch(int64_t* address, int64_t val) { - return (int64_t)atomicExch((unsigned long long*)address, - (unsigned long long)val); -} - -__inline__ __device__ signed char atomicExch(signed char* address, - signed char val) { - signed char old = *address; - *address = val; - return old; -} - -/* -__inline__ __device__ int64_t atomicAdd(int64_t* address, const int64_t val) { - return (int64_t)atomicAdd((unsigned long long*)address, val); -} - -__inline__ __device__ uint64_t atomicAdd(uint64_t* address, - const uint64_t val) { - return (uint64_t)atomicAdd((unsigned long long*)address, val); -} -*/ - -namespace nv { -namespace merlin { - -inline void __cudaCheckError(const char* file, const int line) { -#ifdef CUDA_ERROR_CHECK - cudaError err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n", file, line, - cudaGetErrorString(err)); - exit(-1); - } - - // More careful checking. However, this will affect performance. - // Comment away if needed. - err = cudaDeviceSynchronize(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", file, - line, cudaGetErrorString(err)); - exit(-1); - } -#endif - - return; -} -//#define CudaCheckError() nv::merlin::__cudaCheckError(__FILE__, __LINE__) -#define CudaCheckError() {} - -static inline size_t SAFE_GET_GRID_SIZE(size_t N, int block_size) { - return ((N) > std::numeric_limits::max()) - ? 
((1 << 30 - 1) / block_size + 1) - : (((N)-1) / block_size + 1); -} - -static inline int SAFE_GET_BLOCK_SIZE(int block_size, int device = -1) { - cudaDeviceProp prop; - int current_device = device; - if (current_device == -1) { - CUDA_CHECK(cudaGetDevice(¤t_device)); - } - CUDA_CHECK(cudaGetDeviceProperties(&prop, current_device)); - if (block_size > prop.maxThreadsPerBlock) { - fprintf(stdout, - "The requested block_size=%d exceeds the device limit, " - "the maxThreadsPerBlock=%d will be applied.\n", - block_size, prop.maxThreadsPerBlock); - } - return std::min(prop.maxThreadsPerBlock, block_size); -} - -inline uint64_t Murmur3HashHost(const uint64_t& key) { - uint64_t k = key; - k ^= k >> 33; - k *= UINT64_C(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= UINT64_C(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - return k; -} - -__inline__ __device__ uint64_t Murmur3HashDevice(uint64_t const& key) { - uint64_t k = key; - k ^= k >> 33; - k *= UINT64_C(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= UINT64_C(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - return k; -} - -__inline__ __device__ int64_t Murmur3HashDevice(int64_t const& key) { - uint64_t k = uint64_t(key); - k ^= k >> 33; - k *= UINT64_C(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= UINT64_C(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - return int64_t(k); -} - -__inline__ __device__ uint32_t Murmur3HashDevice(uint32_t const& key) { - uint32_t k = key; - k ^= k >> 16; - k *= UINT32_C(0x85ebca6b); - k ^= k >> 13; - k *= UINT32_C(0xc2b2ae35); - k ^= k >> 16; - - return k; -} - -__inline__ __device__ int32_t Murmur3HashDevice(int32_t const& key) { - uint32_t k = uint32_t(key); - k ^= k >> 16; - k *= UINT32_C(0x85ebca6b); - k ^= k >> 13; - k *= UINT32_C(0xc2b2ae35); - k ^= k >> 16; - - return int32_t(k); -} - -class CudaDeviceRestorer { - public: - CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); } - ~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); } - - private: - int dev_; -}; - -static inline int get_dev(const void* ptr) { - cudaPointerAttributes attr; - CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); - int dev = -1; - -#if CUDART_VERSION >= 10000 - if (attr.type == cudaMemoryTypeDevice) -#else - if (attr.memoryType == cudaMemoryTypeDevice) -#endif - { - dev = attr.device; - } - return dev; -} - -static inline void switch_to_dev(const void* ptr) { - int dev = get_dev(ptr); - if (dev >= 0) { - CUDA_CHECK(cudaSetDevice(dev)); - } -} - -static inline bool is_on_device(const void* ptr) { - cudaPointerAttributes attr; - CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); - -#if CUDART_VERSION >= 10000 - return (attr.type == cudaMemoryTypeDevice); -#else - return (attr.memoryType == cudaMemoryTypeDevice); -#endif -} - -template -struct TypeConvertFunc; - -template <> -struct TypeConvertFunc<__half, float> { - static __forceinline__ __device__ __half convert(float val) { - return __float2half(val); - } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ float convert(__half val) { - return __half2float(val); - } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ float convert(float val) { return val; } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ float convert(long long val) { - return static_cast(val); - } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ float convert(unsigned int val) { - return static_cast(val); - } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ int convert(long long val) { - 
return static_cast(val); - } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ int convert(unsigned int val) { - return static_cast(val); - } -}; - -template -void realloc(P* ptr, size_t old_size, size_t new_size) { - // Truncate old_size to limit dowstream copy ops. - old_size = std::min(old_size, new_size); - - // Alloc new buffer and copy at old data. - char* new_ptr; - CUDA_CHECK(cudaMalloc(&new_ptr, new_size)); - if (*ptr != nullptr) { - CUDA_CHECK(cudaMemcpy(new_ptr, *ptr, old_size, cudaMemcpyDefault)); - CUDA_CHECK(cudaFree(*ptr)); - } - - // Zero-fill remainder. - CUDA_CHECK(cudaMemset(new_ptr + old_size, 0, new_size - old_size)); - - // Switch to new pointer. - *ptr = reinterpret_cast
<P>
(new_ptr); - return; -} - -template -void realloc_managed(P* ptr, size_t old_size, size_t new_size) { - // Truncate old_size to limit dowstream copy ops. - old_size = std::min(old_size, new_size); - - // Alloc new buffer and copy at old data. - char* new_ptr; - CUDA_CHECK(cudaMallocManaged(&new_ptr, new_size)); - if (*ptr != nullptr) { - CUDA_CHECK(cudaMemcpy(new_ptr, *ptr, old_size, cudaMemcpyDefault)); - CUDA_CHECK(cudaFree(*ptr)); - } - - // Zero-fill remainder. - CUDA_CHECK(cudaMemset(new_ptr + old_size, 0, new_size - old_size)); - - // Switch to new pointer. - *ptr = reinterpret_cast
<P>
(new_ptr); - return; -} - -template -__forceinline__ __device__ void lock( - const cg::thread_block_tile& tile, mutex& set_mutex, - unsigned long long lane = 0) { - if (THREAD_SAFE) { - set_mutex.acquire(tile, lane); - } -} - -template -__forceinline__ __device__ void unlock( - const cg::thread_block_tile& tile, mutex& set_mutex, - unsigned long long lane = 0) { - if (THREAD_SAFE) { - set_mutex.release(tile, lane); - } -} - -inline void free_pointers(cudaStream_t stream, int n, ...) { - va_list args; - va_start(args, n); - void* ptr = nullptr; - for (int i = 0; i < n; i++) { - ptr = va_arg(args, void*); - if (ptr) { - cudaPointerAttributes attr; - memset(&attr, 0, sizeof(cudaPointerAttributes)); - try { - CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); - if (attr.devicePointer && (!attr.hostPointer)) { - CUDA_CHECK(cudaFreeAsync(ptr, stream)); - } else if (attr.devicePointer && attr.hostPointer) { - CUDA_CHECK(cudaFreeHost(ptr)); - } else { - free(ptr); - } - } catch (const nv::merlin::CudaException& e) { - va_end(args); - throw e; - } - } - } - va_end(args); -} - -#define CUDA_FREE_POINTERS(stream, ...) \ - nv::merlin::free_pointers( \ - stream, (sizeof((void*[]){__VA_ARGS__}) / sizeof(void*)), __VA_ARGS__); - -static inline size_t GB(size_t n) { return n << 30; } - -static inline size_t MB(size_t n) { return n << 20; } - -static inline size_t KB(size_t n) { return n << 10; } - -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh deleted file mode 100644 index 19da172ea..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh +++ /dev/null @@ -1,1643 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "merlin/array_kernels.cuh" -#include "merlin/core_kernels.cuh" -#include "merlin/flexible_buffer.cuh" -#include "merlin/group_lock.hpp" -#include "merlin/memory_pool.cuh" -#include "merlin/types.cuh" -#include "merlin/utils.cuh" - -namespace nv { -namespace merlin { - -/** - * @brief Enumeration of the eviction strategies. - * - * @note The `meta` is introduced to define the importance of each key, the - * larger, the more important, the less likely they will be evicted. On `kLru` - * mode, the `metas` parameter of the APIs should keep `nullptr`, the meta for - * each key is assigned internally in LRU(Least Recently Used) policy. On - * `kCustomized` mode, the `metas` should be provided by caller. - * - * @note Eviction occurs automatically when a bucket is full. The keys with the - * minimum `meta` value are evicted first. - * - */ -enum class EvictStrategy { - kLru = 0, ///< LRU mode. - kCustomized = 1 ///< Customized mode. 
-}; - -/** - * @brief The options struct of HierarchicalKV. - */ -struct HashTableOptions { - size_t init_capacity = 0; ///< The initial capacity of the hash table. - size_t max_capacity = 0; ///< The maximum capacity of the hash table. - size_t max_hbm_for_vectors = 0; ///< The maximum HBM for vectors, in bytes. - size_t max_bucket_size = 128; ///< The length of each bucket. - size_t dim = 64; ///< The dimension of the vectors. - float max_load_factor = 0.5f; ///< The max load factor before rehashing. - int block_size = 128; ///< The default block size for CUDA kernels. - int io_block_size = 1024; ///< The block size for IO CUDA kernels. - int device_id = -1; ///< The ID of device. - bool io_by_cpu = false; ///< The flag indicating if the CPU handles IO. - EvictStrategy evict_strategy = EvictStrategy::kLru; ///< The evict strategy. - bool use_constant_memory = false; ///< reserved - MemoryPoolOptions - device_memory_pool; ///< Configuration options for device memory pool. - MemoryPoolOptions - host_memory_pool; ///< Configuration options for host memory pool. -}; - -/** - * @brief A customizable template function indicates which keys should be - * erased from the hash table by returning `true`. - * - * @note The `erase_if` or `export_batch_if` API traverses all of the items by - * this function and the items that return `true` are removed or exported. - * - * Example for erase_if: - * - * ``` - * template - * __forceinline__ __device__ bool erase_if_pred(const K& key, - * M& meta, - * const K& pattern, - * const M& threshold) { - * return ((key & 0xFFFF000000000000 == pattern) && - * (meta < threshold)); - * } - * ``` - * - * Example for export_batch_if: - * ``` - * template - * __forceinline__ __device__ bool export_if_pred(const K& key, - * M& meta, - * const K& pattern, - * const M& threshold) { - * return meta >= threshold; - * } - * ``` - */ -template -using EraseIfPredict = bool (*)( - const K& key, ///< The traversed key in a hash table. - M& meta, ///< The traversed meta in a hash table. - const K& pattern, ///< The key pattern to compare with the `key` argument. - const M& threshold ///< The threshold to compare with the `meta` argument. -); - -/** - * A HierarchicalKV hash table is a concurrent and hierarchical hash table that - * is powered by GPUs and can use HBM and host memory as storage for key-value - * pairs. Support for SSD storage is a future consideration. - * - * The `meta` is introduced to define the importance of each key, the - * larger, the more important, the less likely they will be evicted. Eviction - * occurs automatically when a bucket is full. The keys with the minimum `meta` - * value are evicted first. In a customized eviction strategy, we recommend - * using the timestamp or frequency of the key occurrence as the `meta` value - * for each key. You can also assign a special value to the `meta` to - * perform a customized eviction strategy. - * - * @note By default configuration, this class is thread-safe. - * - * @tparam K The data type of the key. - * @tparam V The data type of the vector's item type. - * The item data type should be a basic data type of C++/CUDA. - * @tparam M The data type for `meta`. - * The currently supported data type is only `uint64_t`. 
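For reference, a minimal host-side sketch of constructing and initializing such a table. The `<uint64_t, float, uint64_t>` template arguments, capacities, and HBM budget below are illustrative assumptions, not values taken from this change:

```cpp
#include "merlin_hashtable.cuh"

// Assumed instantiation: key = uint64_t, value item = float, meta = uint64_t.
using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

int main() {
  nv::merlin::HashTableOptions options;
  options.init_capacity = 1 << 20;                  // start with ~1M keys
  options.max_capacity = 1 << 22;                   // never rehash beyond ~4M keys
  options.dim = 64;                                 // 64-float vector per key
  options.max_hbm_for_vectors = nv::merlin::GB(1);  // vectors beyond 1 GB spill to host memory
  options.evict_strategy = nv::merlin::EvictStrategy::kLru;  // metas stay nullptr in the APIs

  Table table;
  table.init(options);  // must precede any insert/find/erase call
  return 0;
}
```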
- * - */ -template -class HashTable { - public: - using size_type = size_t; - using key_type = K; - using value_type = V; - using meta_type = M; - using Pred = EraseIfPredict; - - private: - using TableCore = nv::merlin::Table; - static constexpr unsigned int TILE_SIZE = 4; - - using DeviceMemoryPool = MemoryPool>; - using HostMemoryPool = MemoryPool>; - -#if THRUST_VERSION >= 101600 - static constexpr auto thrust_par = thrust::cuda::par_nosync; -#else - static constexpr auto thrust_par = thrust::cuda::par; -#endif - - public: - /** - * @brief Default constructor for the hash table class. - */ - HashTable(){}; - - /** - * @brief Frees the resources used by the hash table and destroys the hash - * table object. - */ - ~HashTable() { - if (initialized_) { - CUDA_CHECK(cudaDeviceSynchronize()); - - initialized_ = false; - destroy_table(&table_); - CUDA_CHECK(cudaFree(d_table_)); - dev_mem_pool_.reset(); - host_mem_pool_.reset(); - } - } - - private: - HashTable(const HashTable&) = delete; - HashTable& operator=(const HashTable&) = delete; - HashTable(HashTable&&) = delete; - HashTable& operator=(HashTable&&) = delete; - - public: - /** - * @brief Initialize a merlin::HashTable. - * - * @param options The configuration options. - */ - void init(const HashTableOptions options) { - if (initialized_) { - return; - } - options_ = options; - - if (options_.device_id >= 0) { - CUDA_CHECK(cudaSetDevice(options_.device_id)); - } else { - CUDA_CHECK(cudaGetDevice(&(options_.device_id))); - } - - // Construct table. - cudaDeviceProp deviceProp; - CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, options_.device_id)); - shared_mem_size_ = deviceProp.sharedMemPerBlock; - create_table( - &table_, options_.dim, options_.init_capacity, options_.max_capacity, - options_.max_hbm_for_vectors, options_.max_bucket_size); - options_.block_size = SAFE_GET_BLOCK_SIZE(options_.block_size); - reach_max_capacity_ = (options_.init_capacity * 2 > options_.max_capacity); - MERLIN_CHECK((!(options_.io_by_cpu && options_.max_hbm_for_vectors != 0)), - "[HierarchicalKV] `io_by_cpu` should not be true when " - "`max_hbm_for_vectors` is not 0!"); - CUDA_CHECK(cudaMalloc((void**)&(d_table_), sizeof(TableCore))); - - sync_table_configuration(); - - // Create memory pools. - dev_mem_pool_ = std::make_unique>>( - options_.device_memory_pool); - host_mem_pool_ = std::make_unique>>( - options_.host_memory_pool); - - CUDA_CHECK(cudaDeviceSynchronize()); - initialized_ = true; - CudaCheckError(); - } - - /** - * @brief Insert new key-value-meta tuples into the hash table. - * If the key already exists, the values and metas are assigned new values. - * - * If the target bucket is full, the keys with minimum meta will be - * overwritten by new key unless the meta of the new key is even less than - * minimum meta of the target bucket. - * - * @param n Number of key-value-meta tuples to insert or assign. - * @param keys The keys to insert on GPU-accessible memory with shape - * (n). - * @param values The values to insert on GPU-accessible memory with - * shape (n, DIM). - * @param metas The metas to insert on GPU-accessible memory with shape - * (n). - * @parblock - * The metas should be a `uint64_t` value. You can specify a value that - * such as the timestamp of the key insertion, number of the key - * occurrences, or another value to perform a custom eviction strategy. - * - * The @p metas should be `nullptr`, when the LRU eviction strategy is - * applied. 
- * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. - * - * @param ignore_evict_strategy A boolean option indicating whether if - * the insert_or_assign ignores the evict strategy of table with current - * metas anyway. If true, it does not check whether the metas confroms to - * the evict strategy. If false, it requires the metas follow the evict - * strategy of table. - */ - void insert_or_assign(const size_type n, - const key_type* keys, // (n) - const value_type* values, // (n, DIM) - const meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0, - bool ignore_evict_strategy = false) { - if (n == 0) { - return; - } - - while (!reach_max_capacity_ && - fast_load_factor(n, stream) > options_.max_load_factor) { - reserve(capacity() * 2, stream); - } - - if (!ignore_evict_strategy) { - check_evict_strategy(metas); - } - - writer_shared_lock lock(mutex_); - - if (is_fast_mode()) { - using Selector = - SelectUpsertKernelWithIO; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - - Selector::execute_kernel( - load_factor, options_.block_size, options_.max_bucket_size, - table_->buckets_num, options_.dim, stream, n, d_table_, keys, - reinterpret_cast(values), metas); - } else { - const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto d_dst{dev_ws.get(0)}; - auto d_src_offset{reinterpret_cast(d_dst + n)}; - - CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - upsert_kernel - <<>>( - d_table_, options_.max_bucket_size, table_->buckets_num, - options_.dim, keys, d_dst, metas, d_src_offset, N); - } - - { - thrust::device_ptr d_dst_ptr( - reinterpret_cast(d_dst)); - thrust::device_ptr d_src_offset_ptr(d_src_offset); - - thrust::sort_by_key(thrust_par.on(stream), d_dst_ptr, d_dst_ptr + n, - d_src_offset_ptr, thrust::less()); - } - - if (options_.io_by_cpu) { - const size_type host_ws_size{dev_ws_size + - n * sizeof(value_type) * dim()}; - auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; - auto h_dst{host_ws.get(0)}; - auto h_src_offset{reinterpret_cast(h_dst + n)}; - auto h_values{reinterpret_cast(h_src_offset + n)}; - - CUDA_CHECK(cudaMemcpyAsync(h_dst, d_dst, dev_ws_size, - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(h_values, values, host_ws_size - dev_ws_size, - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - write_by_cpu(h_dst, h_values, h_src_offset, dim(), n); - } else { - const size_t block_size = options_.io_block_size; - const size_t N = n * dim(); - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - write_kernel - <<>>(values, d_dst, d_src_offset, - dim(), N); - } - } - - CudaCheckError(); - } - - /** - * @brief Insert new key-value-meta tuples into the hash table. - * If the key already exists, the values and metas are assigned new values. - * - * If the target bucket is full, the keys with minimum meta will be - * overwritten by new key unless the meta of the new key is even less than - * minimum meta of the target bucket. 
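As a usage sketch for the `insert_or_assign` API documented above (buffer names and sizes are hypothetical; in LRU mode `metas` is left as `nullptr`):

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

void upsert_batch(Table& table, size_t n, size_t dim, cudaStream_t stream) {
  uint64_t* d_keys = nullptr;
  float* d_values = nullptr;
  cudaMallocAsync(&d_keys, n * sizeof(uint64_t), stream);
  cudaMallocAsync(&d_values, n * dim * sizeof(float), stream);
  // ... fill d_keys and d_values on the device ...

  // LRU table: metas are assigned internally, so pass nullptr.
  table.insert_or_assign(n, d_keys, d_values, /*metas=*/nullptr, stream);

  cudaFreeAsync(d_keys, stream);
  cudaFreeAsync(d_values, stream);
  cudaStreamSynchronize(stream);
}
```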
The overwritten key with minimum - * meta will be evicted, with its values and meta, to evicted_keys, - * evicted_values, evcted_metas seperately in compact format. - * - * @param n Number of key-value-meta tuples to insert or assign. - * @param keys The keys to insert on GPU-accessible memory with shape - * (n). - * @param values The values to insert on GPU-accessible memory with - * shape (n, DIM). - * @param metas The metas to insert on GPU-accessible memory with shape - * (n). - * @param metas The metas to insert on GPU-accessible memory with shape - * (n). - * @params evicted_keys The output of keys replaced with minimum meta. - * @params evicted_values The output of values replaced with minimum meta on - * keys. - * @params evicted_metas The output of metas replaced with minimum meta on - * keys. - * @parblock - * The metas should be a `uint64_t` value. You can specify a value that - * such as the timestamp of the key insertion, number of the key - * occurrences, or another value to perform a custom eviction strategy. - * - * The @p metas should be `nullptr`, when the LRU eviction strategy is - * applied. - * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. - * - * @param ignore_evict_strategy A boolean option indicating whether if - * the insert_or_assign ignores the evict strategy of table with current - * metas anyway. If true, it does not check whether the metas confroms to - * the evict strategy. If false, it requires the metas follow the evict - * strategy of table. - */ - size_type insert_and_evict(const size_type n, - const key_type* keys, // (n) - const value_type* values, // (n, DIM) - const meta_type* metas, // (n) - key_type* evicted_keys, // (n) - value_type* evicted_values, // (n, DIM) - meta_type* evicted_metas, // (n) - cudaStream_t stream = 0) { - if (n == 0) { - return 0; - } - - while (!reach_max_capacity_ && - fast_load_factor(n, stream) > options_.max_load_factor) { - reserve(capacity() * 2, stream); - } - - writer_shared_lock lock(mutex_); - - // TODO: Currently only need eviction when using HashTable as HBM cache. - if (!is_fast_mode()) { - throw std::runtime_error("Only allow insert_and_evict in pure HBM mode."); - } - - using Selector = - SelectUpsertAndEvictKernelWithIO; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - - // always use max tile to avoid data-deps as possible. 
- const int TILE_SIZE = 32; - size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; - const size_type dev_ws_size = - n_offsets * sizeof(int64_t) + n * sizeof(bool) + sizeof(size_type); - - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto d_offsets{dev_ws.get(0)}; - auto dn_evicted = reinterpret_cast(d_offsets + n_offsets); - auto d_masks = reinterpret_cast(dn_evicted + 1); - - CUDA_CHECK( - cudaMemsetAsync(d_offsets, 0, n_offsets * sizeof(int64_t), stream)); - CUDA_CHECK(cudaMemsetAsync(dn_evicted, 0, sizeof(size_type), stream)); - CUDA_CHECK(cudaMemsetAsync(d_masks, 0, n * sizeof(bool), stream)); - - size_type block_size = options_.block_size; - size_type grid_size = SAFE_GET_GRID_SIZE(n, block_size); - CUDA_CHECK(cudaMemsetAsync(evicted_keys, static_cast(EMPTY_KEY), - n * sizeof(K), stream)); - - Selector::execute_kernel( - load_factor, options_.block_size, options_.max_bucket_size, - table_->buckets_num, options_.dim, stream, n, d_table_, keys, values, - metas, evicted_keys, evicted_values, evicted_metas); - - keys_not_empty - <<>>(evicted_keys, d_masks, n); - size_type n_evicted = 0; - gpu_pick_kvm_inplace( - grid_size, block_size, d_masks, true, n, dn_evicted, d_offsets, evicted_keys, - evicted_values, evicted_metas, dim(), stream); - CUDA_CHECK(cudaMemcpyAsync(&n_evicted, dn_evicted, sizeof(size_type), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - CudaCheckError(); - return n_evicted; - } - - /** - * Searches for each key in @p keys in the hash table. - * If the key is found and the corresponding value in @p accum_or_assigns is - * `true`, the @p vectors_or_deltas is treated as a delta to the old - * value, and the delta is added to the old value of the key. - * - * If the key is not found and the corresponding value in @p accum_or_assigns - * is `false`, the @p vectors_or_deltas is treated as a new value and the - * key-value pair is updated in the table directly. - * - * @note When the key is found and the value of @p accum_or_assigns is - * `false`, or when the key is not found and the value of @p accum_or_assigns - * is `true`, nothing is changed and this operation is ignored. - * The algorithm assumes these situations occur while the key was modified or - * removed by other processes just now. - * - * @param n The number of key-value-meta tuples to process. - * @param keys The keys to insert on GPU-accessible memory with shape (n). - * @param value_or_deltas The values or deltas to insert on GPU-accessible - * memory with shape (n, DIM). - * @param accum_or_assigns The operation type with shape (n). A value of - * `true` indicates to accum and `false` indicates to assign. - * @param metas The metas to insert on GPU-accessible memory with shape (n). - * @parblock - * The metas should be a `uint64_t` value. You can specify a value that - * such as the timestamp of the key insertion, number of the key - * occurrences, or another value to perform a custom eviction strategy. - * - * The @p metas should be `nullptr`, when the LRU eviction strategy is - * applied. - * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. - * - * @param ignore_evict_strategy A boolean option indicating whether if - * the accum_or_assign ignores the evict strategy of table with current - * metas anyway. If true, it does not check whether the metas confroms to - * the evict strategy. If false, it requires the metas follow the evict - * strategy of table. 
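A minimal sketch of the `accum_or_assign` call whose contract is described above; `d_keys`, `d_deltas`, and `d_accum_flags` are hypothetical device buffers:

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

// d_accum_flags[i] == true  -> d_deltas[i] is added to the stored vector (if the key exists).
// d_accum_flags[i] == false -> d_deltas[i] is written as a fresh value (if the key is new).
void apply_updates(Table& table, size_t n,
                   const uint64_t* d_keys,     // (n)
                   const float* d_deltas,      // (n, dim)
                   const bool* d_accum_flags,  // (n)
                   cudaStream_t stream) {
  table.accum_or_assign(n, d_keys, d_deltas, d_accum_flags,
                        /*metas=*/nullptr, stream);
}
```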
- * - */ - void accum_or_assign(const size_type n, - const key_type* keys, // (n) - const value_type* value_or_deltas, // (n, DIM) - const bool* accum_or_assigns, // (n) - const meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0, - bool ignore_evict_strategy = false) { - if (n == 0) { - return; - } - - while (!reach_max_capacity_ && - fast_load_factor(n, stream) > options_.max_load_factor) { - reserve(capacity() * 2, stream); - } - - if (!ignore_evict_strategy) { - check_evict_strategy(metas); - } - - writer_shared_lock lock(mutex_); - - const size_type dev_ws_size{ - n * (sizeof(value_type*) + sizeof(int) + sizeof(bool))}; - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto dst{dev_ws.get(0)}; - auto src_offset{reinterpret_cast(dst + n)}; - auto founds{reinterpret_cast(src_offset + n)}; - - CUDA_CHECK(cudaMemsetAsync(dst, 0, dev_ws_size, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - accum_kernel - <<>>( - table_, keys, dst, metas, accum_or_assigns, table_->buckets, - table_->buckets_size, table_->bucket_max_size, - table_->buckets_num, src_offset, founds, N); - } - - if (!is_fast_mode()) { - thrust::device_ptr dst_ptr(reinterpret_cast(dst)); - thrust::device_ptr src_offset_ptr(src_offset); - - thrust::sort_by_key(thrust_par.on(stream), dst_ptr, dst_ptr + n, - src_offset_ptr, thrust::less()); - } - - { - const size_t block_size = options_.io_block_size; - const size_t N = n * dim(); - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - write_with_accum_kernel - <<>>(value_or_deltas, dst, - accum_or_assigns, founds, - src_offset, dim(), N); - } - - CudaCheckError(); - } - - /** - * @brief Searches the hash table for the specified keys. - * When a key is missing, the value in @p values and @p metas will be - * inserted. - * - * @param n The number of key-value-meta tuples to search or insert. - * @param keys The keys to search on GPU-accessible memory with shape (n). - * @param values The values to search on GPU-accessible memory with - * shape (n, DIM). - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * @param stream The CUDA stream that is used to execute the operation. 
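A sketch of the copying `find_or_insert` overload documented above. Note that `d_values` is both input (initial values for missing keys) and output (found values); the names are hypothetical:

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

// Missing keys are inserted with whatever d_values already holds, so the
// caller pre-fills d_values with the default/initial embedding.
void lookup_or_create(Table& table, size_t n,
                      const uint64_t* d_keys,  // (n)
                      float* d_values,         // (n, dim), in/out
                      cudaStream_t stream) {
  table.find_or_insert(n, d_keys, d_values, /*metas=*/nullptr, stream);
}
```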
- * - */ - void find_or_insert(const size_type n, const key_type* keys, // (n) - value_type* values, // (n * DIM) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0, - bool ignore_evict_strategy = false) { - if (n == 0) { - return; - } - - while (!reach_max_capacity_ && - fast_load_factor(n, stream) > options_.max_load_factor) { - reserve(capacity() * 2, stream); - } - - if (!ignore_evict_strategy) { - check_evict_strategy(metas); - } - - writer_shared_lock lock(mutex_); - - if (is_fast_mode()) { - using Selector = - SelectFindOrInsertKernelWithIO; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - Selector::execute_kernel(load_factor, options_.block_size, - options_.max_bucket_size, table_->buckets_num, - options_.dim, stream, n, d_table_, keys, values, - metas); - } else { - const size_type dev_ws_size{ - n * (sizeof(value_type*) + sizeof(int) + sizeof(bool))}; - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto d_table_value_addrs{dev_ws.get(0)}; - auto param_key_index{reinterpret_cast(d_table_value_addrs + n)}; - auto founds{reinterpret_cast(param_key_index + n)}; - - CUDA_CHECK(cudaMemsetAsync(d_table_value_addrs, 0, dev_ws_size, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - find_or_insert_kernel - <<>>( - d_table_, options_.max_bucket_size, table_->buckets_num, - options_.dim, keys, d_table_value_addrs, metas, founds, - param_key_index, N); - } - - { - thrust::device_ptr table_value_ptr( - reinterpret_cast(d_table_value_addrs)); - thrust::device_ptr param_key_index_ptr(param_key_index); - - thrust::sort_by_key(thrust_par.on(stream), table_value_ptr, - table_value_ptr + n, param_key_index_ptr, - thrust::less()); - } - - if (options_.io_by_cpu) { - const size_type host_ws_size{ - dev_ws_size + n * (sizeof(bool) + sizeof(value_type) * dim())}; - auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; - auto h_table_value_addrs{host_ws.get(0)}; - auto h_param_key_index{reinterpret_cast(h_table_value_addrs + n)}; - auto h_founds{reinterpret_cast(h_param_key_index + n)}; - auto h_param_values{reinterpret_cast(h_founds + n)}; - - CUDA_CHECK(cudaMemcpyAsync(h_table_value_addrs, d_table_value_addrs, - dev_ws_size, cudaMemcpyDeviceToHost, - stream)); - CUDA_CHECK(cudaMemcpyAsync(h_founds, founds, n * sizeof(bool), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(h_param_values, values, - n * sizeof(value_type) * dim(), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - read_or_write_by_cpu(h_table_value_addrs, h_param_values, - h_param_key_index, h_founds, dim(), n); - CUDA_CHECK(cudaMemcpyAsync(values, h_param_values, - n * sizeof(value_type) * dim(), - cudaMemcpyHostToDevice, stream)); - } else { - const size_t block_size = options_.io_block_size; - const size_t N = n * dim(); - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - read_or_write_kernel - <<>>( - d_table_value_addrs, values, founds, param_key_index, dim(), N); - } - } - - CudaCheckError(); - } - - /** - * @brief Searches the hash table for the specified keys and returns address - * of the values. When a key is missing, the value in @p values and @p metas - * will be inserted. 
- * - * @warning This API returns internal addresses for high-performance but - * thread-unsafe. The caller is responsible for guaranteeing data consistency. - * - * @param n The number of key-value-meta tuples to search or insert. - * @param keys The keys to search on GPU-accessible memory with shape (n). - * @param values The addresses of values to search on GPU-accessible memory - * with shape (n). - * @param founds The status that indicates if the keys are found on - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * @param stream The CUDA stream that is used to execute the operation. - * - */ - void find_or_insert(const size_type n, const key_type* keys, // (n) - value_type** values, // (n) - bool* founds, // (n) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0, - bool ignore_evict_strategy = false) { - if (n == 0) { - return; - } - - while (!reach_max_capacity_ && - fast_load_factor(n, stream) > options_.max_load_factor) { - reserve(capacity() * 2, stream); - } - - if (!ignore_evict_strategy) { - check_evict_strategy(metas); - } - - writer_shared_lock lock(mutex_); - - using Selector = - SelectFindOrInsertPtrKernel; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - Selector::execute_kernel(load_factor, options_.block_size, - options_.max_bucket_size, table_->buckets_num, - options_.dim, stream, n, d_table_, keys, values, - metas, founds); - - CudaCheckError(); - } - /** - * @brief Assign new key-value-meta tuples into the hash table. - * If the key doesn't exist, the operation on the key will be ignored. - * - * @param n Number of key-value-meta tuples to insert or assign. - * @param keys The keys to insert on GPU-accessible memory with shape - * (n). - * @param values The values to insert on GPU-accessible memory with - * shape (n, DIM). - * @param metas The metas to insert on GPU-accessible memory with shape - * (n). - * @parblock - * The metas should be a `uint64_t` value. You can specify a value that - * such as the timestamp of the key insertion, number of the key - * occurrences, or another value to perform a custom eviction strategy. - * - * The @p metas should be `nullptr`, when the LRU eviction strategy is - * applied. - * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. 
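A sketch of `assign`, which only touches keys that are already present (hypothetical buffer names):

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

// Unknown keys are silently ignored, so this never grows the table.
void refresh_existing(Table& table, size_t n,
                      const uint64_t* d_keys,  // (n)
                      const float* d_values,   // (n, dim)
                      cudaStream_t stream) {
  table.assign(n, d_keys, d_values, /*metas=*/nullptr, stream);
}
```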
- */ - void assign(const size_type n, - const key_type* keys, // (n) - const value_type* values, // (n, DIM) - const meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0) { - if (n == 0) { - return; - } - - writer_shared_lock lock(mutex_); - - if (is_fast_mode()) { - using Selector = - SelectUpdateKernelWithIO; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - - Selector::execute_kernel(load_factor, options_.block_size, - options_.max_bucket_size, table_->buckets_num, - options_.dim, stream, n, d_table_, keys, values, - metas); - } else { - const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto d_dst{dev_ws.get(0)}; - auto d_src_offset{reinterpret_cast(d_dst + n)}; - - CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - update_kernel - <<>>( - d_table_, options_.max_bucket_size, table_->buckets_num, - options_.dim, keys, d_dst, metas, d_src_offset, N); - } - - { - thrust::device_ptr d_dst_ptr( - reinterpret_cast(d_dst)); - thrust::device_ptr d_src_offset_ptr(d_src_offset); - - thrust::sort_by_key(thrust_par.on(stream), d_dst_ptr, d_dst_ptr + n, - d_src_offset_ptr, thrust::less()); - } - - if (options_.io_by_cpu) { - const size_type host_ws_size{dev_ws_size + - n * sizeof(value_type) * dim()}; - auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; - auto h_dst{host_ws.get(0)}; - auto h_src_offset{reinterpret_cast(h_dst + n)}; - auto h_values{reinterpret_cast(h_src_offset + n)}; - - CUDA_CHECK(cudaMemcpyAsync(h_dst, d_dst, dev_ws_size, - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(h_values, values, host_ws_size - dev_ws_size, - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - write_by_cpu(h_dst, h_values, h_src_offset, dim(), n); - } else { - const size_t block_size = options_.io_block_size; - const size_t N = n * dim(); - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - write_kernel - <<>>(values, d_dst, d_src_offset, - dim(), N); - } - } - - CudaCheckError(); - } - - /** - * @brief Searches the hash table for the specified keys. - * - * @note When a key is missing, the value in @p values is not changed. - * - * @param n The number of key-value-meta tuples to search. - * @param keys The keys to search on GPU-accessible memory with shape (n). - * @param values The values to search on GPU-accessible memory with - * shape (n, DIM). - * @param founds The status that indicates if the keys are found on - * GPU-accessible memory with shape (n). - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * @param stream The CUDA stream that is used to execute the operation. 
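A sketch of the copying `find` overload documented above, including the `founds` mask (allocation sizes and names are hypothetical):

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

void lookup_batch(const Table& table, size_t n, size_t dim,
                  const uint64_t* d_keys, cudaStream_t stream) {
  float* d_values = nullptr;
  bool* d_founds = nullptr;
  cudaMallocAsync(&d_values, n * dim * sizeof(float), stream);
  cudaMallocAsync(&d_founds, n * sizeof(bool), stream);

  // d_values[i] is written only where d_founds[i] ends up true.
  table.find(n, d_keys, d_values, d_founds, /*metas=*/nullptr, stream);

  cudaFreeAsync(d_values, stream);
  cudaFreeAsync(d_founds, stream);
  cudaStreamSynchronize(stream);
}
```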
- * - */ - void find(const size_type n, const key_type* keys, // (n) - value_type* values, // (n, DIM) - bool* founds, // (n) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0) const { - if (n == 0) { - return; - } - - CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream)); - - reader_shared_lock lock(mutex_); - - if (is_fast_mode()) { - using Selector = - SelectLookupKernelWithIO; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - Selector::execute_kernel(load_factor, options_.block_size, - options_.max_bucket_size, table_->buckets_num, - options_.dim, stream, n, d_table_, keys, values, - metas, founds); - } else { - const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto src{dev_ws.get(0)}; - auto dst_offset{reinterpret_cast(src + n)}; - - CUDA_CHECK(cudaMemsetAsync(src, 0, dev_ws_size, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - lookup_kernel - <<>>( - d_table_, options_.max_bucket_size, table_->buckets_num, - options_.dim, keys, src, metas, founds, dst_offset, N); - } - - { - thrust::device_ptr src_ptr( - reinterpret_cast(src)); - thrust::device_ptr dst_offset_ptr(dst_offset); - - thrust::sort_by_key(thrust_par.on(stream), src_ptr, src_ptr + n, - dst_offset_ptr, thrust::less()); - } - - { - const size_t block_size = options_.io_block_size; - const size_t N = n * dim(); - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - read_kernel - <<>>(src, values, founds, - dst_offset, dim(), N); - } - } - - CudaCheckError(); - } - - /** - * @brief Searches the hash table for the specified keys and returns address - * of the values. - * - * @note When a key is missing, the data in @p values won't change. - * @warning This API returns internal addresses for high-performance but - * thread-unsafe. The caller is responsible for guaranteeing data consistency. - * - * @param n The number of key-value-meta tuples to search. - * @param keys The keys to search on GPU-accessible memory with shape (n). - * @param values The addresses of values to search on GPU-accessible memory - * with shape (n). - * @param founds The status that indicates if the keys are found on - * GPU-accessible memory with shape (n). - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * @param stream The CUDA stream that is used to execute the operation. 
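And a sketch of the pointer-returning `find` overload described above, which avoids the (n, dim) copy at the cost of the thread-safety caveat in the warning (names are hypothetical):

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

// d_value_ptrs[i] points into the table's own storage when d_founds[i] is
// true; dereference the pointers only while no writer can rehash or evict.
void lookup_addresses(const Table& table, size_t n,
                      const uint64_t* d_keys,  // (n)
                      float** d_value_ptrs,    // (n), device array of device pointers
                      bool* d_founds,          // (n)
                      cudaStream_t stream) {
  table.find(n, d_keys, d_value_ptrs, d_founds, /*metas=*/nullptr, stream);
}
```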
- * - */ - void find(const size_type n, const key_type* keys, // (n) - value_type** values, // (n) - bool* founds, // (n) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0) const { - if (n == 0) { - return; - } - - CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream)); - - reader_shared_lock lock(mutex_); - - using Selector = SelectLookupPtrKernel; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - Selector::execute_kernel(load_factor, options_.block_size, - options_.max_bucket_size, table_->buckets_num, - options_.dim, stream, n, d_table_, keys, values, - metas, founds); - - CudaCheckError(); - } - - /** - * @brief Removes specified elements from the hash table. - * - * @param n The number of keys to remove. - * @param keys The keys to remove on GPU-accessible memory. - * @param stream The CUDA stream that is used to execute the operation. - * - */ - void erase(const size_type n, const key_type* keys, cudaStream_t stream = 0) { - if (n == 0) { - return; - } - - write_read_lock lock(mutex_); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - remove_kernel - <<>>( - table_, keys, table_->buckets, table_->buckets_size, - table_->bucket_max_size, table_->buckets_num, N); - } - - CudaCheckError(); - return; - } - - /** - * @brief Erases all elements that satisfy the predicate @p pred from the - * hash table. - * - * The value for @p pred should be a function with type `Pred` defined like - * the following example: - * - * ``` - * template - * __forceinline__ __device__ bool erase_if_pred(const K& key, - * const M& meta, - * const K& pattern, - * const M& threshold) { - * return ((key & 0x1 == pattern) && (meta < threshold)); - * } - * ``` - * - * @param pred The predicate function with type Pred that returns `true` if - * the element should be erased. - * @param pattern The third user-defined argument to @p pred with key_type - * type. - * @param threshold The fourth user-defined argument to @p pred with meta_type - * type. - * @param stream The CUDA stream that is used to execute the operation. - * - * @return The number of elements removed. - * - */ - size_type erase_if(const Pred& pred, const key_type& pattern, - const meta_type& threshold, cudaStream_t stream = 0) { - write_read_lock lock(mutex_); - - auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)}; - auto d_count{dev_ws.get(0)}; - - CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(size_type), stream)); - - Pred h_pred; - CUDA_CHECK(cudaMemcpyFromSymbolAsync(&h_pred, pred, sizeof(Pred), 0, - cudaMemcpyDeviceToHost, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = table_->buckets_num; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - remove_kernel - <<>>( - table_, h_pred, pattern, threshold, d_count, table_->buckets, - table_->buckets_size, table_->bucket_max_size, - table_->buckets_num, N); - } - - size_type count = 0; - CUDA_CHECK(cudaMemcpyAsync(&count, d_count, sizeof(size_type), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - CudaCheckError(); - return count; - } - - /** - * @brief Removes all of the elements in the hash table with no release - * object. 
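A sketch of wiring up `erase_if` (documented just above `clear`): the predicate is a `__device__` function whose pointer lives in a `__device__` symbol, because the implementation fetches it with `cudaMemcpyFromSymbolAsync`. The staleness policy below is an illustrative assumption:

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using K = uint64_t;
using M = uint64_t;
using Table = nv::merlin::HashTable<K, float, M>;

// Drop keys whose meta (used here as a last-touch timestamp) is older than
// the threshold; the pattern argument is unused by this policy.
__device__ bool stale_pred(const K& key, M& meta, const K& pattern,
                           const M& threshold) {
  return meta < threshold;
}

// The function pointer itself must be a device symbol.
__device__ Table::Pred d_stale_pred = stale_pred;

size_t evict_stale(Table& table, M threshold, cudaStream_t stream) {
  return table.erase_if(d_stale_pred, /*pattern=*/0, threshold, stream);
}
```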
- */ - void clear(cudaStream_t stream = 0) { - write_read_lock lock(mutex_); - - const size_t block_size = options_.block_size; - const size_t N = table_->buckets_num * table_->bucket_max_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - clear_kernel - <<>>(table_, N); - - CudaCheckError(); - } - - public: - /** - * @brief Exports a certain number of the key-value-meta tuples from the - * hash table. - * - * @param n The maximum number of exported pairs. - * @param offset The position of the key to remove. - * @param counter Accumulates amount of successfully exported values. - * @param keys The keys to dump from GPU-accessible memory with shape (n). - * @param values The values to dump from GPU-accessible memory with shape - * (n, DIM). - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. - * - * @return The number of elements dumped. - * - * @throw CudaException If the key-value size is too large for GPU shared - * memory. Reducing the value for @p n is currently required if this exception - * occurs. - */ - void export_batch(size_type n, const size_type offset, - size_type* counter, // (1) - key_type* keys, // (n) - value_type* values, // (n, DIM) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0) const { - reader_shared_lock lock(mutex_); - - if (offset >= table_->capacity) { - return; - } - n = std::min(table_->capacity - offset, n); - - size_type shared_size; - size_type block_size; - std::tie(shared_size, block_size) = - dump_kernel_shared_memory_size(shared_mem_size_); - - const size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); - - dump_kernel - <<>>( - table_, keys, values, metas, offset, n, counter); - - CudaCheckError(); - } - - size_type export_batch(const size_type n, const size_type offset, - key_type* keys, // (n) - value_type* values, // (n, DIM) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0) const { - auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)}; - auto d_counter{dev_ws.get(0)}; - - CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream)); - export_batch(n, offset, d_counter, keys, values, metas, stream); - - size_type counter = 0; - CUDA_CHECK(cudaMemcpyAsync(&counter, d_counter, sizeof(size_type), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - return counter; - } - - /** - * @brief Exports a certain number of the key-value-meta tuples which match - * specified condition from the hash table. - * - * @param n The maximum number of exported pairs. - * The value for @p pred should be a function with type `Pred` defined like - * the following example: - * - * ``` - * template - * __forceinline__ __device__ bool export_if_pred(const K& key, - * M& meta, - * const K& pattern, - * const M& threshold) { - * - * return meta > threshold; - * } - * ``` - * - * @param pred The predicate function with type Pred that returns `true` if - * the element should be exported. - * @param pattern The third user-defined argument to @p pred with key_type - * type. - * @param threshold The fourth user-defined argument to @p pred with meta_type - * type. - * @param offset The position of the key to remove. - * @param keys The keys to dump from GPU-accessible memory with shape (n). 
- * @param values The values to dump from GPU-accessible memory with shape - * (n, DIM). - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. - * - * @return The number of elements dumped. - * - * @throw CudaException If the key-value size is too large for GPU shared - * memory. Reducing the value for @p n is currently required if this exception - * occurs. - */ - template