diff --git a/WORKSPACE b/WORKSPACE index 4cf968237..ba74f8c23 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -54,6 +54,46 @@ http_archive( url = "https://github.com/sewenew/redis-plus-plus/archive/refs/tags/1.2.3.zip", ) +http_archive( + name = "hkv", + build_file = "//build_deps/toolchains/hkv:hkv.BUILD", + patch_cmds = [ + """sed -i.bak '1772i\\'$'\\n ThrustAllocator thrust_allocator_;\\n' include/merlin_hashtable.cuh""", + """sed -i.bak '225i\\'$'\\n thrust_allocator_.set_allocator(allocator_);\\n' include/merlin_hashtable.cuh""", + "sed -i.bak 's/thrust::sort_by_key(thrust_par.on(stream)/thrust::sort_by_key(thrust_par(thrust_allocator_).on(stream)/' include/merlin_hashtable.cuh", + "sed -i.bak 's/reduce(thrust_par.on(stream)/reduce(thrust_par(thrust_allocator_).on(stream)/' include/merlin_hashtable.cuh", + """sed -i.bak '125i\\'$'\\n template \\n' include/merlin/allocator.cuh""", + """sed -i.bak '126i\\'$'\\n struct ThrustAllocator : thrust::device_malloc_allocator {\\n' include/merlin/allocator.cuh""", + """sed -i.bak '127i\\'$'\\n public:\\n' include/merlin/allocator.cuh""", + """sed -i.bak '128i\\'$'\\n typedef thrust::device_malloc_allocator super_t;\\n' include/merlin/allocator.cuh""", + """sed -i.bak '129i\\'$'\\n typedef typename super_t::pointer pointer;\\n' include/merlin/allocator.cuh""", + """sed -i.bak '130i\\'$'\\n typedef typename super_t::size_type size_type;\\n' include/merlin/allocator.cuh""", + """sed -i.bak '131i\\'$'\\n public:\\n' include/merlin/allocator.cuh""", + """sed -i.bak '132i\\'$'\\n pointer allocate(size_type n) {\\n' include/merlin/allocator.cuh""", + """sed -i.bak '133i\\'$'\\n void* ptr = nullptr;\\n' include/merlin/allocator.cuh""", + """sed -i.bak '134i\\'$'\\n MERLIN_CHECK(\\n' include/merlin/allocator.cuh""", + """sed -i.bak '135i\\'$'\\n allocator_ != nullptr,\\n' include/merlin/allocator.cuh""", + """sed -i.bak '136i\\'$'\\n "[ThrustAllocator] set_allocator should be called in advance!");\\n' include/merlin/allocator.cuh""", + """sed -i.bak '137i\\'$'\\n allocator_->alloc(MemoryType::Device, &ptr, sizeof(T) * n);\\n' include/merlin/allocator.cuh""", + """sed -i.bak '138i\\'$'\\n return pointer(reinterpret_cast(ptr));\\n' include/merlin/allocator.cuh""", + """sed -i.bak '139i\\'$'\\n }\\n' include/merlin/allocator.cuh""", + """sed -i.bak '140i\\'$'\\n void deallocate(pointer p, size_type n) {\\n' include/merlin/allocator.cuh""", + """sed -i.bak '141i\\'$'\\n MERLIN_CHECK(\\n' include/merlin/allocator.cuh""", + """sed -i.bak '142i\\'$'\\n allocator_ != nullptr,\\n' include/merlin/allocator.cuh""", + """sed -i.bak '143i\\'$'\\n "[ThrustAllocator] set_allocator should be called in advance!");\\n' include/merlin/allocator.cuh""", + """sed -i.bak '144i\\'$'\\n allocator_->free(MemoryType::Device, reinterpret_cast(p.get()));\\n' include/merlin/allocator.cuh""", + """sed -i.bak '145i\\'$'\\n }\\n' include/merlin/allocator.cuh""", + """sed -i.bak '146i\\'$'\\n void set_allocator(BaseAllocator* allocator) { allocator_ = allocator; }\\n' include/merlin/allocator.cuh""", + """sed -i.bak '147i\\'$'\\n public:\\n' include/merlin/allocator.cuh""", + """sed -i.bak '148i\\'$'\\n BaseAllocator* allocator_ = nullptr;\\n' include/merlin/allocator.cuh""", + """sed -i.bak '149i\\'$'\\n };\\n' include/merlin/allocator.cuh""", + """sed -i.bak '20i\\'$'\\n #include \\n' include/merlin/allocator.cuh""", + ], + sha256 = "f8179c445a06a558262946cda4d8ae7252d313e73f792586be9b1bc0c993b1cf", + strip_prefix = "HierarchicalKV-0.1.0-beta.6", + url = 
"https://github.com/NVIDIA-Merlin/HierarchicalKV/archive/refs/tags/v0.1.0-beta.6.tar.gz", +) + tf_configure( name = "local_config_tf", ) diff --git a/build_deps/toolchains/gpu/cuda_configure.bzl b/build_deps/toolchains/gpu/cuda_configure.bzl index 2257b54dc..25c968672 100644 --- a/build_deps/toolchains/gpu/cuda_configure.bzl +++ b/build_deps/toolchains/gpu/cuda_configure.bzl @@ -68,8 +68,8 @@ _DEFAULT_CUDA_COMPUTE_CAPABILITIES = { _DEFAULT_CUDA_COMPUTE_CAPABILITIES.update( {"11.{}".format(v): [ - "6.0", - "6.1", + # "6.0", + # "6.1", "7.0", "7.5", "8.0", diff --git a/build_deps/toolchains/hkv/BUILD b/build_deps/toolchains/hkv/BUILD new file mode 100644 index 000000000..e69de29bb diff --git a/build_deps/toolchains/hkv/hkv.BUILD b/build_deps/toolchains/hkv/hkv.BUILD new file mode 100644 index 000000000..6250a9fec --- /dev/null +++ b/build_deps/toolchains/hkv/hkv.BUILD @@ -0,0 +1,18 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configured") + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "hkv", + hdrs = glob([ + "include/merlin/core_kernels/*.cuh", + "include/merlin/*.cuh", + "include/merlin_hashtable.cuh", + "include/merlin_localfile.hpp", + ]), + copts = [ + "-Ofast", + ], + include_prefix = "include", + includes = ["include"], +) \ No newline at end of file diff --git a/docs/api_docs/tfra/dynamic_embedding.md b/docs/api_docs/tfra/dynamic_embedding.md index 860dfc5be..51bd79263 100644 --- a/docs/api_docs/tfra/dynamic_embedding.md +++ b/docs/api_docs/tfra/dynamic_embedding.md @@ -47,6 +47,12 @@ Export dynamic_embedding APIs. [`class ModelMode`](../tfra/dynamic_embedding/ModelMode.md): The global config of model modes. +[`class HkvHashTable`](../tfra/dynamic_embedding/HkvHashTable.md): A generic mutable hash table implementation. + +[`class HkvHashTableConfig`](../tfra/dynamic_embedding/HkvHashTableConfig.md): HkvHashTableConfig config init_capacity, max_capacity, max_hbm_for_values of HkvHashTable + +[`class HkvHashTableCreator`](../tfra/dynamic_embedding/HkvHashTableCreator.md): A generic KV table creator. + [`class RedisTable`](../tfra/dynamic_embedding/RedisTable.md): A generic mutable hash table implementation. [`class RedisTableConfig`](../tfra/dynamic_embedding/RedisTableConfig.md): RedisTableConfig config json file for connecting Redis service and diff --git a/docs/api_docs/tfra/dynamic_embedding/CuckooHashTable.md b/docs/api_docs/tfra/dynamic_embedding/CuckooHashTable.md index 7293afae5..1bef31687 100644 --- a/docs/api_docs/tfra/dynamic_embedding/CuckooHashTable.md +++ b/docs/api_docs/tfra/dynamic_embedding/CuckooHashTable.md @@ -53,7 +53,8 @@ remove method. It does not support initialization via the init method. ```python table = tfra.dynamic_embedding.CuckooHashTable(key_dtype=tf.string, value_dtype=tf.int64, - default_value=-1) + default_value=-1, + device=['/GPU:0']) sess.run(table.insert(keys, values)) out = table.lookup(query_keys) print(out.eval()) @@ -106,6 +107,10 @@ A `CuckooHashTable` object. * `ValueError`: If checkpoint is True and no name was specified. +## `Important update!!` + +We have made updates to the underlying implementation of the CuckooHashTable. The original CPU table remains unchanged, but the GPU table now uses the HKV implementation instead of nvhash. To ensure interface consistency, the init_capacity and max_capacity of HKV will be set to the init_size value you pass in. 
It is important to note that, with this setting, the GPU hash table will not automatically resize and its final capacity will be the same as the init_size. The max_hbm_for_values parameter of HKV will be set to a number large enough to ensure that all of your data is stored in the GPU table. Additionally, HKV requires a GPU compute capability of 8.0 or above. For more detailed information, please refer to the HKV documentation. + + ## Properties diff --git a/docs/api_docs/tfra/dynamic_embedding/CuckooHashTableCreator.md b/docs/api_docs/tfra/dynamic_embedding/CuckooHashTableCreator.md index deb8b588a..db8afb080 100644 --- a/docs/api_docs/tfra/dynamic_embedding/CuckooHashTableCreator.md +++ b/docs/api_docs/tfra/dynamic_embedding/CuckooHashTableCreator.md @@ -41,13 +41,10 @@ class for creating the real KV table backend(TF resource). #### Example usage: - +Since CuckooHashTableConfig contains no parameters and its defaults are sufficient, setting the `saver` parameter is all that is needed. ```python -redis_config1=tfra.dynamic_embedding.RedisTableConfig( - redis_config_abs_dir="xx/yy.json" -) -redis_creator1=tfra.dynamic_embedding.RedisTableCreator(redis_config1) +cuckoo_creator=tfra.dynamic_embedding.CuckooHashTableCreator(saver=de.FileSystemSaver()) ```
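Referring back to the `Important update` note for `CuckooHashTable` above, a minimal sketch of a GPU-placed table (the dtypes and `init_size` value are illustrative assumptions, not recommendations):

```python
import tensorflow as tf
import tensorflow_recommenders_addons as tfra

# On GPU the table is now backed by HKV; init_size becomes both the
# init_capacity and the max_capacity of the underlying HKV table, so the
# table will not resize beyond this value.
table = tfra.dynamic_embedding.CuckooHashTable(
    key_dtype=tf.int64,
    value_dtype=tf.float32,
    default_value=-1,
    init_size=1024 * 1024,
    device=['/GPU:0'])
```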

__init__

diff --git a/docs/api_docs/tfra/dynamic_embedding/HkvHashTable.md b/docs/api_docs/tfra/dynamic_embedding/HkvHashTable.md new file mode 100644 index 000000000..04022b0b3 --- /dev/null +++ b/docs/api_docs/tfra/dynamic_embedding/HkvHashTable.md @@ -0,0 +1,381 @@ +
+ + + + + + + + + + + + + + + +
+ +# tfra.dynamic_embedding.HkvHashTable + + + + + +
+ + + View source on GitHub + +
+
+
+
+
+ + + +## Class `HkvHashTable` + +A generic mutable hash table implementation. + +HkvHashTable is a multi-level cache hash table that allows storing values simultaneously in both GPU and CPU memory. It enables efficient utilization of training resources while ensuring high-performance lookups and inserts. This greatly expands the capacity of the hash table, making it suitable for more complex training tasks. For more detailed information about HierarchicalKV, please refer to [HierarchicalKV +](https://github.com/NVIDIA-Merlin/HierarchicalKV). + + +#### Environment requirements + +* CUDA version >= 11.2 +* NVIDIA GPU with compute capability 8.0, 8.6, 8.7 or 9.0 +* GCC supporting the C++17 standard or later. + + +#### Example usage: + + + +```python +table = tfra.dynamic_embedding.HkvHashTable(key_dtype=tf.string, + value_dtype=tf.int64, + default_value=-1) +sess.run(table.insert(keys, values)) +out = table.lookup(query_keys) +print(out.eval()) +``` + +
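The constructor also exposes the capacity-related arguments documented under `__init__` below; a minimal sketch (the dtypes and capacity values are illustrative assumptions, not recommended settings):

```python
import tensorflow as tf
import tensorflow_recommenders_addons as tfra

# The table starts at init_capacity, never grows beyond max_capacity, and
# keeps at most max_hbm_for_values bytes of values in GPU memory (HBM);
# the remaining values are kept in host memory.
table = tfra.dynamic_embedding.HkvHashTable(
    key_dtype=tf.int64,
    value_dtype=tf.float32,
    default_value=-1,
    init_capacity=1024 * 1024,
    max_capacity=2 * 1024 * 1024,
    max_hbm_for_values=1024 * 1024 * 1024)  # 1 GiB
```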

__init__

+ +View source + +``` python +KHkvHashTableInitCapacity = 1024 * 1024 +KHkvHashTableMaxCapacity = 1024 * 1024 +KHkvHashTableMaxHbmForValues = 1024 * 1024 * 1024 + + +__init__( + key_dtype, + value_dtype, + default_value, + name='HkvHashTable', + checkpoint=(True), + init_capacity=KHkvHashTableInitCapacity, + max_capacity=KHkvHashTableMaxCapacity, + max_hbm_for_values=KHkvHashTableMaxHbmForValues, + config=None, + device='', +) +``` + +Creates an empty `HkvHashTable` object. + +Creates a table whose key and value types are specified by key_dtype +and value_dtype, respectively. + +#### Args: + + +* `key_dtype`: the type of the key tensors. +* `value_dtype`: the type of the value tensors. +* `default_value`: The value to use if a key is missing in the table. +* `name`: A name for the operation (optional). +* `checkpoint`: if True, the contents of the table are saved to and restored + from checkpoints. If `shared_name` is empty for a checkpointed table, it + is shared using the table node name. +* `init_capacity`: The initial capacity of the hash table. +* `max_capacity`: The maximum capacity of the hash table. +* `max_hbm_for_values`: The maximum HBM capacity occupied by the values of the hash table, measured in bytes. +* `config`: an `HkvHashTableConfig` object (optional); its fields correspond to the capacity arguments above. +* `device`: the device on which to place the table, e.g. `'/GPU:0'` (optional). + + +#### Returns: + +A `HkvHashTable` object. + + + +#### Raises: + + +* `ValueError`: If checkpoint is True and no name was specified. + + + +## Properties +

key_dtype

+ +The table key dtype. + + +

name

+ +The name of the table. + + +

resource_handle

+ +Returns the resource handle associated with this Resource. + + +

value_dtype

+ +The table value dtype. + + + + +## Methods + +

__getitem__

+ +``` python +__getitem__(keys) +``` + +Looks up `keys` in a table, outputs the corresponding values. + + +

accum

+ +View source + +``` python +accum( + keys, + values_or_deltas, + exists, + name=None +) +``` + +Associates `keys` with `values`. + + +#### Args: + + +* `keys`: Keys to accumulate. Can be a tensor of any shape. + Must match the table's key type. +* `values_or_deltas`: Values to be associated with keys. Must be a tensor of + the same shape as `keys` and match the table's value type. +* `exists`: A bool type tensor that indicates whether the keys already exist or not. + Must be a tensor of the same shape as `keys`. +* `name`: A name for the operation (optional). + + +#### Returns: + +The created Operation. + + + +#### Raises: + + +* `TypeError`: when `keys` or `values` doesn't match the table data + types. + 
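A short sketch of the `accum` workflow, assuming an int64-keyed, float32-valued table named `table` (keys and deltas are illustrative); `exists` is obtained from `lookup` with `return_exists=True`:

```python
import tensorflow as tf

keys = tf.constant([1, 2, 3], dtype=tf.int64)
deltas = tf.constant([[0.1], [0.2], [0.3]], dtype=tf.float32)  # same value shape as the table

# First query which keys are already present, then accumulate the deltas,
# passing `exists` so the table knows which keys were already stored.
_, exists = table.lookup(keys, return_exists=True)
accum_op = table.accum(keys, values_or_deltas=deltas, exists=exists)
```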

clear

+ +View source + +``` python +clear(name=None) +``` + +Clear all keys and values in the table. + + +#### Args: + + +* `name`: A name for the operation (optional). + + +#### Returns: + +The created Operation. + +

export

+ +View source + +``` python +export(name=None) +``` + +Returns tensors of all keys and values in the table. + + +#### Args: + + +* `name`: A name for the operation (optional). + + +#### Returns: + +A pair of tensors with the first tensor containing all keys and the + second tensors containing all values in the table. + + +

insert

+ +View source + +``` python +insert( + keys, + values, + name=None +) +``` + +Associates `keys` with `values`. + + +#### Args: + + +* `keys`: Keys to insert. Can be a tensor of any shape. Must match the table's + key type. +* `values`: Values to be associated with keys. Must be a tensor of the same + shape as `keys` and match the table's value type. +* `name`: A name for the operation (optional). + + +#### Returns: + +The created Operation. + + + +#### Raises: + + +* `TypeError`: when `keys` or `values` doesn't match the table data + types. + +

lookup

+ +View source + +``` python +lookup( + keys, + dynamic_default_values=None, + return_exists=(False), + name=None +) +``` + +Looks up `keys` in a table, outputs the corresponding values. + +The `default_value` is used for keys not present in the table. + +#### Args: + + +* `keys`: Keys to look up. Can be a tensor of any shape. Must match the + table's key_dtype. +* `dynamic_default_values`: The values to use if a key is missing in the + table. If None (by default), the static default_value + `self._default_value` will be used. +* `return_exists`: if True, will return an additional Tensor which indicates + whether or not the keys exist in the table. +* `name`: A name for the operation (optional). + + +#### Returns: + +A tensor containing the values in the same shape as `keys` using the + table's value type. + +* `exists`: A bool type Tensor of the same shape as `keys` which indicates + whether the keys exist in the table. + Only provided if `return_exists` is True. + + +#### Raises: + + +* `TypeError`: when `keys` do not match the table data types. + 

remove

+ +View source + +``` python +remove( + keys, + name=None +) +``` + +Removes `keys` and their associated values from the table. + +If a key is not present in the table, it is silently ignored. + +#### Args: + + +* `keys`: Keys to remove. Can be a tensor of any shape. Must match the table's + key type. +* `name`: A name for the operation (optional). + + +#### Returns: + +The created Operation. + + + +#### Raises: + + +* `TypeError`: when `keys` do not match the table data types. + 

size

+ +View source + +``` python +size(name=None) +``` + +Compute the number of elements in this table. + + +#### Args: + + +* `name`: A name for the operation (optional). + + +#### Returns: + +A scalar tensor containing the number of elements in this table. diff --git a/docs/api_docs/tfra/dynamic_embedding/HkvHashTableConfig.md b/docs/api_docs/tfra/dynamic_embedding/HkvHashTableConfig.md new file mode 100644 index 000000000..4bba1b907 --- /dev/null +++ b/docs/api_docs/tfra/dynamic_embedding/HkvHashTableConfig.md @@ -0,0 +1,66 @@ +
+ + + +
+ +# tfra.dynamic_embedding.HkvHashTableConfig + + + + + +
+ + + View source on GitHub + +
+
+
+
+
+ + + +## Class `HkvHashTableConfig` + + + + + + + + +

__init__

+ +View source + +``` python + +KHkvHashTableInitCapacity = 1024 * 1024 +KHkvHashTableMaxCapacity = 1024 * 1024 +KHkvHashTableMaxHbmForValues = 1024 * 1024 * 1024 + +__init__( + init_capacity=KHkvHashTableInitCapacity, + max_capacity=KHkvHashTableMaxCapacity, + max_hbm_for_values=KHkvHashTableMaxHbmForValues +) +``` + +HkvHashTableConfig contains three parameters that configure the hash table; they all have default values. + +#### Args: + + +* `init_capacity`: The initial capacity of the hash table. +* `max_capacity`: The maximum capacity of the hash table. +* `max_hbm_for_values`: The maximum HBM for values, in bytes. + +#### Configuration Suggestions + +* `Pure HBM mode`: set the max_hbm_for_values >= sizeof(V) * dim * max_capacity +* `HBM + HMEM mode`: set the max_hbm_for_values < sizeof(V) * dim * max_capacity +* `Pure HMEM mode`: set the max_hbm_for_values = 0 +* If max_capacity == init_capacity, the total HBM + HMEM consumption is sizeof(V) * dim * max_capacity (a worked example follows below). diff --git a/docs/api_docs/tfra/dynamic_embedding/HkvHashTableCreator.md b/docs/api_docs/tfra/dynamic_embedding/HkvHashTableCreator.md new file mode 100644 index 000000000..e318ad90d --- /dev/null +++ b/docs/api_docs/tfra/dynamic_embedding/HkvHashTableCreator.md @@ -0,0 +1,118 @@ +
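To make the `HkvHashTableConfig` suggestions above concrete, a small worked example with assumed numbers (float32 values and an embedding dim of 64):

```python
import numpy as np

value_bytes = np.dtype(np.float32).itemsize  # sizeof(V) = 4
dim = 64                                     # embedding dimension (assumed)
max_capacity = 1024 * 1024

values_total = value_bytes * dim * max_capacity  # 268435456 bytes = 256 MiB

pure_hbm_mode = values_total       # max_hbm_for_values >= total keeps every value in HBM
hbm_hmem_mode = values_total // 2  # anything below the total spills the rest to host memory
pure_hmem_mode = 0                 # all values live in host memory
```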
+ + + + + +
+ +# tfra.dynamic_embedding.HkvHashTableCreator + + + + + +
+ + + View source on GitHub + +
+
+
+
+
+ + + +## Class `HkvHashTableCreator` + + A generic KV table creator. + + + + + + The KV table instance will be created by the create function with the given config. +A config class for the specific table backend should be initialized +before calling the creator function. + The KVCreator class instance is then passed to the Variable +class, which creates the real KV table backend (TF resource). + +#### Example usage: + + + +```python +hkv_config=tfra.dynamic_embedding.HkvHashTableConfig( + init_capacity=1024 * 1024, + max_capacity=1024 * 1024, + max_hbm_for_values=0, +) +hkv_creator=tfra.dynamic_embedding.HkvHashTableCreator(config=hkv_config) +``` + +
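The creator is then handed to the dynamic embedding variable that owns the table; a minimal sketch, assuming the `kv_creator` argument of `tfra.dynamic_embedding.get_variable` (the variable name and dimension are illustrative):

```python
import tensorflow as tf
import tensorflow_recommenders_addons as tfra

# The variable builds the real HKV-backed table through hkv_creator.create(...).
embeddings = tfra.dynamic_embedding.get_variable(
    name="user_embeddings",
    key_dtype=tf.int64,
    value_dtype=tf.float32,
    dim=64,
    kv_creator=hkv_creator)
```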

__init__

+ +View source + +``` python +__init__(config=None) +``` + +Initialize self. See help(type(self)) for accurate signature. + + + + +## Methods + +

create

+ +View source + +``` python +create( + key_dtype=None, + value_dtype=None, + default_value=None, + name=None, + checkpoint=None, + init_size=KHkvHashTableInitCapacity, + config=None, + device=None, + shard_saveable_object_fn=None, +) +``` + + + + +

get_config

+ +View source + +``` python +get_config() + +#return as follow + +config = { + 'key_dtype': self.key_dtype, + 'value_dtype': self.value_dtype, + 'default_value': self.default_value.numpy(), + 'name': self.name, + 'checkpoint': self.checkpoint, + 'init_capacity': self.init_capacity, + 'max_capacity': self.max_capacity, + 'max_hbm_for_values': self.max_hbm_for_values + 'config': self.config, + 'device': self.device, +} +``` + + + + + + diff --git a/tensorflow_recommenders_addons/dynamic_embedding/__init__.py b/tensorflow_recommenders_addons/dynamic_embedding/__init__.py index 6f601ea1c..3269c9171 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/__init__.py +++ b/tensorflow_recommenders_addons/dynamic_embedding/__init__.py @@ -18,6 +18,9 @@ 'CuckooHashTable', 'CuckooHashTableConfig', 'CuckooHashTableCreator', + 'HkvHashTable', + 'HkvHashTableConfig', + 'HkvHashTableCreator', 'RedisTable', 'RedisTableConfig', 'RedisTableCreator', @@ -51,10 +54,13 @@ from tensorflow_recommenders_addons.dynamic_embedding.python.ops import math_ops as math from tensorflow_recommenders_addons.dynamic_embedding.python.ops import data_flow_ops as data_flow from tensorflow_recommenders_addons.dynamic_embedding.python.ops.dynamic_embedding_creator import ( - KVCreator, CuckooHashTableConfig, CuckooHashTableCreator, RedisTableConfig, + KVCreator, CuckooHashTableConfig, CuckooHashTableCreator, + HkvHashTableConfig, HkvHashTableCreator, RedisTableConfig, RedisTableCreator, FileSystemSaver) from tensorflow_recommenders_addons.dynamic_embedding.python.ops.cuckoo_hashtable_ops import ( CuckooHashTable,) +from tensorflow_recommenders_addons.dynamic_embedding.python.ops.hkv_hashtable_ops import ( + HkvHashTable,) from tensorflow_recommenders_addons.dynamic_embedding.python.ops.redis_table_ops import ( RedisTable,) from tensorflow_recommenders_addons.dynamic_embedding.python.ops.dynamic_embedding_ops import ( diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD b/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD index a2e030135..0879258bb 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/BUILD @@ -4,7 +4,7 @@ load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configure package(default_visibility = ["//visibility:public"]) -load("//tensorflow_recommenders_addons:tensorflow_recommenders_addons.bzl", "custom_op_library", "if_cuda_for_tf_serving") +load("//tensorflow_recommenders_addons:tensorflow_recommenders_addons.bzl", "custom_cuda_op_library", "custom_op_library", "if_cuda_for_tf_serving") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("@local_config_tf//:build_defs.bzl", "FOR_TF_SERVING") @@ -17,18 +17,6 @@ custom_op_library( "utils/utils.h", "utils/types.h", ] + glob(["kernels/lookup_impl/lookup_table_op_cpu*"]), - cuda_deps = if_cuda_for_tf_serving( - ["//tensorflow_recommenders_addons/dynamic_embedding/core/lib/nvhash:nvhashtable"], - [], - FOR_TF_SERVING, - ), - cuda_srcs = if_cuda([ - "kernels/cuckoo_hashtable_op.h", - "kernels/cuckoo_hashtable_op_gpu.h", - "kernels/cuckoo_hashtable_op_gpu.cu.cc", - "utils/utils.h", - "utils/types.h", - ] + glob(["kernels/lookup_impl/lookup_table_op_gpu*"])), deps = [ "//tensorflow_recommenders_addons/dynamic_embedding/core/lib/cuckoo:cuckoohash", ], @@ -79,11 +67,6 @@ custom_op_library( "ops/math_ops.cc", "utils/utils.h", ], - cuda_deps = if_cuda_for_tf_serving( - 
["//tensorflow_recommenders_addons/dynamic_embedding/core/lib/nvhash:nvhashtable"], - [], - FOR_TF_SERVING, - ), cuda_srcs = [ "kernels/segment_reduction_ops.h", "kernels/segment_reduction_ops_gpu.cu.cc", @@ -98,11 +81,6 @@ custom_op_library( "ops/data_flow_ops.cc", "utils/utils.h", ], - cuda_deps = if_cuda_for_tf_serving( - ["//tensorflow_recommenders_addons/dynamic_embedding/core/lib/nvhash:nvhashtable"], - [], - FOR_TF_SERVING, - ), cuda_srcs = [ "kernels/fill_functor.cu.cc", "kernels/dynamic_partition_op_gpu.cu.cc", @@ -112,3 +90,24 @@ custom_op_library( ) # TODO: Add hkv targets. +custom_cuda_op_library( + name = "_hkv_ops.so", + srcs = [ + "ops/hkv_hashtable_ops.cc", + "utils/utils.h", + ], + copts = [ + "-Ofast", + ], + cuda_deps = if_cuda_for_tf_serving( + ["@hkv//:hkv"], + [], + FOR_TF_SERVING, + ), + cuda_srcs = if_cuda([ + "utils/utils.h", + "utils/types.h", + "kernels/cuckoo_hashtable_op_gpu.h", + "kernels/hkv_hashtable_op_gpu.cu.cc", + ]) + glob(["kernels/lookup_impl/lookup_table_op_hkv*"]), +) diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc deleted file mode 100644 index 7af17be5c..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/hkv_hashtable_ops.cc +++ /dev/null @@ -1,311 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_def_builder.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/utils.h" - -namespace tensorflow { - -using shape_inference::DimensionHandle; -using shape_inference::InferenceContext; -using shape_inference::ShapeAndType; -using shape_inference::ShapeHandle; - -namespace { - -Status ScalarAndTwoElementVectorInputsAndScalarOutputs(InferenceContext* c) { - ShapeHandle handle; - DimensionHandle unused_handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - for (int i = 1; i < c->num_inputs(); ++i) { - TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &handle)); - TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_handle)); - } - for (int i = 0; i < c->num_outputs(); ++i) { - c->set_output(i, c->Scalar()); - } - return Status::OK(); -} - -} // namespace - -Status ValidateTableResourceHandle(InferenceContext* c, ShapeHandle keys, - const string& key_dtype_attr, - const string& value_dtype_attr, - bool is_lookup, - ShapeAndType* output_shape_and_type) { - auto* handle_data = c->input_handle_shapes_and_types(0); - if (handle_data == nullptr || handle_data->size() != 2) { - output_shape_and_type->shape = c->UnknownShape(); - output_shape_and_type->dtype = DT_INVALID; - } else { - const ShapeAndType& key_shape_and_type = (*handle_data)[0]; - const ShapeAndType& value_shape_and_type = (*handle_data)[1]; - DataType key_dtype; - TF_RETURN_IF_ERROR(c->GetAttr(key_dtype_attr, &key_dtype)); - if (key_shape_and_type.dtype != key_dtype) { - return errors::InvalidArgument( - "Trying to read value with wrong dtype. " - "Expected ", - DataTypeString(key_shape_and_type.dtype), " got ", - DataTypeString(key_dtype)); - } - DataType value_dtype; - TF_RETURN_IF_ERROR(c->GetAttr(value_dtype_attr, &value_dtype)); - if (value_shape_and_type.dtype != value_dtype) { - return errors::InvalidArgument( - "Trying to read value with wrong dtype. " - "Expected ", - DataTypeString(value_shape_and_type.dtype), " got ", - DataTypeString(value_dtype)); - } - output_shape_and_type->dtype = value_shape_and_type.dtype; - - if (is_lookup) { - if (c->RankKnown(key_shape_and_type.shape) && c->RankKnown(keys)) { - int keys_rank = c->Rank(keys); - int key_suffix_rank = c->Rank(key_shape_and_type.shape); - if (keys_rank < key_suffix_rank) { - return errors::InvalidArgument( - "Expected keys to have suffix ", - c->DebugString(key_shape_and_type.shape), - " but saw shape: ", c->DebugString(keys)); - } - for (int d = 0; d < key_suffix_rank; d++) { - // Ensure the suffix of keys match what's in the Table. 
- DimensionHandle dim = c->Dim(key_shape_and_type.shape, d); - TF_RETURN_IF_ERROR( - c->ReplaceDim(keys, keys_rank - key_suffix_rank + d, dim, &keys)); - } - std::vector keys_prefix_vec; - keys_prefix_vec.reserve(keys_rank - key_suffix_rank); - for (int d = 0; d < keys_rank - key_suffix_rank; ++d) { - keys_prefix_vec.push_back(c->Dim(keys, d)); - } - ShapeHandle keys_prefix = c->MakeShape(keys_prefix_vec); - TF_RETURN_IF_ERROR(c->Concatenate(keys_prefix, - value_shape_and_type.shape, - &output_shape_and_type->shape)); - } else { - output_shape_and_type->shape = c->UnknownShape(); - } - } else { - TF_RETURN_IF_ERROR(c->Concatenate(keys, value_shape_and_type.shape, - &output_shape_and_type->shape)); - } - } - return Status::OK(); -} - -Status HkvHashTableShape(InferenceContext* c, const ShapeHandle& key, - const ShapeHandle& value) { - c->set_output(0, c->Scalar()); - - ShapeHandle key_s; - TF_RETURN_IF_ERROR(c->WithRankAtMost(key, 1, &key_s)); - - DataType key_t; - TF_RETURN_IF_ERROR(c->GetAttr("key_dtype", &key_t)); - - DataType value_t; - TF_RETURN_IF_ERROR(c->GetAttr("value_dtype", &value_t)); - - c->set_output_handle_shapes_and_types( - 0, std::vector{{key_s, key_t}, {value, value_t}}); - - return Status::OK(); -} - -REGISTER_OP("TfraHkvHashTableFind") - .Input("table_handle: resource") - .Input("keys: Tin") - .Input("default_value: Tout") - .Output("values: Tout") - .Attr("Tin: type") - .Attr("Tout: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - - ShapeAndType value_shape_and_type; - TF_RETURN_IF_ERROR(ValidateTableResourceHandle( - c, - /*keys=*/c->input(1), - /*key_dtype_attr=*/"Tin", - /*value_dtype_attr=*/"Tout", - /*is_lookup=*/true, &value_shape_and_type)); - c->set_output(0, value_shape_and_type.shape); - - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableFindWithExists") - .Input("table_handle: resource") - .Input("keys: Tin") - .Input("default_value: Tout") - .Output("values: Tout") - .Output("exists: bool") - .Attr("Tin: type") - .Attr("Tout: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - - ShapeHandle keys = c->UnknownShapeOfRank(1); - ShapeAndType value_shape_and_type; - TF_RETURN_IF_ERROR(ValidateTableResourceHandle( - c, - /*keys=*/c->input(1), - /*key_dtype_attr=*/"Tin", - /*value_dtype_attr=*/"Tout", - /*is_lookup=*/true, &value_shape_and_type)); - c->set_output(0, value_shape_and_type.shape); - c->set_output(1, keys); - - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableInsert") - .Input("table_handle: resource") - .Input("keys: Tin") - .Input("values: Tout") - .Attr("Tin: type") - .Attr("Tout: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - - // TODO: Validate keys and values shape. - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableAccum") - .Input("table_handle: resource") - .Input("keys: key_dtype") - .Input("values_or_deltas: value_dtype") - .Input("exists: bool") - .Attr("key_dtype: type") - .Attr("value_dtype: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - - // TODO: Validate keys and values shape. 
- return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableRemove") - .Input("table_handle: resource") - .Input("keys: Tin") - .Attr("Tin: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &handle)); - - // TODO(turboale): Validate keys shape. - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableClear") - .Input("table_handle: resource") - .Attr("key_dtype: type") - .Attr("value_dtype: type"); - -REGISTER_OP("TfraHkvHashTableSize") - .Input("table_handle: resource") - .Output("size: int64") - .SetShapeFn(ScalarAndTwoElementVectorInputsAndScalarOutputs); - -REGISTER_OP("TfraHkvHashTableExport") - .Input("table_handle: resource") - .Output("keys: Tkeys") - .Output("values: Tvalues") - .Attr("Tkeys: type") - .Attr("Tvalues: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - ShapeHandle keys = c->UnknownShapeOfRank(1); - ShapeAndType value_shape_and_type; - TF_RETURN_IF_ERROR(ValidateTableResourceHandle( - c, - /*keys=*/keys, - /*key_dtype_attr=*/"Tkeys", - /*value_dtype_attr=*/"Tvalues", - /*is_lookup=*/false, &value_shape_and_type)); - c->set_output(0, keys); - c->set_output(1, value_shape_and_type.shape); - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableSaveToFileSystem") - .Input("table_handle: resource") - .Input("dirpath: string") - .Input("file_name: string") - .Attr("key_dtype: type") - .Attr("value_dtype: type") - .Attr("dirpath_env: string") - .Attr("append_to_file: bool") - .Attr("buffer_size: int >= 1"); - -REGISTER_OP("TfraHkvHashTableImport") - .Input("table_handle: resource") - .Input("keys: Tin") - .Input("values: Tout") - .Attr("Tin: type") - .Attr("Tout: type") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle handle; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); - - ShapeHandle keys; - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &keys)); - TF_RETURN_IF_ERROR(c->Merge(keys, c->input(2), &keys)); - return Status::OK(); - }); - -REGISTER_OP("TfraHkvHashTableLoadFromFileSystem") - .Input("table_handle: resource") - .Input("dirpath: string") - .Input("file_name: string") - .Attr("key_dtype: type") - .Attr("value_dtype: type") - .Attr("dirpath_env: string") - .Attr("load_entire_dir: bool") - .Attr("buffer_size: int >= 1"); - -REGISTER_OP("TfraHkvHashTableOfTensors") - .Output("table_handle: resource") - .Attr("container: string = ''") - .Attr("shared_name: string = ''") - .Attr("use_node_name_sharing: bool = false") - .Attr("key_dtype: type") - .Attr("value_dtype: type") - .Attr("value_shape: shape = {}") - .Attr("init_capacity: int = 0") - .Attr("max_capacity: int = 0") - .SetIsStateful() - .SetShapeFn([](InferenceContext* c) { - PartialTensorShape value_p; - TF_RETURN_IF_ERROR(c->GetAttr("value_shape", &value_p)); - ShapeHandle value_s; - TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(value_p, &value_s)); - return HkvHashTableShape(c, /*key=*/c->Scalar(), /*value=*/value_s); - }); -} // namespace tensorflow diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc index 3803893c3..5911bb778 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/hkv_hashtable_op_gpu.cu.cc @@ 
-15,7 +15,7 @@ limitations under the License. #if GOOGLE_CUDA #include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/cuckoo_hashtable_op_gpu.h" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h" #include "tensorflow_recommenders_addons/dynamic_embedding/core/utils/utils.h" #define EIGEN_USE_GPU @@ -35,10 +35,9 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/path.h" #include "tensorflow/core/util/env_var.h" -#include "tensorflow/stream_executor/stream.h" - #include "tensorflow/core/util/gpu_device_functions.h" #include "tensorflow/core/util/gpu_kernel_helper.h" +#include "tensorflow/stream_executor/stream.h" namespace tensorflow { @@ -47,7 +46,7 @@ using GPUDevice = Eigen::GpuDevice; namespace recommenders_addons { namespace lookup { -constexpr size_t kDefaultGpuInitCapacity = 1024; +constexpr size_t kDefaultGpuInitCapacity = 1024 * 1024; using tensorflow::OpKernelContext; using tensorflow::lookup::LookupInterface; @@ -55,6 +54,7 @@ using tensorflow::lookup::LookupInterface; template class HkvHashTableOfTensorsGpu final : public LookupInterface { private: + std::unique_ptr allocator_ptr_; public: HkvHashTableOfTensorsGpu(OpKernelContext* ctx, OpKernel* kernel) { @@ -70,56 +70,78 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { int64 init_capacity_i64 = 0; int64 max_capacity_i64 = 0; - OP_REQUIRES_OK(ctx, GetNodeAttr(kernel->def(), "init_capacity", &init_capacity_i64)); - OP_REQUIRES_OK(ctx, GetNodeAttr(kernel->def(), "max_capacity", &max_capacity_i64)); + int64 max_hbm_for_vectors_i64 = 0; + OP_REQUIRES_OK( + ctx, GetNodeAttr(kernel->def(), "init_capacity", &init_capacity_i64)); + OP_REQUIRES_OK( + ctx, GetNodeAttr(kernel->def(), "max_capacity", &max_capacity_i64)); + OP_REQUIRES_OK(ctx, GetNodeAttr(kernel->def(), "max_hbm_for_vectors", + &max_hbm_for_vectors_i64)); + OP_REQUIRES( + ctx, (max_hbm_for_vectors_i64 >= 0), + errors::InvalidArgument("params max_hbm_for_vectors less than 0")); + options.init_capacity = static_cast(init_capacity_i64); options.max_capacity = static_cast(max_capacity_i64); + options.max_hbm_for_vectors = static_cast(max_hbm_for_vectors_i64); if (options.max_capacity == 0) { - char* env_max_capacity_str = std::getenv("TFRA_GPU_HASHTABLE_UPLIMIT_SIZE"); - if (env_max_capacity_str) { - options.max_capacity = static_cast(std::atoll(env_max_capacity_str)); - LOG(WARNING) << "GPU table max capacity was not set in attribute, get " - << options.max_capacity << " from env TFRA_GPU_HASHTABLE_UPLIMIT_SIZE."; - } else { - throw std::runtime_error("max_capaicty=0 and TFRA_GPU_HASHTABLE_UPLIMIT_SIZE not set is not valid."); - } + char* env_max_capacity_str = + std::getenv("TFRA_GPU_HASHTABLE_UPLIMIT_SIZE"); + OP_REQUIRES(ctx, (env_max_capacity_str != nullptr), + errors::InvalidArgument( + "max_capaicty=0 and TFRA_GPU_HASHTABLE_UPLIMIT_SIZE not " + "set is not valid.")); + options.max_capacity = + static_cast(std::atoll(env_max_capacity_str)); + LOG(WARNING) << "GPU table max capacity was not set in attribute, get " + << options.max_capacity + << " from env TFRA_GPU_HASHTABLE_UPLIMIT_SIZE."; } if (options.init_capacity == 0) { options.init_capacity = kDefaultGpuInitCapacity; - LOG(WARNING) << "GPU table init capacity was not set in attribute, use default" - << kDefaultGpuInitCapacity; + LOG(WARNING) + << "GPU table 
init capacity was not set in attribute, use default" + << kDefaultGpuInitCapacity; } if (options.max_capacity < options.init_capacity) { - LOG(WARNING) << "GPU table max_capacity < init_capacity, (" << options.max_capacity - << "/" << options.init_capacity << "). Reset to " << options.init_capacity; + LOG(WARNING) << "GPU table max_capacity < init_capacity, (" + << options.max_capacity << "/" << options.init_capacity + << "). Reset to " << options.init_capacity; options.max_capacity = options.init_capacity; } if (table_) { return; } - this->CreateTable(options, &table_); + allocator_ptr_ = std::make_unique(ctx); + OP_REQUIRES_OK(ctx, + this->CreateTable(options, allocator_ptr_.get(), &table_)); OP_REQUIRES(ctx, (table_ != nullptr), errors::InvalidArgument("HashTable on GPU is created failed!")); - LOG(INFO) << "GPU table max capacity was created on max_capacity: " - << options.max_capacity << ", and init capacity: " - << options.init_capacity + << options.max_capacity + << ", and init capacity: " << options.init_capacity << " with K=" << std::type_index(typeid(K)).name() << ", V=" << std::type_index(typeid(V)).name(); } ~HkvHashTableOfTensorsGpu() { + mutex_lock l(mu_); + if (table_) { + delete table_; + table_ = nullptr; + } } - void CreateTable(gpu::TableWrapperInitOptions& options, gpu::TableWrapper** pptable) { - gpu::CreateTableImpl(pptable, options, runtime_dim_); + Status CreateTable(gpu::TableWrapperInitOptions& options, + nv::merlin::BaseAllocator* allocator, + gpu::TableWrapper** pptable) { + return gpu::CreateTableImpl(pptable, options, allocator, runtime_dim_); } size_t size() const override { tf_shared_lock l(mu_); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); size_t retv = table_->get_size(stream); @@ -132,7 +154,8 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { tf_shared_lock l(mu_); auto stream = ctx->eigen_device().stream(); int64 hret = static_cast(table_->get_size(stream)); - CUDA_CHECK(cudaMemcpyAsync(s, &hret, sizeof(int64), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(s, &hret, sizeof(int64), cudaMemcpyHostToDevice, + stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -154,18 +177,23 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { is_full_default ? default_value.shape().dim_size(0) : 1; CUDA_CHECK(cudaMallocAsync(&d_status, sizeof(bool) * len, stream)); CUDA_CHECK(cudaMemsetAsync(d_status, 0, sizeof(bool) * len, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); { tf_shared_lock l(mu_); - table_->get((const K*)d_keys.tensor_data().data(), - (V*)(value->tensor_data().data()), - d_status, len, - (V*)(default_value.tensor_data().data()), - stream, is_full_default); - CUDA_CHECK(cudaStreamSynchronize(stream)); + try { + table_->get((const K*)d_keys.tensor_data().data(), + (V*)(value->tensor_data().data()), d_status, len, + (V*)(default_value.tensor_data().data()), stream, + is_full_default); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaFreeAsync(d_status, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); } @@ -187,14 +215,19 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { is_full_default ? 
default_value.shape().dim_size(0) : 1; { tf_shared_lock l(mu_); - table_->get((const K*)d_keys.tensor_data().data(), - (V*)(value->tensor_data().data()), - (bool*)exists->tensor_data().data(), len, - (V*)(default_value.tensor_data().data()), - stream, is_full_default); + try { + table_->get((const K*)d_keys.tensor_data().data(), + (V*)(value->tensor_data().data()), + (bool*)exists->tensor_data().data(), len, + (V*)(default_value.tensor_data().data()), stream, + is_full_default); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaStreamSynchronize(stream)); } + return Status::OK(); } @@ -204,10 +237,13 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { auto stream = ctx->eigen_device().stream(); { mutex_lock l(mu_); - table_->upsert((const K*)keys.tensor_data().data(), - (const V*)(values.tensor_data().data()), - len, stream); - }; + try { + table_->upsert((const K*)keys.tensor_data().data(), + (const V*)(values.tensor_data().data()), len, stream); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } + } CUDA_CHECK(cudaStreamSynchronize(stream)); return Status::OK(); @@ -219,11 +255,14 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { auto stream = ctx->eigen_device().stream(); { mutex_lock l(mu_); - table_->accum( - (const K*)keys.tensor_data().data(), - (const V*)(values_or_deltas.tensor_data().data()), - (const bool*)exists.tensor_data().data(), len, stream); - }; + try { + table_->accum((const K*)keys.tensor_data().data(), + (const V*)(values_or_deltas.tensor_data().data()), + (const bool*)exists.tensor_data().data(), len, stream); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } + } CUDA_CHECK(cudaStreamSynchronize(stream)); return Status::OK(); @@ -236,16 +275,23 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { if (len > 0) { CUDA_CHECK(cudaMallocAsync((void**)&d_keys, sizeof(K) * len, stream)); - CUDA_CHECK(cudaMemsetAsync((void*)&d_keys, 0, sizeof(K) * len, stream)); - CUDA_CHECK(cudaMemcpyAsync((void*)d_keys, (void*)keys.tensor_data().data(), - sizeof(K) * len, cudaMemcpyDefault, stream)); + CUDA_CHECK(cudaMemsetAsync((void*)d_keys, 0, sizeof(K) * len, stream)); + CUDA_CHECK(cudaMemcpyAsync((void*)d_keys, + (void*)keys.tensor_data().data(), + sizeof(K) * len, cudaMemcpyDefault, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); { mutex_lock l(mu_); - table_->remove((const K*)d_keys, len, stream); + try { + table_->remove((const K*)d_keys, len, stream); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaFreeAsync(d_keys, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); } @@ -253,7 +299,11 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { auto stream = ctx->eigen_device().stream(); { mutex_lock l(mu_); - table_->clear(stream); + try { + table_->clear(stream); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaStreamSynchronize(stream)); return Status::OK(); @@ -264,29 +314,47 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { size_t len = keys.flat().size(); K* d_keys; V* d_values; - auto stream = ctx->eigen_device().stream(); if (len > 0) { - CUDA_CHECK(cudaMallocAsync((void**)&d_keys, sizeof(K) * len, stream)); - 
CUDA_CHECK(cudaMemsetAsync((void*)&d_keys, 0, sizeof(K) * len, stream)); - CUDA_CHECK( - cudaMallocAsync((void**)&d_values, sizeof(V) * runtime_dim_ * len, stream)); - CUDA_CHECK( - cudaMemsetAsync((void*)&d_values, 0, sizeof(V) * runtime_dim_ * len, stream)); - CUDA_CHECK(cudaMemcpyAsync((void*)d_keys, (void*)keys.tensor_data().data(), - sizeof(K) * len, cudaMemcpyDefault, stream)); - CUDA_CHECK(cudaMemcpyAsync((void*)d_values, (void*)values.tensor_data().data(), - sizeof(V) * runtime_dim_ * len, cudaMemcpyDefault, stream)); + auto stream = ctx->eigen_device().stream(); + cudaPointerAttributes keys_attr; + CUDA_CHECK(cudaPointerGetAttributes(&keys_attr, + (void*)keys.tensor_data().data())); + if (keys_attr.type != cudaMemoryTypeDevice) { + CUDA_CHECK(cudaMallocManaged((void**)&d_keys, sizeof(K) * len)); + CUDA_CHECK(cudaMemcpy((void*)d_keys, (void*)keys.tensor_data().data(), + sizeof(K) * len, cudaMemcpyDefault)); + } else { + d_keys = (K*)keys.tensor_data().data(); + } + cudaPointerAttributes values_attr; + CUDA_CHECK(cudaPointerGetAttributes(&values_attr, + (void*)values.tensor_data().data())); + if (values_attr.type != cudaMemoryTypeDevice) { + CUDA_CHECK(cudaMallocManaged((void**)&d_values, + sizeof(V) * runtime_dim_ * len)); + CUDA_CHECK( + cudaMemcpy((void*)d_values, (void*)values.tensor_data().data(), + sizeof(V) * runtime_dim_ * len, cudaMemcpyDefault)); + } else { + d_values = (V*)values.tensor_data().data(); + } { mutex_lock l(mu_); - table_->clear(stream); - table_->upsert((const K*)d_keys, - (const V*)d_values, len, stream); + try { + table_->clear(stream); + table_->upsert((const K*)d_keys, (const V*)d_values, len, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } + } + if (keys_attr.type != cudaMemoryTypeDevice) { + CUDA_CHECK(cudaFree(d_keys)); + } + if (values_attr.type != cudaMemoryTypeDevice) { + CUDA_CHECK(cudaFree(d_values)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFreeAsync(d_keys, stream)); - CUDA_CHECK(cudaFreeAsync(d_values, stream)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); return Status::OK(); } @@ -306,15 +374,15 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { tf_shared_lock l(mu_); len = table_->get_capacity(); size = (int64)table_->get_size(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaMallocAsync(&d_dump_counter, sizeof(size_t), stream)); CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); AllocatorAttributes attr; - //attr.set_gpu_compatible(true); - //attr.set_nic_compatible(true); + // attr.set_gpu_compatible(true); + // attr.set_nic_compatible(true); attr.set_on_host(false); TF_RETURN_IF_ERROR( @@ -323,9 +391,14 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { "values", TensorShape({size, (int64)runtime_dim_}), &values, attr)); if (size) { tf_shared_lock l(mu_); - table_->dump((K*)keys->flat().data(), - (V*)(values->matrix().data()), offset, - len, d_dump_counter, stream); + try { + table_->dump((K*)keys->flat().data(), + (V*)(values->matrix().data()), offset, len, + d_dump_counter, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaFreeAsync(d_dump_counter, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -349,15 +422,15 @@ class 
HkvHashTableOfTensorsGpu final : public LookupInterface { tf_shared_lock l(mu_); len = table_->get_capacity(); size = (int64)table_->get_size(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaMallocAsync(&d_dump_counter, sizeof(size_t), stream)); CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); AllocatorAttributes attr; - //attr.set_gpu_compatible(true); - //attr.set_nic_compatible(true); + // attr.set_gpu_compatible(true); + // attr.set_nic_compatible(true); attr.set_on_host(false); TF_RETURN_IF_ERROR( @@ -368,11 +441,15 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { ctx->allocate_output("metas", TensorShape({(size)}), &metas, attr)); if (size) { tf_shared_lock l(mu_); - table_->dump_with_metas((K*)keys->flat().data(), - (V*)(values->matrix().data()), - (uint64_t*)(metas->flat().data()), - offset, len, d_dump_counter, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + try { + table_->dump_with_metas((K*)keys->flat().data(), + (V*)(values->matrix().data()), + (uint64_t*)(metas->flat().data()), offset, + len, d_dump_counter, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaFreeAsync(d_dump_counter, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -381,10 +458,10 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { Status ExportKeysAndMetas(OpKernelContext* ctx, size_t split_size) { tf_shared_lock l(mu_); - size_t span_len = 0; + // size_t span_len = 0; int64 size = 0; - const size_t offset = 0; + // const size_t offset = 0; Tensor* keys = nullptr; Tensor* metas = nullptr; @@ -403,10 +480,13 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { ctx->allocate_output("metas", TensorShape({(size)}), &metas, attr)); if (size) { - table_->dump_keys_and_metas((K*)keys->flat().data(), - (int64*)(metas->flat().data()), - static_cast(size), - split_size, stream); + try { + table_->dump_keys_and_metas( + (K*)keys->flat().data(), (int64*)(metas->flat().data()), + static_cast(size), split_size, stream); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } } CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -414,39 +494,77 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { } Status ExportValuesToFile(OpKernelContext* ctx, const string filepath, - const size_t buffer_size) { + const size_t buffer_size, bool append_to_file) { auto stream = ctx->eigen_device().stream(); + FileSystem* fs; + const auto env = ctx->env(); + TF_RETURN_IF_ERROR(env->GetFileSystemForFile(filepath, &fs)); { tf_shared_lock l(mu_); - table_->dump_to_file(filepath, runtime_dim_, stream, buffer_size); + try { + table_->dump_to_file(fs, filepath, runtime_dim_, stream, buffer_size, + append_to_file); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } } CUDA_CHECK(cudaStreamSynchronize(stream)); + return Status::OK(); } - Status ImportValuesFromFile(OpKernelContext* ctx, const string filepath, - const size_t buffer_size) { + Status ImportValuesFromFile(OpKernelContext* ctx, const string& dirpath, + const std::string& file_name, + const size_t buffer_size, bool load_entire_dir) { auto stream = ctx->eigen_device().stream(); + FileSystem* fs; + const auto env = ctx->env(); + 
TF_RETURN_WITH_CONTEXT_IF_ERROR(env->GetFileSystemForFile(dirpath, &fs), + "Please make sure you have already " + "imported tensorflow_io before using " + "TFRA file system operation."); + const size_t value_dim = static_cast(value_shape_.dim_size(0)); + + std::vector all_filepath; + std::string filepath = io::JoinPath(dirpath, file_name); + + if (load_entire_dir) { + string separator = "_mht_"; + int separator_pos = file_name.rfind(separator); + string file_pattern = + io::JoinPath(dirpath, + file_name.substr(0, separator_pos + separator.size())) + + "*"; + TF_RETURN_IF_ERROR(fs->GetMatchingPaths(file_pattern, &all_filepath)); + // delete -keys/-values postfix + for (auto it = all_filepath.begin(); it != all_filepath.end(); ++it) { + int kv_separator_pos = it->rfind("-"); + *it = it->substr(0, kv_separator_pos); + } + // remove duplicate elements + sort(all_filepath.begin(), all_filepath.end()); + all_filepath.erase(unique(all_filepath.begin(), all_filepath.end()), + all_filepath.end()); + } { mutex_lock l(mu_); - - string keyfile = filepath + ".keys"; - FILE* tmpfd = fopen(keyfile.c_str(), "rb"); - if (tmpfd == nullptr) { - return errors::NotFound("Failed to read key file", keyfile); + try { + table_->clear(stream); + if (load_entire_dir) { + for (const auto& path : all_filepath) { + CUDA_CHECK(cudaStreamSynchronize(stream)); + table_->load_from_file(fs, path, runtime_dim_, stream, buffer_size); + } + } else { + CUDA_CHECK(cudaStreamSynchronize(stream)); + table_->load_from_file(fs, filepath, runtime_dim_, stream, + buffer_size); + } + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); } - fseek(tmpfd, 0, SEEK_END); - long int filesize = ftell(tmpfd); - size_t size = static_cast(filesize) / sizeof(K); - fseek(tmpfd, 0, SEEK_SET); - fclose(tmpfd); - - table_->clear(stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - table_->load_from_file(filepath, size, runtime_dim_, stream, - buffer_size); } CUDA_CHECK(cudaStreamSynchronize(stream)); return Status::OK(); @@ -467,6 +585,7 @@ class HkvHashTableOfTensorsGpu final : public LookupInterface { } // namespace lookup // Table lookup op. Perform the lookup operation on the given table. +template class HashTableFindGpuOp : public OpKernel { public: explicit HashTableFindGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} @@ -475,6 +594,8 @@ class HashTableFindGpuOp : public OpKernel { lookup::LookupInterface* table; OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); core::ScopedUnref unref_me(table); + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; // Input 0 could be a STRING_REF or a RESOURCE DataType expected_input_0 = DT_RESOURCE; @@ -495,13 +616,13 @@ class HashTableFindGpuOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output("values", output_shape, &out, attr)); - OP_REQUIRES_OK(ctx, table->Find(ctx, keys, out, default_values)); + OP_REQUIRES_OK(ctx, table_hkv->Find(ctx, keys, out, default_values)); } }; -REGISTER_KERNEL_BUILDER( - Name(PREFIX_OP_NAME(HkvHashTableFind)).Device(DEVICE_GPU), - HashTableFindGpuOp); +// REGISTER_KERNEL_BUILDER( +// Name(PREFIX_OP_NAME(HkvHashTableFind)).Device(DEVICE_GPU), +// HashTableFindGpuOp); // Table lookup op. Perform the lookup operation on the given table. 
@@ -542,7 +663,7 @@ class HashTableFindWithExistsGpuOp : public OpKernel { ctx->allocate_output("exists", keys.shape(), &exists, attr)); OP_REQUIRES_OK(ctx, table_hkv->FindWithExists(ctx, keys, values, - default_values, exists)); + default_values, exists)); } }; @@ -596,12 +717,12 @@ class HashTableAccumGpuOp : public OpKernel { const Tensor& exists = ctx->input(3); OP_REQUIRES_OK( ctx, table->CheckKeyAndValueTensorsForInsert(keys, values_or_deltas)); - OP_REQUIRES_OK(ctx, - table_hkv->Accum(ctx, keys, values_or_deltas, exists)); + OP_REQUIRES_OK(ctx, table_hkv->Accum(ctx, keys, values_or_deltas, exists)); } }; // Table remove op. +// template class HashTableRemoveGpuOp : public OpKernel { public: explicit HashTableRemoveGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} @@ -642,6 +763,7 @@ class HashTableClearGpuOp : public OpKernel { }; // Op that returns the size of the given table. +template class HashTableSizeGpuOp : public OpKernel { public: explicit HashTableSizeGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} @@ -660,12 +782,15 @@ class HashTableSizeGpuOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output("size", TensorShape({}), &out, attr)); - int64* p_size = (int64*)out->flat().data(); table_hkv->size_i64(ctx, p_size); } }; +// REGISTER_KERNEL_BUILDER( +// Name(PREFIX_OP_NAME(HkvHashTableSize)).Device(DEVICE_GPU), +// HashTableSizeGpuOp); + // Op that outputs tensors of all keys and all values. class HashTableExportGpuOp : public OpKernel { public: @@ -697,8 +822,7 @@ class HashTableExportWithMetasGpuOp : public OpKernel { core::ScopedUnref unref_me(table); lookup::HkvHashTableOfTensorsGpu* table_hkv = (lookup::HkvHashTableOfTensorsGpu*)table; - OP_REQUIRES_OK( - ctx, table_hkv->ExportValuesWithMetas(ctx)); + OP_REQUIRES_OK(ctx, table_hkv->ExportValuesWithMetas(ctx)); } }; @@ -716,39 +840,12 @@ class HashTableExportKeysAndMetasGpuOp : public OpKernel { core::ScopedUnref unref_me(table); lookup::HkvHashTableOfTensorsGpu* table_hkv = (lookup::HkvHashTableOfTensorsGpu*)table; - OP_REQUIRES_OK(ctx, table_hkv->ExportKeysAndMetas(ctx, static_cast(split_size_i64_))); - } - private: - int64 split_size_i64_; -}; - -template -class HashTableExportToFileGpuOp : public OpKernel { - public: - explicit HashTableExportToFileGpuOp(OpKernelConstruction* ctx) - : OpKernel(ctx) { - int64 signed_buffer_size = 0; - ctx->GetAttr("buffer_size", &signed_buffer_size); - buffer_size_ = static_cast(signed_buffer_size); - } - - void Compute(OpKernelContext* ctx) override { - lookup::LookupInterface* table; - OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); - core::ScopedUnref unref_me(table); - - const Tensor& ftensor = ctx->input(1); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ftensor.shape()), - errors::InvalidArgument("filepath must be scalar.")); - string filepath = string(ftensor.scalar()().data()); - lookup::HkvHashTableOfTensorsGpu* table_hkv = - (lookup::HkvHashTableOfTensorsGpu*)table; - OP_REQUIRES_OK( - ctx, table_hkv->ExportValuesToFile(ctx, filepath, buffer_size_)); + OP_REQUIRES_OK(ctx, table_hkv->ExportKeysAndMetas( + ctx, static_cast(split_size_i64_))); } private: - size_t buffer_size_; + int64 split_size_i64_; }; // Clear the table and insert data. @@ -777,14 +874,71 @@ REGISTER_KERNEL_BUILDER( Name(PREFIX_OP_NAME(HkvHashTableImport)).Device(DEVICE_GPU), HashTableImportGpuOp); +// Op that export all keys and values to FileSystem. 
+template +class HashTableSaveToFileSystemGpuOp : public OpKernel { + public: + explicit HashTableSaveToFileSystemGpuOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dirpath_env", &dirpath_env_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("append_to_file", &append_to_file_)); + int64 signed_buffer_size = 0; + OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &signed_buffer_size)); + buffer_size_ = static_cast(signed_buffer_size); + } + + void Compute(OpKernelContext* ctx) override { + lookup::LookupInterface* table; + OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); + core::ScopedUnref unref_me(table); + + string dirpath; + TF_CHECK_OK(ReadStringFromEnvVar(dirpath_env_, "NotFound", &dirpath)); + if (dirpath != "NotFound") { + LOG(INFO) << "Read TFRA key/value file directory path from the " + "environment variable " + << dirpath_env_ << " successfully. Saving directory path is " + << dirpath; + } else { + const Tensor& dir_tensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(dir_tensor.shape()), + errors::InvalidArgument("directory path must be scalar.")); + dirpath = string(dir_tensor.scalar()().data()); + } + + const Tensor& fname_tensor = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(fname_tensor.shape()), + errors::InvalidArgument("file name must be scalar.")); + string file_name = string(fname_tensor.scalar()().data()); + + lookup::HkvHashTableOfTensorsGpu* table_hkv = + (lookup::HkvHashTableOfTensorsGpu*)table; + LOG(INFO) << "c++ dirpath: " << dirpath << " filename: " << file_name; + std::string filepath = io::JoinPath(dirpath, file_name); + + // OP_REQUIRES_OK( + // ctx, fs->RecursivelyCreateDir(std::string(fs->Dirname(filepath)))); + + OP_REQUIRES_OK(ctx, table_hkv->ExportValuesToFile( + ctx, filepath, buffer_size_, append_to_file_)); + } + + private: + string dirpath_env_; + bool append_to_file_; + size_t buffer_size_; +}; + // Clear the table and insert data from FileSystem. template -class HashTableImportFromFileGpuOp : public OpKernel { +class HashTableLoadFromFileSystemGpuOp : public OpKernel { public: - explicit HashTableImportFromFileGpuOp(OpKernelConstruction* ctx) + explicit HashTableLoadFromFileSystemGpuOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dirpath_env", &dirpath_env_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("load_entire_dir", &load_entire_dir_)); int64 signed_buffer_size = 0; - ctx->GetAttr("buffer_size", &signed_buffer_size); + OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &signed_buffer_size)); buffer_size_ = static_cast(signed_buffer_size); } @@ -793,85 +947,108 @@ class HashTableImportFromFileGpuOp : public OpKernel { OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table)); core::ScopedUnref unref_me(table); - const Tensor& ftensor = ctx->input(1); - OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ftensor.shape()), - errors::InvalidArgument("filepath must be scalar.")); - string filepath = string(ftensor.scalar()().data()); + string dirpath; + TF_CHECK_OK(ReadStringFromEnvVar(dirpath_env_, "NotFound", &dirpath)); + if (dirpath != "NotFound") { + LOG(INFO) << "Read TFRA key/value file directory path from the " + "environment variable " + << dirpath_env_ << " successfully. 
Saving directory path is " + << dirpath; + } else { + const Tensor& dir_tensor = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(dir_tensor.shape()), + errors::InvalidArgument("directory path must be scalar.")); + dirpath = string(dir_tensor.scalar()().data()); + } + + const Tensor& fname_tensor = ctx->input(2); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(fname_tensor.shape()), + errors::InvalidArgument("file name must be scalar.")); + string file_name = string(fname_tensor.scalar()().data()); + + LOG(INFO) << "c++ dirpath :" << dirpath << " filename: " << file_name; + lookup::HkvHashTableOfTensorsGpu* table_hkv = (lookup::HkvHashTableOfTensorsGpu*)table; OP_REQUIRES_OK( - ctx, table_hkv->ImportValuesFromFile(ctx, filepath, buffer_size_)); + ctx, table_hkv->ImportValuesFromFile(ctx, dirpath, file_name, + buffer_size_, load_entire_dir_)); } private: + string dirpath_env_; + bool load_entire_dir_; size_t buffer_size_; }; // Register the HkvHashTableOfTensors op. - #define REGISTER_KERNEL(key_dtype, value_dtype) \ REGISTER_KERNEL_BUILDER( \ - Name(PREFIX_OP_NAME(HkvHashTableOfTensors)) \ + Name(PREFIX_OP_NAME(HkvHashTableOfTensors)) \ .Device(DEVICE_GPU) \ .TypeConstraint("key_dtype") \ .TypeConstraint("value_dtype"), \ - HashTableGpuOp< \ - lookup::HkvHashTableOfTensorsGpu, \ - key_dtype, value_dtype>); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableClear)) \ + HashTableGpuOp, \ + key_dtype, value_dtype>); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableClear)) \ .Device(DEVICE_GPU) \ .TypeConstraint("key_dtype") \ .TypeConstraint("value_dtype"), \ HashTableClearGpuOp); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableSize)) \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableSize)) \ .Device(DEVICE_GPU) \ .TypeConstraint("key_dtype") \ .TypeConstraint("value_dtype"), \ HashTableSizeGpuOp); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableAccum)) \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableAccum)) \ .Device(DEVICE_GPU) \ .TypeConstraint("key_dtype") \ .TypeConstraint("value_dtype"), \ HashTableAccumGpuOp); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportWithMetas)) \ - .Device(DEVICE_GPU) \ - .TypeConstraint("key_dtype") \ - .TypeConstraint("value_dtype"), \ - HashTableExportWithMetasGpuOp); \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportToFile)) \ - .Device(DEVICE_GPU) \ - .HostMemory("filepath") \ - .TypeConstraint("key_dtype") \ - .TypeConstraint("value_dtype"), \ - HashTableExportToFileGpuOp); \ REGISTER_KERNEL_BUILDER( \ - Name(PREFIX_OP_NAME(HkvHashTableImportFromFile)) \ + Name(PREFIX_OP_NAME(HkvHashTableExportWithMetas)) \ .Device(DEVICE_GPU) \ - .HostMemory("filepath") \ .TypeConstraint("key_dtype") \ .TypeConstraint("value_dtype"), \ - HashTableImportFromFileGpuOp); \ + HashTableExportWithMetasGpuOp); \ + REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableFind)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tin") \ + .TypeConstraint("Tout"), \ + HashTableFindGpuOp); \ REGISTER_KERNEL_BUILDER( \ - Name(PREFIX_OP_NAME(HkvHashTableFindWithExists)) \ + Name(PREFIX_OP_NAME(HkvHashTableFindWithExists)) \ .Device(DEVICE_GPU) \ .TypeConstraint("Tin") \ .TypeConstraint("Tout"), \ - HashTableFindWithExistsGpuOp); + HashTableFindWithExistsGpuOp); \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableSaveToFileSystem)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableSaveToFileSystemGpuOp); \ + 
REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableLoadFromFileSystem)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("key_dtype") \ + .TypeConstraint("value_dtype"), \ + HashTableLoadFromFileSystemGpuOp); REGISTER_KERNEL(int64, float); -REGISTER_KERNEL(int64, Eigen::half); -REGISTER_KERNEL(int64, int64); -REGISTER_KERNEL(int64, int32); REGISTER_KERNEL(int64, int8); -REGISTER_KERNEL(int32, float); +REGISTER_KERNEL(int64, int32); +REGISTER_KERNEL(int64, int64); +REGISTER_KERNEL(int64, Eigen::half); #undef REGISTER_KERNEL -#define SINGLE_ATTR_REGISTER_KERNEL(key_dtype, value_type) \ - REGISTER_KERNEL_BUILDER(Name(PREFIX_OP_NAME(HkvHashTableExportKeysAndMetas)) \ - .Device(DEVICE_GPU) \ - .TypeConstraint("Tkeys"), \ - HashTableExportKeysAndMetasGpuOp); +#define SINGLE_ATTR_REGISTER_KERNEL(key_dtype, value_type) \ + REGISTER_KERNEL_BUILDER( \ + Name(PREFIX_OP_NAME(HkvHashTableExportKeysAndMetas)) \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys"), \ + HashTableExportKeysAndMetasGpuOp); SINGLE_ATTR_REGISTER_KERNEL(int64, float); diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h index 0da759780..3e109962e 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h @@ -16,14 +16,20 @@ limitations under the License. #ifndef TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ #define TFRA_CORE_KERNELS_LOOKUP_TABLE_OP_GPU_H_ -#include #include #include -#include -#include #include + #include +#include +#include +#include +#include "merlin/allocator.cuh" +#include "merlin/types.cuh" +#include "merlin/utils.cuh" +#include "merlin_hashtable.cuh" +#include "merlin_localfile.hpp" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/lookup_interface.h" #include "tensorflow/core/framework/op_kernel.h" @@ -33,12 +39,10 @@ limitations under the License. 
#include "tensorflow/core/kernels/lookup_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/buffered_inputstream.h" +#include "tensorflow/core/lib/io/random_inputstream.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/thread_annotations.h" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_localfile.hpp" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh" namespace tensorflow { namespace recommenders_addons { @@ -50,9 +54,7 @@ class KVOnlyFile : public nv::merlin::BaseKVFile { public: KVOnlyFile() : keys_fp_(nullptr), values_fp_(nullptr) {} - ~KVOnlyFile() { - close(); - } + ~KVOnlyFile() { close(); } bool open(const std::string& keys_path, const std::string& values_path, const char* mode) { @@ -80,21 +82,23 @@ class KVOnlyFile : public nv::merlin::BaseKVFile { } } - size_t read(const size_t n, const size_t dim, K* keys, V* vectors, M* metas) override { + size_t read(const size_t n, const size_t dim, K* keys, V* vectors, + M* metas) override { size_t nread_keys = fread(keys, sizeof(K), static_cast(n), keys_fp_); size_t nread_vecs = fread(vectors, sizeof(V) * dim, static_cast(n), values_fp_); if (nread_keys != nread_vecs) { - LOG(INFO) << "Partially read failed. " << nread_keys << " kv pairs by KVOnlyFile."; + LOG(INFO) << "Partially read failed. " << nread_keys + << " kv pairs by KVOnlyFile."; return 0; } LOG(INFO) << "Partially read " << nread_keys << " kv pairs by KVOnlyFile."; return nread_keys; } - size_t write(const size_t n, const size_t dim, const K* keys, const V* vectors, - const M* metas) override { + size_t write(const size_t n, const size_t dim, const K* keys, + const V* vectors, const M* metas) override { size_t nwritten_keys = fwrite(keys, sizeof(K), static_cast(n), keys_fp_); size_t nwritten_vecs = @@ -102,7 +106,8 @@ class KVOnlyFile : public nv::merlin::BaseKVFile { if (nwritten_keys != nwritten_vecs) { return 0; } - LOG(INFO) << "Partially write " << nwritten_keys << " kv pairs by KVOnlyFile."; + LOG(INFO) << "Partially write " << nwritten_keys + << " kv pairs by KVOnlyFile."; return nwritten_keys; } @@ -111,9 +116,157 @@ class KVOnlyFile : public nv::merlin::BaseKVFile { FILE* values_fp_; }; +template +class RandomKVFile : public nv::merlin::BaseKVFile { + public: + RandomKVFile(FileSystem* fs, const std::string& filepath, size_t value_dim, + size_t buffer_size, bool append_to_file = false) + : fs_(fs), + filepath_(filepath), + value_dim_(value_dim), + buffer_size_(buffer_size), + append_to_file_(append_to_file) {} + + ~RandomKVFile() {} + + Status open(const std::string& key_filepath, + const std::string& value_filepath, const std::string& mode) { + key_buffer_byte_size_ = buffer_size_ * sizeof(K); + const size_t value_len = sizeof(V) * value_dim_; + value_buffer_byte_size_ = buffer_size_ * value_len; + + if ("rb" == mode) { + TF_RETURN_IF_ERROR(fs_->FileExists(key_filepath)); + TF_RETURN_IF_ERROR(fs_->NewRandomAccessFile(key_filepath, &key_file_)); + key_input_stream_ = + std::make_unique(key_file_.get()); + key_reader_ = std::make_unique( + key_input_stream_.get(), key_buffer_byte_size_ * 2); + + TF_RETURN_IF_ERROR(fs_->FileExists(value_filepath)); + TF_RETURN_IF_ERROR( + 
fs_->NewRandomAccessFile(value_filepath, &value_file_)); + value_input_stream_ = + std::make_unique(value_file_.get()); + value_reader_ = std::make_unique( + value_input_stream_.get(), value_buffer_byte_size_ * 2); + + uint64 key_file_size = 0; + TF_RETURN_IF_ERROR(fs_->GetFileSize(key_filepath, &key_file_size)); + size_t key_size = key_file_size / sizeof(K); + + uint64 value_file_size = 0; + TF_RETURN_IF_ERROR(fs_->GetFileSize(value_filepath, &value_file_size)); + size_t value_size = value_file_size / value_len; + + if (key_size != value_size) { + return errors::Unavailable( + "the keys number in file " + key_filepath + + " is not equal to the value vectors number in file " + + value_filepath + "."); + } + } else if ("wb" == mode) { + std::string key_tmpfilepath(key_filepath + ".tmp"); + std::string value_tmpfilepath(value_filepath + ".tmp"); + + bool has_atomic_move = false; + auto has_atomic_move_ret = + fs_->HasAtomicMove(filepath_, &has_atomic_move); + bool need_tmp_file = + (has_atomic_move == false) || (has_atomic_move_ret != Status::OK()); + + if (!need_tmp_file) { + key_tmpfilepath = key_filepath; + value_tmpfilepath = value_filepath; + } + TF_RETURN_IF_ERROR( + fs_->RecursivelyCreateDir(std::string(fs_->Dirname(filepath_)))); + + if (append_to_file_) { + TF_RETURN_IF_ERROR( + fs_->NewAppendableFile(key_tmpfilepath, &key_writer_)); + TF_RETURN_IF_ERROR( + fs_->NewAppendableFile(value_tmpfilepath, &value_writer_)); + } else { + TF_RETURN_IF_ERROR(fs_->NewWritableFile(key_tmpfilepath, &key_writer_)); + TF_RETURN_IF_ERROR( + fs_->NewWritableFile(value_tmpfilepath, &value_writer_)); + } + } + return Status::OK(); + } + + void close() { + if (key_writer_) { + key_writer_->Flush(); + } + if (value_writer_) { + value_writer_->Flush(); + } + } + + size_t read(const size_t n, const size_t dim, K* keys, V* vectors, + M* metas) override { + size_t key_read_byte = n * sizeof(K); + size_t value_read_byte = n * sizeof(V) * dim; + + key_buffer_.resize(key_read_byte); + value_buffer_.resize(value_read_byte); + + key_reader_->ReadNBytes(key_read_byte, &key_buffer_); + value_reader_->ReadNBytes(value_read_byte, &value_buffer_); + + memcpy((char*)keys, key_buffer_.data(), key_buffer_.size()); + memcpy((char*)vectors, value_buffer_.data(), value_buffer_.size()); + + size_t nread_keys = key_buffer_.size() / sizeof(K); + return nread_keys; + } + + size_t write(const size_t n, const size_t dim, const K* keys, + const V* vectors, const M* metas) override { + size_t key_write_byte = n * sizeof(K); + size_t value_write_byte = n * sizeof(V) * value_dim_; + std::vector key_buffer_vector(key_buffer_byte_size_); + char* key_buffer = key_buffer_vector.data(); + std::vector value_buffer_vector(value_buffer_byte_size_); + char* value_buffer = value_buffer_vector.data(); + + memcpy(key_buffer, (void*)keys, key_write_byte); + memcpy(value_buffer, (void*)vectors, value_write_byte); + + key_writer_->Append(StringPiece(key_buffer, key_write_byte)); + value_writer_->Append(StringPiece(value_buffer, value_write_byte)); + + return n; + } + + private: + size_t value_dim_; + FileSystem* fs_ = nullptr; + std::string filepath_; + size_t buffer_size_; + size_t key_buffer_byte_size_; + size_t value_buffer_byte_size_; + tstring key_buffer_; + tstring value_buffer_; + bool append_to_file_ = false; + + std::unique_ptr key_writer_ = nullptr; + std::unique_ptr value_writer_ = nullptr; + + std::unique_ptr key_file_ = nullptr; + std::unique_ptr value_file_ = nullptr; + std::unique_ptr key_input_stream_ = nullptr; + std::unique_ptr 
value_input_stream_ = nullptr; + std::unique_ptr key_reader_ = nullptr; + std::unique_ptr value_reader_ = nullptr; +}; + // template to avoid multidef in compile time only. template -__global__ void gpu_u64_to_i64_kernel(const uint64_t* u64, int64* i64, size_t len) { +__global__ void gpu_u64_to_i64_kernel(const uint64_t* u64, int64* i64, + size_t len) { size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid < len) { i64[tid] = static_cast(u64[tid]); @@ -129,10 +282,12 @@ __global__ void broadcast_kernel(T* data, T val, size_t n) { } template -void gpu_cast_u64_to_i64(const uint64_t* u64, int64* i64, size_t len, cudaStream_t stream) { +void gpu_cast_u64_to_i64(const uint64_t* u64, int64* i64, size_t len, + cudaStream_t stream) { size_t block_size = nv::merlin::SAFE_GET_BLOCK_SIZE(1024); size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size); - gpu_u64_to_i64_kernel<<>>(u64, i64, len); + gpu_u64_to_i64_kernel + <<>>(u64, i64, len); } using GPUDevice = Eigen::ThreadPoolDevice; @@ -140,32 +295,159 @@ using GPUDevice = Eigen::ThreadPoolDevice; struct TableWrapperInitOptions { size_t max_capacity; size_t init_capacity; + size_t max_hbm_for_vectors; + size_t max_bucket_size; + float max_load_factor; + int block_size; + int io_block_size; +}; + +template +__global__ void gpu_fill_default_values(V* d_vals, V* d_def_val, size_t len, + size_t dim) { + int threadId = blockIdx.x * blockDim.x + threadIdx.x; + if (threadId < len) { +#pragma unroll + for (int i = 0; i < dim; i++) { + d_vals[threadId * dim + i] = d_def_val[i]; + } + } +} + +class TFOrDefaultAllocator : public nv::merlin::BaseAllocator { + private: + using NMMemType = nv::merlin::MemoryType; + // tensorflow::Allocator* tf_host_allocator_ = nullptr; + tensorflow::Allocator* tf_device_allocator_ = nullptr; + std::unique_ptr default_allocator_ = nullptr; + bool use_default_allocator_ = false; + // bool tf_async_allocator_stream_set_ = false; + static constexpr size_t kAllocatorAlignment = 8; + + public: + TFOrDefaultAllocator() : use_default_allocator_(true) { + default_allocator_ = std::make_unique(); + } + + TFOrDefaultAllocator(OpKernelContext* ctx) { + if (ctx) { + tensorflow::AllocatorAttributes tf_alloc_attrs; + tf_device_allocator_ = ctx->get_allocator(tf_alloc_attrs); + } else { + use_default_allocator_ = true; + default_allocator_ = std::make_unique(); + } + } + + ~TFOrDefaultAllocator() override {} + + void alloc(const NMMemType type, void** ptr, size_t size, + unsigned int pinned_flags = cudaHostAllocDefault) override { + if (!use_default_allocator_) { + switch (type) { + case NMMemType::Device: + *ptr = tf_device_allocator_->AllocateRaw(kAllocatorAlignment, size); + if (nullptr == *ptr) { + throw std::runtime_error( + "Failed to allocator gpu memory, please adjust param 'max_hbm' " + "smaller."); + } + break; + case NMMemType::Pinned: + CUDA_CHECK(cudaMallocHost(ptr, size, pinned_flags)); + break; + case NMMemType::Host: + *ptr = std::malloc(size); + break; + } + } else { + default_allocator_->alloc(type, ptr, size, pinned_flags); + } + } + + void alloc_async(const NMMemType type, void** ptr, size_t size, + cudaStream_t stream) override { + if (!use_default_allocator_) { + if (NMMemType::Device == type) { + *ptr = tf_device_allocator_->AllocateRaw(kAllocatorAlignment, size); + if (nullptr == *ptr) { + throw std::runtime_error( + "Failed to allocator gpu memory, please adjust param 'max_hbm' " + "smaller."); + } + } + } else { + default_allocator_->alloc_async(type, ptr, size, stream); + } + } + + void free(const 
NMMemType type, void* ptr) override { + if (!use_default_allocator_) { + switch (type) { + case NMMemType::Device: + tf_device_allocator_->DeallocateRaw(ptr); + break; + case NMMemType::Pinned: + CUDA_CHECK(cudaFreeHost(ptr)); + break; + case NMMemType::Host: + std::free(ptr); + break; + } + } else { + default_allocator_->free(type, ptr); + } + } + + void free_async(const NMMemType type, void* ptr, + cudaStream_t stream) override { + if (!use_default_allocator_) { + if (NMMemType::Device == type) { + tf_device_allocator_->DeallocateRaw(ptr); + } + } else { + default_allocator_->free_async(type, ptr, stream); + } + } }; template class TableWrapper { private: - //using M = uint64_t; + // using M = uint64_t; using Table = nv::merlin::HashTable; + nv::merlin::HashTableOptions mkv_options_; public: TableWrapper(TableWrapperInitOptions& init_options, size_t dim) { max_capacity_ = init_options.max_capacity; dim_ = dim; - nv::merlin::HashTableOptions mkv_options; - mkv_options.init_capacity = std::min(init_options.init_capacity, max_capacity_); - mkv_options.max_capacity = max_capacity_; + // nv::merlin::HashTableOptions mkv_options_; + mkv_options_.init_capacity = + std::min(init_options.init_capacity, max_capacity_); + mkv_options_.max_capacity = max_capacity_; // Since currently GPU nodes are not compatible to fast // pcie connections for D2H non-continous wirte, so just // use pure hbm mode now. - mkv_options.max_hbm_for_vectors = std::numeric_limits::max(); - mkv_options.max_load_factor = 0.63; - mkv_options.block_size = nv::merlin::SAFE_GET_BLOCK_SIZE(1024); - mkv_options.dim = dim; - mkv_options.evict_strategy = nv::merlin::EvictStrategy::kCustomized; - block_size_ = mkv_options.block_size; + // mkv_options_.max_hbm_for_vectors = std::numeric_limits::max(); + mkv_options_.max_hbm_for_vectors = init_options.max_hbm_for_vectors; + mkv_options_.max_load_factor = 0.5; + mkv_options_.block_size = nv::merlin::SAFE_GET_BLOCK_SIZE(128); + mkv_options_.dim = dim; + // mkv_options_.evict_strategy = nv::merlin::EvictStrategy::kCustomized; + mkv_options_.evict_strategy = nv::merlin::EvictStrategy::kLru; + + block_size_ = mkv_options_.block_size; table_ = new Table(); - table_->init(mkv_options); + } + + Status init(nv::merlin::BaseAllocator* allocator) { + try { + table_->init(mkv_options_, allocator); + } catch (std::runtime_error& e) { + return Status(tensorflow::error::INTERNAL, e.what()); + } + return Status::OK(); } ~TableWrapper() { delete table_; } @@ -173,61 +455,79 @@ class TableWrapper { void upsert(const K* d_keys, const V* d_vals, size_t len, cudaStream_t stream) { uint64_t t0 = (uint64_t)time(NULL); - uint64_t* timestamp_metas = nullptr; - CUDA_CHECK(cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); - CUDA_CHECK(cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + // uint64_t* timestamp_metas = nullptr; + // CUDA_CHECK( + // cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); + // CUDA_CHECK( + // cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + // CUDA_CHECK(cudaStreamSynchronize(stream)); + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); - broadcast_kernel<<>>(timestamp_metas, t0, len); + // broadcast_kernel + // <<>>(timestamp_metas, t0, len); - table_->insert_or_assign(len, d_keys, d_vals, /*d_metas=*/timestamp_metas, stream); - CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); + table_->insert_or_assign(len, d_keys, d_vals, /*d_metas=*/nullptr, stream); + // 
CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void accum(const K* d_keys, const V* d_vals_or_deltas, - const bool* d_exists, size_t len, cudaStream_t stream) { + void accum(const K* d_keys, const V* d_vals_or_deltas, const bool* d_exists, + size_t len, cudaStream_t stream) { uint64_t t0 = (uint64_t)time(NULL); - uint64_t* timestamp_metas = nullptr; - CUDA_CHECK(cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); - CUDA_CHECK(cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + // uint64_t* timestamp_metas = nullptr; + // CUDA_CHECK( + // cudaMallocAsync(×tamp_metas, len * sizeof(uint64_t), stream)); + // CUDA_CHECK( + // cudaMemsetAsync(timestamp_metas, 0, len * sizeof(uint64_t), stream)); + // CUDA_CHECK(cudaStreamSynchronize(stream)); + size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); - broadcast_kernel<<>>(timestamp_metas, t0, len); - table_->accum_or_assign(len, d_keys, d_vals_or_deltas, d_exists, /*d_metas=*/timestamp_metas, stream); - CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)) + // broadcast_kernel + // <<>>(timestamp_metas, t0, len); + table_->accum_or_assign(len, d_keys, d_vals_or_deltas, d_exists, + /*d_metas=*/nullptr, stream); + // CUDA_CHECK(cudaFreeAsync(timestamp_metas, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void dump(K* d_key, V* d_val, const size_t offset, - const size_t search_length, size_t* d_dump_counter, - cudaStream_t stream) const { - table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, /*d_metas=*/nullptr, stream); + void dump(K* d_key, V* d_val, const size_t offset, const size_t search_length, + size_t* d_dump_counter, cudaStream_t stream) const { + table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, + /*d_metas=*/nullptr, stream); } - void dump_with_metas(K* d_key, V* d_val, uint64_t* d_metas, const size_t offset, - const size_t search_length, size_t* d_dump_counter, - cudaStream_t stream) const { - table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, d_metas, stream); + void dump_with_metas(K* d_key, V* d_val, uint64_t* d_metas, + const size_t offset, const size_t search_length, + size_t* d_dump_counter, cudaStream_t stream) const { + table_->export_batch(search_length, offset, d_dump_counter, d_key, d_val, + d_metas, stream); } - void dump_keys_and_metas(K* keys, int64* metas, size_t len, - size_t split_len, cudaStream_t stream) const { + void dump_keys_and_metas(K* keys, int64* metas, size_t len, size_t split_len, + cudaStream_t stream) const { V* values_buf = nullptr; size_t offset = 0; size_t real_offset = 0; size_t skip = split_len; uint64_t* metas_u64 = reinterpret_cast(metas); size_t span_len = table_->capacity(); - CUDA_CHECK(cudaMallocAsync(&values_buf, sizeof(V) * dim_ * split_len, stream)); - CUDA_CHECK(cudaMemsetAsync(values_buf, 0, sizeof(V) * dim_ * split_len, stream)); + CUDA_CHECK( + cudaMallocAsync(&values_buf, sizeof(V) * dim_ * split_len, stream)); + CUDA_CHECK( + cudaMemsetAsync(values_buf, 0, sizeof(V) * dim_ * split_len, stream)); for (; offset < span_len; offset += split_len) { if (offset + skip > span_len) { skip = span_len - offset; } // TODO: overlap the loop - size_t h_dump_counter = table_->export_batch(skip, offset, keys + real_offset, values_buf, metas_u64 + real_offset, stream); + size_t h_dump_counter = + table_->export_batch(skip, offset, keys + real_offset, values_buf, + metas_u64 + 
real_offset, stream); CudaCheckError(); if (h_dump_counter > 0) { - gpu_cast_u64_to_i64(metas_u64 + real_offset, metas + real_offset, h_dump_counter, stream); + gpu_cast_u64_to_i64(metas_u64 + real_offset, metas + real_offset, + h_dump_counter, stream); real_offset += h_dump_counter; } CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -237,80 +537,131 @@ class TableWrapper { CUDA_CHECK(cudaStreamSynchronize(stream)); } - void dump_to_file(const string filepath, size_t dim, - cudaStream_t stream, - const size_t buffer_size) const { + // TODO (LinGeLin) support metas + bool is_valid_metas(const std::string& keyfile, + const std::string& metafile) const { + return false; + } + + void dump_to_file(FileSystem* fs, const string filepath, size_t dim, + cudaStream_t stream, const size_t buffer_size, + bool append_to_file) const { LOG(INFO) << "dump_to_file, filepath: " << filepath << ", dim: " << dim << ", stream: " << stream << ", buffer_size: " << buffer_size; - std::unique_ptr> wfile; - string keyfile = ; - string valuefile = ; - string metafile = ; - - wfile.reset(new TimestampV1CompatFile); - bool open_ok = wfile->open(keyfile, valuefile, metafile, "wb"); - if (!open_ok) { - std::string error_msg = "Failed to dump to file to " + keyfile + ", " + valuefile + ", " + metafile; + + std::unique_ptr> wfile; + + string keyfile = filepath + "-keys"; + string valuefile = filepath + "-values"; + string metafile = filepath + "-metas"; + bool has_metas = false; + Status status = Status::OK(); + + if (is_valid_metas(keyfile, metafile)) { + wfile.reset(new nv::merlin::LocalKVFile); + bool open_ok = reinterpret_cast*>( + wfile.get()) + ->open(keyfile, valuefile, metafile, "wb"); + has_metas = true; + if (!open_ok) { + std::string error_msg = "Failed to dump to file to " + keyfile + ", " + + valuefile + ", " + metafile; + throw std::runtime_error(error_msg); + } + } else { + wfile.reset(new RandomKVFile( + fs, filepath, dim, buffer_size, append_to_file)); + status = reinterpret_cast*>(wfile.get()) + ->open(keyfile, valuefile, "wb"); + } + if (!status.ok()) { + std::string error_msg = "Failed to dump to file to " + keyfile + ", " + + valuefile + ", " + metafile + " " + + status.ToString(); throw std::runtime_error(error_msg); } size_t n_saved = table_->save(wfile.get(), buffer_size, stream); - LOG(INFO) << "[op] Save " << n_saved << " pairs into keyfile: " - << keyfile << ", and valuefile: " << valuefile - << ", and metafile: " << metafile; + if (has_metas) { + LOG(INFO) << "[op] Save " << n_saved << " pairs from keyfile: " << keyfile + << ", and valuefile: " << valuefile << ", and metafile" + << metafile; + } else { + LOG(INFO) << "[op] Save " << n_saved << " pairs from keyfile: " << keyfile + << ", and valuefile: " << valuefile; + } CUDA_CHECK(cudaStreamSynchronize(stream)); - wfile->close(); + if (has_metas) { + reinterpret_cast*>(wfile.get()) + ->close(); + } else { + reinterpret_cast*>(wfile.get())->close(); + } } - void load_from_file(const string filepath, - size_t key_num, size_t dim, cudaStream_t stream, - const size_t buffer_size) { + void load_from_file(FileSystem* fs, const string filepath, size_t dim, + cudaStream_t stream, const size_t buffer_size) { std::unique_ptr> rfile; - string keyfile = ; - string valuefile = ; - string metafile = ; - //rfile.reset(new TimestampV1CompatFile); + string keyfile = filepath + "-keys"; + string valuefile = filepath + "-values"; + string metafile = filepath + "-metas"; bool has_metas = false; - bool open_ok = false; + Status status = Status::OK(); if 
(is_valid_metas(keyfile, metafile)) { - rfile.reset(new TimestampV1CompatFile); - open_ok = reinterpret_cast*>(rfile.get())->open(keyfile, valuefile, metafile, "rb"); + rfile.reset(new nv::merlin::LocalKVFile); + bool open_ok = reinterpret_cast*>( + rfile.get()) + ->open(keyfile, valuefile, metafile, "rb"); has_metas = true; + if (!open_ok) { + std::string error_msg = "Failed to load from file " + keyfile + ", " + + valuefile + ", " + metafile; + throw std::runtime_error(error_msg); + } } else { - rfile.reset(new KVOnlyFile); - open_ok = reinterpret_cast*>(rfile.get())->open(keyfile, valuefile, "rb"); + rfile.reset( + new RandomKVFile(fs, filepath, dim, buffer_size)); + status = reinterpret_cast*>(rfile.get()) + ->open(keyfile, valuefile, "rb"); } - if (!open_ok) { - std::string error_msg = "Failed to load from file to " + keyfile + ", " + valuefile + ", " + metafile; - throw std::runtime_error("Failed to "); + if (!status.ok()) { + std::string error_msg = "Failed to load from file " + keyfile + ", " + + valuefile + ", " + metafile + " " + + status.ToString(); + throw std::runtime_error(error_msg); } size_t n_loaded = table_->load(rfile.get(), buffer_size, stream); if (has_metas) { - LOG(INFO) << "[op] Load " << n_loaded << " pairs into keyfile: " - << keyfile << ", and valuefile: " << valuefile - << ", and metafile" << metafile; + LOG(INFO) << "[op] Load " << n_loaded + << " pairs from keyfile: " << keyfile + << ", and valuefile: " << valuefile << ", and metafile" + << metafile; } else { - LOG(INFO) << "[op] Load " << n_loaded << " pairs into keyfile: " - << keyfile << ", and valuefile: " << valuefile; + LOG(INFO) << "[op] Load " << n_loaded + << " pairs from keyfile: " << keyfile + << ", and valuefile: " << valuefile; } CUDA_CHECK(cudaStreamSynchronize(stream)); if (has_metas) { - reinterpret_cast*>(rfile.get())->close(); + reinterpret_cast*>(rfile.get()) + ->close(); } else { - reinterpret_cast*>(rfile.get())->close(); + reinterpret_cast*>(rfile.get())->close(); } } - void get(const K* d_keys, V* d_vals, bool* d_status, size_t len, - V* d_def_val, cudaStream_t stream, - bool is_full_size_default) const { + void get(const K* d_keys, V* d_vals, bool* d_status, size_t len, V* d_def_val, + cudaStream_t stream, bool is_full_size_default) const { if (is_full_size_default) { - CUDA_CHECK(cudaMemcpyAsync(d_vals, d_def_val, sizeof(V) * dim_ * len, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_vals, d_def_val, sizeof(V) * dim_ * len, + cudaMemcpyDeviceToDevice, stream)); } else { size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(len, block_size_); - gpu_fill_default_values<<>>(d_vals, d_def_val, len, dim_); + gpu_fill_default_values + <<>>( + d_vals, d_def_val, len, dim_); } table_->find(len, d_keys, d_vals, d_status, /*d_metas=*/nullptr, stream); } @@ -326,9 +677,7 @@ class TableWrapper { CUDA_CHECK(cudaStreamSynchronize(stream)); } - size_t get_size(cudaStream_t stream) const { - return table_->size(stream); - } + size_t get_size(cudaStream_t stream) const { return table_->size(stream); } size_t get_capacity() const { return table_->capacity(); } @@ -347,9 +696,12 @@ class TableWrapper { }; template -void CreateTableImpl(TableWrapper** pptable, TableWrapperInitOptions& options, - size_t runtime_dim) { +Status CreateTableImpl(TableWrapper** pptable, + TableWrapperInitOptions& options, + nv::merlin::BaseAllocator* allocator, + size_t runtime_dim) { *pptable = new TableWrapper(options, runtime_dim); + return (*pptable)->init(allocator); } } // namespace gpu diff --git 
a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc index 47660f73c..10878c3d7 100644 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc +++ b/tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv_impl.cu.cc @@ -13,24 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_gpu.h" +#include "tensorflow_recommenders_addons/dynamic_embedding/core/kernels/lookup_impl/lookup_table_op_hkv.h" + namespace tensorflow { namespace recommenders_addons { namespace lookup { namespace gpu { #define DEFINE_PURE_GPU_HASHTABLE(key_type, value_type) \ - template<> class TableWrapper - + template <> \ + class TableWrapper DEFINE_PURE_GPU_HASHTABLE(int64, float); +DEFINE_PURE_GPU_HASHTABLE(int64, int8); DEFINE_PURE_GPU_HASHTABLE(int64, int32); DEFINE_PURE_GPU_HASHTABLE(int64, int64); -DEFINE_PURE_GPU_HASHTABLE(int64, int64); +DEFINE_PURE_GPU_HASHTABLE(int64, Eigen::half); #undef DEFINE_PURE_GPU_HASHTABLE - } // namespace gpu } // namespace lookup } // namespace recommenders_addons diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD deleted file mode 100644 index b7d22e73b..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/BUILD +++ /dev/null @@ -1,42 +0,0 @@ -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configured") - -package(default_visibility = ["//visibility:public"]) - -load("//tensorflow_recommenders_addons:tensorflow_recommenders_addons.bzl", "custom_op_library", - "if_cuda_for_tf_serving", "tf_nvcc_binary") - -cc_library( - name = "merlin_kv", - hdrs = [ - "merlin/array_kernels.cuh", - "merlin/core_kernels.cuh", - "merlin/debug.hpp", - "merlin/flexible_buffer.cuh", - "merlin/memory_pool.cuh", - "merlin/group_lock.hpp", - #"merlin/initializers.cuh", - #"merlin/managed.cuh", - #"merlin/optimizers.cuh", - "merlin/types.cuh", - "merlin/utils.cuh", - "merlin_hashtable.cuh", - "merlin_localfile.hpp", - ], - deps = [ - "//tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils:cuda_utils", - ], -) - -tf_nvcc_binary( - name="merlin_kv_insert_and_evict_test", - srcs=[], - cuda_srcs=[ - "insert_and_evict_test.cu.cc", - "test_util.cu.h", - ], - cuda_deps=[ - ":merlin_kv", - ], - copts=[], - linkopts=[], -) diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh deleted file mode 100644 index cd11fb713..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/array_kernels.cuh +++ /dev/null @@ -1,345 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http:///www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" - -#include -#include "cuda_runtime.h" -#include "thrust/device_vector.h" -#include "thrust/execution_policy.h" -#include "thrust/scan.h" -#include "thrust/count.h" -#include "types.cuh" -#include "utils.cuh" - -namespace nv { -namespace merlin { - -template -__global__ void keys_not_empty(const K* keys, bool* masks, size_t n) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < n) { - masks[tid] = keys[tid] != EMPTY_KEY; - } -} - -template -__global__ void gpu_cell_count(const bool* masks, bool target, - Tidx* offsets, size_t n, size_t* n_existed) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - bool is_existed = false; - if (tid < n) { - if (masks[tid] == target) { - is_existed = true; - } - } - unsigned int vote = g.ballot(is_existed); - int g_ones = __popc((int)vote); - if (rank == 0 && tid < n) { - offsets[tid / TILE_SIZE] = static_cast(g_ones); - atomicAdd(static_cast(n_existed), static_cast(g_ones)); - } -} - -template -__global__ void gpu_select_key_kernel(const bool* masks, bool target, size_t n, - const Tidx* offsets, const K* __restrict keys, - K* __restrict outkeys, Tidx* outoffsets) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - bool is_existed = false; - if (tid < n) { - if (masks[tid] == target) { - is_existed = true; - } - } - unsigned int vote = g.ballot(is_existed); - unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); - - if (tid < n) { - r_vote = r_vote >> (TILE_SIZE - rank - 1); - if (masks[tid] == target) { - int prefix_n = __popc(r_vote) - 1; - Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); - outkeys[bias] = keys[tid]; - outoffsets[bias] = static_cast(tid); - } - } -} - -template -__global__ void gpu_select_kv_kernel(const bool* masks, bool target, size_t n, - const Tidx* offsets, - const K* __restrict keys, - V* __restrict values, - K* __restrict outkeys, - V* __restrict outvalues, - const size_t dim) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - bool is_existed = false; - if (tid < n) { - if (masks[tid] == target) { - is_existed = true; - } - } - unsigned int vote = g.ballot(is_existed); - unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); - - if (tid < n) { - r_vote = r_vote >> (TILE_SIZE - rank - 1); - if (masks[tid] == target) { - int prefix_n = __popc(r_vote) - 1; - Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); - outkeys[bias] = keys[tid]; - for (size_t i=0;i -__global__ void gpu_select_kvm_kernel(const bool* masks, size_t n, - const Tidx* offsets, K* __restrict keys, - V* __restrict values, M* __restrict metas, - const size_t dim) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - bool is_existed = false; 
- if (tid < n) { - if (masks[tid]) { - is_existed = true; - } - } - unsigned int vote = g.ballot(is_existed); - unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE); - K empty_key = (K)EMPTY_KEY; - - if (tid < n) { - r_vote = r_vote >> (TILE_SIZE - rank - 1); - if (masks[tid]) { - int prefix_n = __popc(r_vote) - 1; - Tidx bias = offsets[tid / TILE_SIZE] + static_cast(prefix_n); - - if (bias == tid) return; - - K target_key = 0; - AtomicKey* atomic_key = reinterpret_cast*>(keys) + bias; - while (target_key != empty_key) { - //target_key = atomicCAS(keys + bias, empty_key, keys[tid]); - target_key = empty_key; - atomic_key->compare_exchange_weak(target_key, keys[tid], - cuda::std::memory_order_relaxed, - cuda::std::memory_order_relaxed); - } - if (metas) metas[bias] = metas[tid]; - for (size_t j = 0; j < dim; j++) { - values[dim * bias + j] = values[dim * tid + j]; - } - //atomicExch(keys + tid, empty_key); - atomic_key = reinterpret_cast*>(keys) + tid; - atomic_key->store(empty_key, cuda::std::memory_order_relaxed); - } - } -} - -template -__global__ void gpu_select_kvm_kernel_v2(size_t n, - K* __restrict keys, - V* __restrict values, - M* __restrict metas, - K* __restrict tmp_keys, - V* __restrict tmp_values, - M* __restrict tmp_metas, - size_t* cnt, - const size_t dim) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < n) { - size_t offset = atomicAdd(cnt, 1llu); - tmp_keys[offset] = keys[tid]; - for (size_t j = 0; j < dim; j++) { - tmp_values[offset * dim + j] = values[tid * dim + j]; - } - if (metas) { - tmp_metas[offset] = metas[tid]; - } - } -} - -template -void gpu_pick_kvm_inplace(size_t grid_size, size_t block_size, const bool* masks, - bool target, size_t n, size_t* n_evicted, Tidx* offsets, - K* __restrict keys, V* __restrict values, - M* __restrict metas, size_t dim, cudaStream_t stream) { - size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; - gpu_cell_count - <<>>(masks, target, offsets, n, n_evicted); -#if THRUST_VERSION >= 101600 - auto policy = thrust::cuda::par_nosync.on(stream); -#else - auto policy = thrust::cuda::par.on(stream); -#endif - thrust::device_ptr d_src(offsets); - thrust::device_ptr d_dest(offsets); - thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); - if (target) { - gpu_select_kvm_kernel - <<>>(masks, n, offsets, - keys, values, metas, dim); - } else { - throw std::runtime_error("Not used"); - //gpu_select_kvm_kernel_reverse - // <<>>(masks, n, offsets, - // keys, values, metas, dim); - } -} - -template -size_t gpu_pick_kvm_v2(size_t grid_size, size_t block_size, - bool target, size_t n, size_t* n_evicted, - K* __restrict keys, V* __restrict values, - M* __restrict metas, size_t dim, cudaStream_t stream) { -#if THRUST_VERSION >= 101600 - auto policy = thrust::cuda::par_nosync.on(stream); -#else - auto policy = thrust::cuda::par.on(stream); -#endif - thrust::device_ptr d_src(keys); - int empty_cnt = thrust::count(policy, d_src, d_src + n, (K)EMPTY_KEY); - size_t h_cnt = n - static_cast(empty_cnt); - if (h_cnt == 0) { - return 0; - } - K* tmp_keys = nullptr; - V* tmp_values = nullptr; - M* tmp_metas = nullptr; - if (target) { - CUDA_CHECK(cudaMallocAsync(&tmp_keys, h_cnt * sizeof(K), stream)); - CUDA_CHECK(cudaMemsetAsync(tmp_keys, 0, h_cnt * sizeof(K), stream)); - CUDA_CHECK(cudaMallocAsync(&tmp_values, h_cnt * dim * sizeof(V), stream)); - CUDA_CHECK(cudaMemsetAsync(tmp_values, 0, h_cnt * dim * sizeof(V), stream)); - if (metas) { - CUDA_CHECK(cudaMallocAsync(&tmp_metas, h_cnt * sizeof(M), stream)); - 
CUDA_CHECK(cudaMemsetAsync(tmp_metas, 0, h_cnt * sizeof(M), stream)); - } - gpu_select_kvm_kernel_v2 - <<>>(n, - keys, values, metas, tmp_keys, tmp_values, tmp_metas, n_evicted, dim); - CUDA_CHECK(cudaMemcpyAsync(keys, tmp_keys, h_cnt * sizeof(K), cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(values, tmp_values, h_cnt * dim * sizeof(V), cudaMemcpyDeviceToDevice, stream)); - if(metas) { - CUDA_CHECK(cudaMemcpyAsync(metas, tmp_metas, h_cnt * sizeof(M), cudaMemcpyDeviceToDevice, stream)); - } - CUDA_CHECK(cudaFreeAsync(tmp_keys, stream)); - CUDA_CHECK(cudaFreeAsync(tmp_values, stream)); - if (tmp_metas) { - CUDA_CHECK(cudaFreeAsync(tmp_metas, stream)); - } - CUDA_CHECK(cudaStreamSynchronize(stream)); - } else { - throw std::runtime_error("Not used"); - //gpu_select_kvm_kernel_reverse - // <<>>(masks, n, offsets, - // keys, values, metas, dim); - } - return h_cnt; -} - -template -void gpu_pick_kvm_inplace_wrap(const bool* masks, bool target, - size_t n, size_t* n_evicted, - K* __restrict keys, V* __restrict values, - M* __restrict metas, size_t dim, cudaStream_t stream) { - size_t block_size = 256; - size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); - size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; - int64_t* offsets = nullptr; - CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); - gpu_pick_kvm_inplace(grid_size, block_size, - masks, target, n, n_evicted, offsets, keys, values, metas, dim, stream); - CUDA_CHECK(cudaFreeAsync(offsets, stream)); -} - -template -void gpu_pick_keys(const bool* masks, bool target, size_t n, size_t* n_evicted, - const K* __restrict keys, K* __restrict outkeys, - int64_t* outoffsets, cudaStream_t stream) { - size_t block_size = 256; - size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); - size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; - int64_t* offsets = nullptr; - CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); - CUDA_CHECK(cudaMemsetAsync(offsets, 0, sizeof(int64_t) * n_offsets, stream)); - - gpu_cell_count - <<>>(masks, target, offsets, n, n_evicted); -#if THRUST_VERSION >= 101600 - auto policy = thrust::cuda::par_nosync.on(stream); -#else - auto policy = thrust::cuda::par.on(stream); -#endif - thrust::device_ptr d_src(offsets); - thrust::device_ptr d_dest(offsets); - thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); - gpu_select_key_kernel - <<>>(masks, target, n, offsets, - keys, outkeys, outoffsets); - CUDA_CHECK(cudaFreeAsync(offsets, stream)); -} - -template -void gpu_pick_kvs(const bool* masks, bool target, size_t n, size_t* n_evicted, - const K* __restrict keys, - V* __restrict values, - K* __restrict outkeys, - V* __restrict outvalues, - size_t dim, - cudaStream_t stream) { - size_t block_size = 256; - size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); - size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; - int64_t* offsets = nullptr; - CUDA_CHECK(cudaMallocAsync(&offsets, sizeof(int64_t) * n_offsets, stream)); - CUDA_CHECK(cudaMemsetAsync(offsets, 0, sizeof(int64_t) * n_offsets, stream)); - - gpu_cell_count - <<>>(masks, target, offsets, n, n_evicted); -#if THRUST_VERSION >= 101600 - auto policy = thrust::cuda::par_nosync.on(stream); -#else - auto policy = thrust::cuda::par.on(stream); -#endif - thrust::device_ptr d_src(offsets); - thrust::device_ptr d_dest(offsets); - thrust::exclusive_scan(policy, d_src, d_src + n_offsets, d_dest); - gpu_select_kv_kernel - <<>>(masks, target, n, offsets, - keys, values, outkeys, outvalues, dim); - 
CUDA_CHECK(cudaFreeAsync(offsets, stream)); -} -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh deleted file mode 100644 index 43504dd68..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/core_kernels.cuh +++ /dev/null @@ -1,2510 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http:///www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "types.cuh" -#include "utils.cuh" - -using namespace cooperative_groups; -namespace cg = cooperative_groups; - -namespace nv { -namespace merlin { - -/* For improving performance consideration, allocating up to 64 table structures - * in constant memory is supported. To close this function, please set - * `TableOption::use_constant_memory` to `false`. - */ -constexpr int MAX_CONSTANT_TABLE = 64; -static std::mutex constant_table_mutex; -static uint64_t constant_table_flag = 0; - -__constant__ char - c_table_[sizeof(Table) * MAX_CONSTANT_TABLE]; - -template -int allocate_constant_table() { - std::lock_guard guard(constant_table_mutex); - if (constant_table_flag == std::numeric_limits::max()) return -1; - int table_index = 0; - while (constant_table_flag & (1l << table_index)) { - table_index++; - } - - constant_table_flag = constant_table_flag | (1l << table_index); - - return table_index; -} - -template -void release_constant_table(int table_index) { - std::lock_guard guard(constant_table_mutex); - if (table_index < 0 || table_index >= MAX_CONSTANT_TABLE) return; - constant_table_flag = constant_table_flag & (~(1l << table_index)); -} - -template -__global__ void create_locks(M* __restrict mutex, const size_t start, - const size_t end) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (start + tid < end) { - new (mutex + start + tid) M(); - } -} - -template -__global__ void release_locks(M* __restrict mutex, const size_t start, - const size_t end) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (start + tid < end) { - (mutex + start + tid)->~M(); - } -} - -template -__global__ void create_atomic_keys(Bucket* __restrict buckets, - const size_t start, const size_t end, - const size_t bucket_max_size) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (start + tid < end) { - for (size_t i = 0; i < bucket_max_size; i++) - new (buckets[start + tid].keys(i)) - AtomicKey{static_cast(EMPTY_KEY)}; - } -} - -template -__global__ void create_atomic_metas(Bucket* __restrict buckets, - const size_t start, const size_t end, - const size_t bucket_max_size) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (start + tid < end) { - for (size_t i = 0; i < bucket_max_size; i++) { - new (buckets[start + tid].metas(i)) - AtomicMeta{static_cast(EMPTY_META)}; - } - new (&(buckets[start + 
tid].cur_meta)) - AtomicMeta{static_cast(EMPTY_META)}; - new (&(buckets[start + tid].min_meta)) - AtomicMeta{static_cast(EMPTY_META)}; - new (&(buckets[start + tid].min_pos)) AtomicPos{1}; - } -} - -/* Initialize the buckets with index from start to end. */ -template -void initialize_buckets(Table** table, const size_t start, - const size_t end) { - /* As testing results show us, when the number of buckets is greater than - * the 4 million the performance will drop significantly, we believe the - * to many pinned memory allocation causes this issue, so we change the - * strategy to allocate some memory slices whose size is not greater than - * 64GB, and put the buckets pointer point to the slices. - */ - MERLIN_CHECK(start < end, - "initialize_buckets, start should be less than end!"); - size_t buckets_num = end - start; - const size_t total_size_of_vectors = - buckets_num * (*table)->bucket_max_size * sizeof(V) * (*table)->dim; - const size_t num_of_memory_slices = - 1 + (total_size_of_vectors - 1) / (*table)->bytes_per_slice; - size_t num_of_buckets_in_one_slice = - (*table)->bytes_per_slice / - ((*table)->bucket_max_size * sizeof(V) * (*table)->dim); - size_t num_of_allocated_buckets = 0; - - realloc_managed( - &((*table)->slices), (*table)->num_of_memory_slices * sizeof(V*), - ((*table)->num_of_memory_slices + num_of_memory_slices) * sizeof(V*)); - - for (size_t i = (*table)->num_of_memory_slices; - i < (*table)->num_of_memory_slices + num_of_memory_slices; i++) { - if (i == (*table)->num_of_memory_slices + num_of_memory_slices - 1) { - num_of_buckets_in_one_slice = buckets_num - num_of_allocated_buckets; - } - size_t slice_real_size = num_of_buckets_in_one_slice * - (*table)->bucket_max_size * sizeof(V) * - (*table)->dim; - if ((*table)->remaining_hbm_for_vectors >= slice_real_size) { - CUDA_CHECK(cudaMalloc(&((*table)->slices[i]), slice_real_size)); - (*table)->remaining_hbm_for_vectors -= slice_real_size; - } else { - (*table)->is_pure_hbm = false; - CUDA_CHECK( - cudaMallocHost(&((*table)->slices[i]), slice_real_size, - cudaHostAllocMapped | cudaHostAllocWriteCombined)); - } - for (int j = 0; j < num_of_buckets_in_one_slice; j++) { - (*table)->buckets[start + num_of_allocated_buckets + j].vectors = - (*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim; - } - num_of_allocated_buckets += num_of_buckets_in_one_slice; - } - - (*table)->num_of_memory_slices += num_of_memory_slices; - for (int i = start; i < end; i++) { - CUDA_CHECK(cudaMalloc(&((*table)->buckets[i].keys_), - (*table)->bucket_max_size * sizeof(AtomicKey))); - CUDA_CHECK(cudaMalloc(&((*table)->buckets[i].metas_), - (*table)->bucket_max_size * sizeof(AtomicMeta))); - } - - { - const size_t block_size = 512; - const size_t N = end - start + 1; - const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); - create_locks<<>>((*table)->locks, start, end); - } - - { - const size_t block_size = 512; - const size_t N = end - start + 1; - const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); - create_atomic_keys<<>>( - (*table)->buckets, start, end, (*table)->bucket_max_size); - } - - { - const size_t block_size = 512; - const size_t N = end - start + 1; - const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); - create_atomic_metas<<>>( - (*table)->buckets, start, end, (*table)->bucket_max_size); - } - CudaCheckError(); -} - -template -size_t get_slice_size(Table** table) { - const size_t min_slice_size = - (*table)->bucket_max_size * sizeof(V) * (*table)->dim; - const size_t max_table_size = (*table)->max_size * 
sizeof(V) * (*table)->dim; - size_t slice_size = 0; - - if (max_table_size >= GB(16)) { - slice_size = GB(2); - } else if (max_table_size >= GB(2)) { - slice_size = MB(128); - } else if (max_table_size >= MB(128)) { - slice_size = MB(16); - } else if (max_table_size >= MB(16)) { - slice_size = MB(1); - } else { - slice_size = min_slice_size; - } - - return std::max(min_slice_size, slice_size); -} - -/* Initialize a Table struct. - - K: The key type - V: The value type which should be static array type and C++ class - with customized construct is not supported. - M: The meta type, the meta will be used to store the timestamp - or occurrence frequency or any thing for eviction. - DIM: Vector dimension. -*/ -template -void create_table(Table** table, const size_t dim, - const size_t init_size = 134217728, - const size_t max_size = std::numeric_limits::max(), - const size_t max_hbm_for_vectors = 0, - const size_t bucket_max_size = 128, - const size_t tile_size = 32, const bool primary = true) { - CUDA_CHECK(cudaMallocManaged((void**)table, sizeof(Table))); - CUDA_CHECK(cudaMemset(*table, 0, sizeof(Table))); - (*table)->dim = dim; - (*table)->bucket_max_size = bucket_max_size; - (*table)->max_size = std::max(init_size, max_size); - (*table)->tile_size = tile_size; - (*table)->is_pure_hbm = true; - (*table)->bytes_per_slice = get_slice_size(table); - - // The bucket number will be the minimum needed for saving memory if no - // rehash. - if ((init_size * 2) > (*table)->max_size) { - (*table)->buckets_num = - 1 + (((*table)->max_size - 1) / (*table)->bucket_max_size); - } else { - (*table)->buckets_num = 1; - while ((*table)->buckets_num * (*table)->bucket_max_size < init_size) { - (*table)->buckets_num *= 2; - } - } - - (*table)->capacity = (*table)->buckets_num * (*table)->bucket_max_size; - (*table)->max_hbm_for_vectors = max_hbm_for_vectors; - (*table)->remaining_hbm_for_vectors = max_hbm_for_vectors; - (*table)->primary = primary; - - CUDA_CHECK(cudaMalloc((void**)&((*table)->locks), - (*table)->buckets_num * sizeof(Mutex))); - CUDA_CHECK( - cudaMemset((*table)->locks, 0, (*table)->buckets_num * sizeof(Mutex))); - - CUDA_CHECK(cudaMalloc((void**)&((*table)->buckets_size), - (*table)->buckets_num * sizeof(int))); - CUDA_CHECK(cudaMemset((*table)->buckets_size, 0, - (*table)->buckets_num * sizeof(int))); - - CUDA_CHECK( - cudaMallocManaged((void**)&((*table)->buckets), - (*table)->buckets_num * sizeof(Bucket))); - CUDA_CHECK(cudaMemset((*table)->buckets, 0, - (*table)->buckets_num * sizeof(Bucket))); - - initialize_buckets(table, 0, (*table)->buckets_num); - CudaCheckError(); -} - -/* Double the capacity on storage, must be followed by calling the - * rehash_kernel. */ -template -void double_capacity(Table** table) { - realloc(&((*table)->locks), (*table)->buckets_num * sizeof(Mutex), - (*table)->buckets_num * sizeof(Mutex) * 2); - realloc(&((*table)->buckets_size), (*table)->buckets_num * sizeof(int), - (*table)->buckets_num * sizeof(int) * 2); - - realloc_managed*>( - &((*table)->buckets), (*table)->buckets_num * sizeof(Bucket), - (*table)->buckets_num * sizeof(Bucket) * 2); - - initialize_buckets(table, (*table)->buckets_num, - (*table)->buckets_num * 2); - - (*table)->capacity *= 2; - (*table)->buckets_num *= 2; -} - -/* free all of the resource of a Table. 
*/ -template -void destroy_table(Table** table) { - for (int i = 0; i < (*table)->buckets_num; i++) { - CUDA_CHECK(cudaFree((*table)->buckets[i].keys_)); - CUDA_CHECK(cudaFree((*table)->buckets[i].metas_)); - } - - for (int i = 0; i < (*table)->num_of_memory_slices; i++) { - if (is_on_device((*table)->slices[i])) { - CUDA_CHECK(cudaFree((*table)->slices[i])); - } else { - CUDA_CHECK(cudaFreeHost((*table)->slices[i])); - } - } - { - const size_t block_size = 512; - const size_t N = (*table)->buckets_num; - const int grid_size = SAFE_GET_GRID_SIZE(N, block_size); - release_locks - <<>>((*table)->locks, 0, (*table)->buckets_num); - } - CUDA_CHECK(cudaFree((*table)->slices)); - CUDA_CHECK(cudaFree((*table)->buckets_size)); - CUDA_CHECK(cudaFree((*table)->buckets)); - CUDA_CHECK(cudaFree((*table)->locks)); - CUDA_CHECK(cudaFree(*table)); - CUDA_CHECK(cudaDeviceSynchronize()); - CudaCheckError(); -} - -template -__forceinline__ __device__ void defragmentation_for_rehash( - Bucket* __restrict bucket, uint32_t remove_pos, - const size_t bucket_max_size, const size_t buckets_num, const size_t dim) { - uint32_t key_idx; - size_t global_idx = 0; - size_t start_idx = 0; - K find_key; - K hashed_key; - - uint32_t empty_pos = remove_pos; - - int i = 1; - while (i < bucket_max_size) { - key_idx = (remove_pos + i) & (bucket_max_size - 1); - find_key = (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); - if (find_key == static_cast(EMPTY_KEY)) { - break; - } - hashed_key = Murmur3HashDevice(find_key); - global_idx = hashed_key % (buckets_num * bucket_max_size); - start_idx = global_idx % bucket_max_size; - - if ((start_idx <= empty_pos && empty_pos < key_idx) || - (key_idx < start_idx && start_idx <= empty_pos) || - (empty_pos <= key_idx && key_idx < start_idx)) { - const K key = - (*(bucket->keys(key_idx))).load(cuda::std::memory_order_relaxed); - (*(bucket->keys(empty_pos))).store(key, cuda::std::memory_order_relaxed); - const M meta = - (*(bucket->metas(key_idx))).load(cuda::std::memory_order_relaxed); - (*(bucket->metas(empty_pos))) - .store(meta, cuda::std::memory_order_relaxed); - for (int j = 0; j < dim; j++) { - bucket->vectors[empty_pos * dim + j] = - bucket->vectors[key_idx * dim + j]; - } - (*(bucket->keys(key_idx))) - .store(static_cast(EMPTY_KEY), cuda::std::memory_order_relaxed); - empty_pos = key_idx; - remove_pos = key_idx; - i = 1; - } else { - i++; - } - } -} - -template -__forceinline__ __device__ void refresh_bucket_meta( - cg::thread_block_tile g, Bucket* bucket, - const size_t bucket_max_size) { - M min_val = MAX_META; - int min_pos = 0; - - for (int i = g.thread_rank(); i < bucket_max_size; i += TILE_SIZE) { - const K key = (bucket->keys(i))->load(cuda::std::memory_order_relaxed); - if (key == static_cast(EMPTY_KEY) || - key == static_cast(RECLAIM_KEY)) { - continue; - } - const M meta = bucket->metas(i)->load(cuda::std::memory_order_relaxed); - if (meta < min_val) { - min_pos = i; - min_val = meta; - } - } - M global_min_val = cg::reduce(g, min_val, cg::less()); - if (min_val == global_min_val) { - bucket->min_pos.store(min_pos, cuda::std::memory_order_relaxed); - bucket->min_meta.store(min_val, cuda::std::memory_order_relaxed); - } -} - -template -__device__ __forceinline__ void copy_vector( - cg::thread_block_tile const& g, const V* src, V* dst, - const size_t dim) { - for (auto i = g.thread_rank(); i < dim; i += g.size()) { - dst[i] = src[i]; - } - - // cuda::barrier bar; - // init(&bar, 1); - // cuda::memcpy_async(g, dst, src, dim * sizeof(V), bar); - // - // 
bar.arrive_and_wait(); -} - -/* Write the N data from src to each address in *dst by using CPU threads, - * usually called by upsert kernel. - * - * @note: In some machines with AMD CPUs, the `write_kernel` has low performance - * thru PCI-E, so we try to use the `memcpy` on CPU threads for writing work to - * reach better performance. - */ -template -void write_by_cpu(V** __restrict dst, const V* __restrict src, - const int* __restrict offset, size_t dim, int N, - int n_worker = 16) { - std::vector thds; - if (n_worker < 1) n_worker = 1; - - auto functor = [dim](V** __restrict dst, const V* __restrict src, - const int* __restrict offset, int handled_size, - int trunk_size) -> void { - for (int i = handled_size; i < handled_size + trunk_size; i++) { - if (dst[i] != nullptr) { - memcpy(dst[i], src + offset[i] * dim, sizeof(V) * dim); - } - } - }; - - int32_t trunk_size_floor = N / n_worker; - int32_t trunk_size_remain = N % n_worker; - int32_t n_worker_used = trunk_size_floor == 0 ? trunk_size_remain : n_worker; - - size_t handled_size = 0; - for (int i = 0; i < n_worker_used; i++) { - int32_t cur_trunk_size = trunk_size_floor; - if (trunk_size_remain != 0) { - cur_trunk_size += 1; - trunk_size_remain--; - } - thds.push_back( - std::thread(functor, dst, src, offset, handled_size, cur_trunk_size)); - handled_size += cur_trunk_size; - } - - for (int i = 0; i < n_worker_used; i++) { - thds[i].join(); - } -} - -template -__forceinline__ __device__ void move_key_to_new_bucket( - cg::thread_block_tile g, int rank, const K& key, const M& meta, - const V* __restrict vector, Bucket* __restrict new_bucket, - const size_t new_bkt_idx, const size_t new_start_idx, - int* __restrict buckets_size, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim) { - uint32_t key_pos; - unsigned empty_vote; - int local_size; - int src_lane; - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - size_t key_offset = - (new_start_idx + tile_offset + rank) & (bucket_max_size - 1); - const K current_key = - (*(new_bucket->keys(key_offset))).load(cuda::std::memory_order_relaxed); - empty_vote = g.ballot(current_key == static_cast(EMPTY_KEY)); - if (empty_vote) { - src_lane = __ffs(empty_vote) - 1; - key_pos = - (new_start_idx + tile_offset + src_lane) & (bucket_max_size - 1); - local_size = buckets_size[new_bkt_idx]; - if (rank == src_lane) { - new_bucket->keys(key_pos)->store(key, cuda::std::memory_order_relaxed); - new_bucket->metas(key_pos)->store(meta, - cuda::std::memory_order_relaxed); - atomicAdd(&(buckets_size[new_bkt_idx]), 1); - } - local_size = g.shfl(local_size, src_lane); - if (local_size >= bucket_max_size) { - refresh_bucket_meta(g, new_bucket, bucket_max_size); - } - copy_vector(g, vector, new_bucket->vectors + key_pos * dim, - dim); - break; - } - } -} - -template -__global__ void rehash_kernel_for_fast_mode( - const Table* __restrict table, size_t N) { - Bucket* buckets = table->buckets; - int* __restrict buckets_size = table->buckets_size; - const size_t bucket_max_size = table->bucket_max_size; - const size_t buckets_num = table->buckets_num; - const size_t dim = table->dim; - - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - size_t global_idx; - uint32_t start_idx = 0; - K target_key = 0; - M target_meta = 0; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - uint32_t bkt_idx = t / TILE_SIZE; - Bucket* bucket = (buckets + bkt_idx); 
- - lock(g, table->locks[bkt_idx]); - uint32_t key_idx = 0; - while (key_idx < bucket_max_size) { - key_idx = g.shfl(key_idx, 0); - target_key = - (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); - target_meta = - bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed); - if (target_key != static_cast(EMPTY_KEY) && - target_key != static_cast(RECLAIM_KEY)) { - K hashed_key = Murmur3HashDevice(target_key); - global_idx = hashed_key % (buckets_num * bucket_max_size); - uint32_t new_bkt_idx = global_idx / bucket_max_size; - if (new_bkt_idx != bkt_idx) { - start_idx = global_idx % bucket_max_size; - move_key_to_new_bucket( - g, rank, target_key, target_meta, - (bucket->vectors + key_idx * dim), buckets + new_bkt_idx, - new_bkt_idx, start_idx, buckets_size, bucket_max_size, - buckets_num, table->dim); - if (rank == 0) { - (bucket->keys(key_idx)) - ->store(static_cast(EMPTY_KEY), - cuda::std::memory_order_relaxed); - atomicSub(&(buckets_size[bkt_idx]), 1); - defragmentation_for_rehash( - bucket, key_idx, bucket_max_size, buckets_num / 2, dim); - key_idx = 0; - } - } else { - key_idx++; - } - } else { - key_idx++; - } - } - unlock(g, table->locks[bkt_idx]); - } -} - -/* Write the N data from src to each address in *dst, - usually called by upsert kernel. - - `src`: A continuous memory pointer with Vector - which can be HBM. - `dst`: A pointer of pointer to V which should be on HBM, - but each value (a pointer of V) could point to a - memory on HBM or HMEM. - `N`: Number of vectors that need to be written. -*/ -template -__global__ void write_kernel(const V* __restrict src, V** __restrict dst, - const int* __restrict src_offset, const size_t dim, - const size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int dim_index = t % dim; - - if (dst[vec_index] != nullptr) { - if (src_offset != nullptr) { - dst[vec_index][dim_index] = - src[src_offset[vec_index] * dim + dim_index]; - } else { - dst[vec_index][dim_index] = src[vec_index * dim + dim_index]; - } - } - } -} - -/* Write the values of delta_or_val into the table. If the key[i] is already in - the table indicted be @exists[i], a @delta_or_val[i] will be added to the the - existing value. if the key not exists, the value @val_or_delta[i] will be - assigned to the address @dst[i]. - - `delta_or_val`: will be treated as val and accumlating should be executed. - `dst`: A pointer of pointer to V which should be on HBM, - but each value (a pointer of V) could point to a - memory on HBM or HMEM. - `existed`: If the keys existed before this kernel is executed. - `status`: The existence status for each key when the kernel is being - executed. - - `N`: number of vectors needed to be writen. 
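A small host-side sketch (not from this diff) of the property `double_capacity()` and `rehash_kernel_for_fast_mode` rely on: because a slot index is `hash % (buckets_num * bucket_max_size)`, doubling `buckets_num` sends every key either to its old bucket or to `old_bucket + old_buckets_num`, never anywhere else. `std::hash` stands in for `Murmur3HashDevice` here and the sizes are arbitrary.

```cpp
// Illustrative sketch only: verifies the bucket-index invariant used by the
// rehash kernel above. std::hash is a stand-in for Murmur3HashDevice.
#include <cassert>
#include <cstdint>
#include <functional>

int main() {
  const size_t bucket_max_size = 128;
  const size_t old_buckets_num = 1024;            // before double_capacity()
  const size_t new_buckets_num = old_buckets_num * 2;

  for (uint64_t key = 0; key < 100000; ++key) {
    const uint64_t h = std::hash<uint64_t>{}(key);
    const size_t old_bkt =
        (h % (old_buckets_num * bucket_max_size)) / bucket_max_size;
    const size_t new_bkt =
        (h % (new_buckets_num * bucket_max_size)) / bucket_max_size;
    // A key either stays put or moves exactly old_buckets_num buckets ahead,
    // which is why the kernel only needs to test new_bkt_idx != bkt_idx.
    assert(new_bkt == old_bkt || new_bkt == old_bkt + old_buckets_num);
  }
}
```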
-*/ -template -__global__ void write_with_accum_kernel(const V* __restrict delta_or_val, - V** __restrict dst, - const bool* __restrict existed, - const bool* __restrict status, - const int* __restrict src_offset, - const size_t dim, size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int dim_index = t % dim; - - if (dst[vec_index] != nullptr && - existed[src_offset[vec_index]] == status[src_offset[vec_index]]) { - if (status[src_offset[vec_index]]) { - dst[vec_index][dim_index] += - delta_or_val[src_offset[vec_index] * dim + dim_index]; - } else { - dst[vec_index][dim_index] = - delta_or_val[src_offset[vec_index] * dim + dim_index]; - } - } - } -} - -/* Add a @delta[i] to the the value saved in the address @dst[i]. - - `delta`: a delta value which should be add to. - `dst`: A pointer of pointer to V which should be on HBM, - but each value (a pointer of V) could point to a - memory on HBM or HMEM. - `N`: number of vectors needed to be writen. -*/ -template -__global__ void write_with_accum_kernel(const V* __restrict delta, - V** __restrict dst, - const int* __restrict src_offset, - const size_t dim, size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int dim_index = t % dim; - - if (dst[vec_index] != nullptr) { - dst[vec_index][dim_index] += - delta[src_offset[vec_index] * dim + dim_index]; - } - } -} - -/* Read the N data from src to each address in *dst, - usually called by upsert kernel. - - `src`: A pointer of pointer of V which should be on HBM, - but each value (a pointer of V) could point to a - memory on HBM or HMEM. - `dst`: A continue memory pointer with Vector - which should be HBM. - `mask`: One for each `dst`. If true, reading from src, - or false reading from default_val. - `default_val`: Default value with shape (1, DIM) or (N, DIM) - `N`: The number of vectors needed to be read. - 'full_size_default': - If true, the d_def_val will be treated as - a full size default value which shape must be (N, DIM). -*/ -template -__global__ void read_kernel(const V* const* __restrict src, V* __restrict dst, - const bool* mask, const int* __restrict dst_offset, - const size_t dim, size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int dim_index = t % dim; - int real_dst_offset = - dst_offset != nullptr ? dst_offset[vec_index] : vec_index; - - /// Copy selected values and fill in default value for all others. - if (mask[real_dst_offset] && src[vec_index] != nullptr) { - dst[real_dst_offset * dim + dim_index] = src[vec_index][dim_index]; - } - } -} - -/* Read the N data from src to each address in *dst, - * usually called by upsert kernel. - * - * `src`: A pointer of pointer of V which should be on HBM, - * but each value (a pointer of V) could point to a - * memory on HBM or HMEM. - * `dst`: A continue memory pointer with Vector - * which should be HBM. - * `N`: Number of vectors needed to be read. - */ -template -__global__ void read_kernel(const V** __restrict src, V* __restrict dst, - const int* __restrict dst_offset, const size_t dim, - const size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int real_dst_offset = - dst_offset != nullptr ? 
dst_offset[vec_index] : vec_index; - int dim_index = t % dim; - if (src[vec_index] != nullptr) { - dst[real_dst_offset * dim + dim_index] = src[vec_index * dim + dim_index]; - } - } -} - -template -__device__ __forceinline__ unsigned find_in_bucket( - cg::thread_block_tile g, Bucket* bucket, - const K& find_key, uint32_t& tile_offset, const uint32_t& start_idx, - const size_t& bucket_max_size) { - uint32_t key_pos = 0; - -#pragma unroll - for (tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = - (start_idx + tile_offset + g.thread_rank()) & (bucket_max_size - 1); - auto const current_key = - bucket->keys(key_pos)->load(cuda::std::memory_order_relaxed); - auto const found_vote = g.ballot(find_key == current_key); - if (found_vote) { - return found_vote; - } - - if (g.any(current_key == static_cast(EMPTY_KEY))) { - return 0; - } - } - return 0; -} - -template -__device__ __forceinline__ OccupyResult find_without_lock( - cg::thread_block_tile g, Bucket* __restrict__ bucket, - const K desired_key, const size_t start_idx, int& key_pos, int& src_lane, - const size_t bucket_max_size) { - K expected_key = static_cast(EMPTY_KEY); - - AtomicKey* current_key; - - unsigned vote = 0; - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - current_key = bucket->keys(key_pos); - - expected_key = current_key->load(cuda::std::memory_order_relaxed); - vote = g.ballot(desired_key == expected_key); - if (vote) { - src_lane = __ffs(vote) - 1; - key_pos = g.shfl(key_pos, src_lane); - return OccupyResult::DUPLICATE; - } - vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); - if (vote) break; - } - return OccupyResult::CONTINUE; -} - -template -__device__ __inline__ OccupyResult find_and_lock_when_vacant( - cg::thread_block_tile g, Bucket* __restrict__ bucket, - const K desired_key, const M desired_meta, K& evicted_key, - const size_t start_idx, int& key_pos, int& src_lane, - const size_t bucket_max_size) { - K expected_key = static_cast(EMPTY_KEY); - - AtomicKey* current_key; - AtomicMeta* current_meta; - - K local_min_meta_key = static_cast(EMPTY_KEY); - - M local_min_meta_val = MAX_META; - M temp_min_meta_val = MAX_META; - int local_min_meta_pos = -1; - - unsigned vote = 0; - bool result = false; - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - current_key = bucket->keys(key_pos); - - // Step 1: try find and lock the desired_key. - do { - expected_key = desired_key; - result = current_key->compare_exchange_strong( - expected_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - vote = g.ballot(result); - if (vote) { - src_lane = __ffs(vote) - 1; - key_pos = g.shfl(key_pos, src_lane); - return OccupyResult::DUPLICATE; - } - vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); - if (vote) break; - vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); - } while (vote != 0); - - // Step 2: (TBD)try find empty location. 
- while (vote) { - src_lane = __ffs(vote) - 1; - if (src_lane == g.thread_rank()) { - expected_key = static_cast(EMPTY_KEY); - result = current_key->compare_exchange_strong( - expected_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - } - result = g.shfl(result, src_lane); - if (result) { - key_pos = g.shfl(key_pos, src_lane); - return OccupyResult::OCCUPIED_EMPTY; - } - vote -= ((unsigned(0x1)) << src_lane); - } - } - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - current_meta = bucket->metas(key_pos); - - // Step 4: record min meta location. - temp_min_meta_val = current_meta->load(cuda::std::memory_order_relaxed); - if (temp_min_meta_val < local_min_meta_val) { - expected_key = - bucket->keys(key_pos)->load(cuda::std::memory_order_relaxed); - if (expected_key != static_cast(LOCKED_KEY) && - expected_key != static_cast(EMPTY_KEY)) { - local_min_meta_key = expected_key; - local_min_meta_val = temp_min_meta_val; - local_min_meta_pos = key_pos; - } - } - } - // Step 5: insert by evicting some one. - const M global_min_meta_val = - cg::reduce(g, local_min_meta_val, cg::less()); - if (desired_meta < global_min_meta_val) { - return OccupyResult::REFUSED; - } - vote = g.ballot(local_min_meta_val <= global_min_meta_val); - if (vote) { - src_lane = __ffs(vote) - 1; - result = false; - if (src_lane == g.thread_rank()) { - // TBD: Here can be compare_exchange_weak. Do benchmark. - current_key = bucket->keys(local_min_meta_pos); - current_meta = bucket->metas(local_min_meta_pos); - evicted_key = local_min_meta_key; - result = current_key->compare_exchange_strong( - local_min_meta_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - - // Need to recover when fail. - if (result && (current_meta->load(cuda::std::memory_order_relaxed) > - global_min_meta_val)) { - current_key->store(local_min_meta_key, cuda::std::memory_order_relaxed); - result = false; - } - } - result = g.shfl(result, src_lane); - if (result) { - // Not every `evicted_key` is correct expect the `src_lane` thread. - key_pos = g.shfl(local_min_meta_pos, src_lane); - return (evicted_key == static_cast(RECLAIM_KEY)) - ? OccupyResult::OCCUPIED_RECLAIMED - : OccupyResult::EVICT; - } - } - return OccupyResult::CONTINUE; -} - -template -__device__ __forceinline__ OccupyResult find_and_lock_when_full( - cg::thread_block_tile g, Bucket* __restrict__ bucket, - const K desired_key, const M desired_meta, K& evicted_key, - const size_t start_idx, int& key_pos, int& src_lane, - const size_t bucket_max_size) { - K expected_key = static_cast(EMPTY_KEY); - - AtomicKey* current_key; - AtomicMeta* current_meta; - - K local_min_meta_key = static_cast(EMPTY_KEY); - - M local_min_meta_val = MAX_META; - M temp_min_meta_val = MAX_META; - int local_min_meta_pos = -1; - - unsigned vote = 0; - bool result = false; - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - current_key = bucket->keys(key_pos); - - // Step 1: try find and lock the desired_key. 
- do { - expected_key = desired_key; - result = current_key->compare_exchange_strong( - expected_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - vote = g.ballot(result); - if (vote) { - src_lane = __ffs(vote) - 1; - key_pos = g.shfl(key_pos, src_lane); - return OccupyResult::DUPLICATE; - } - vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); - } while (vote != 0); - } - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - // Step 2: record min meta location. - temp_min_meta_val = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - if (temp_min_meta_val < local_min_meta_val) { - while ((expected_key = bucket->keys(key_pos)->load( - cuda::std::memory_order_relaxed)) == - static_cast(LOCKED_KEY)) - ; - local_min_meta_key = expected_key; - local_min_meta_val = temp_min_meta_val; - local_min_meta_pos = key_pos; - } - } - - // Step 3: insert by evicting some one. - const M global_min_meta_val = - cg::reduce(g, local_min_meta_val, cg::less()); - if (desired_meta < global_min_meta_val) { - return OccupyResult::REFUSED; - } - vote = g.ballot(local_min_meta_val <= global_min_meta_val); - if (vote) { - src_lane = __ffs(vote) - 1; - result = false; - if (src_lane == g.thread_rank()) { - // TBD: Here can be compare_exchange_weak. Do benchmark. - current_key = bucket->keys(local_min_meta_pos); - current_meta = bucket->metas(local_min_meta_pos); - evicted_key = local_min_meta_key; - result = current_key->compare_exchange_strong( - local_min_meta_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - - // Need to recover when fail. - if (result && (current_meta->load(cuda::std::memory_order_relaxed) > - global_min_meta_val)) { - current_key->store(local_min_meta_key, cuda::std::memory_order_relaxed); - result = false; - } - } - result = g.shfl(result, src_lane); - if (result) { - // Not every `evicted_key` is correct expect the `src_lane` thread. - key_pos = g.shfl(local_min_meta_pos, src_lane); - return (evicted_key == static_cast(RECLAIM_KEY)) - ? OccupyResult::OCCUPIED_RECLAIMED - : OccupyResult::EVICT; - } - } - return OccupyResult::CONTINUE; -} - -template -__device__ __forceinline__ OccupyResult find_and_lock_for_update( - cg::thread_block_tile g, Bucket* __restrict__ bucket, - const K desired_key, const size_t start_idx, int& key_pos, int& src_lane, - const size_t bucket_max_size) { - K expected_key = static_cast(EMPTY_KEY); - - AtomicKey* current_key; - - unsigned vote = 0; - bool result = false; - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size; - - current_key = bucket->keys(key_pos); - - // Step 1: try find and lock the desired_key. 
- do { - expected_key = desired_key; - result = current_key->compare_exchange_strong( - expected_key, static_cast(LOCKED_KEY), - cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed); - vote = g.ballot(result); - if (vote) { - src_lane = __ffs(vote) - 1; - key_pos = g.shfl(key_pos, src_lane); - return OccupyResult::DUPLICATE; - } - vote = g.ballot(expected_key == static_cast(EMPTY_KEY)); - if (vote) return OccupyResult::REFUSED; - vote = g.ballot(expected_key == static_cast(LOCKED_KEY)); - } while (vote != 0); - } - return OccupyResult::REFUSED; -} - -template -__forceinline__ __device__ Bucket* get_key_position( - Bucket* __restrict buckets, const K key, size_t& bkt_idx, - size_t& start_idx, const size_t buckets_num, const size_t bucket_max_size) { - const uint32_t hashed_key = Murmur3HashDevice(key); - const size_t global_idx = hashed_key % (buckets_num * bucket_max_size); - bkt_idx = global_idx / bucket_max_size; - start_idx = global_idx % bucket_max_size; - return buckets + bkt_idx; -} - -template -__forceinline__ __device__ void update_meta(Bucket* __restrict bucket, - const int key_pos, - const M* __restrict metas, - const int key_idx) { - if (metas == nullptr) { - M cur_meta = - bucket->cur_meta.fetch_add(1, cuda::std::memory_order_relaxed) + 1; - bucket->metas(key_pos)->store(cur_meta, cuda::std::memory_order_relaxed); - } else { - bucket->metas(key_pos)->store(metas[key_idx], - cuda::std::memory_order_relaxed); - } - return; -} - -template -__global__ void upsert_kernel_with_io_core( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - const V* __restrict values, const M* __restrict metas, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K insert_key = keys[key_idx]; - - if (IS_RESERVED_KEY(insert_key)) continue; - - const M insert_meta = - metas != nullptr ? 
metas[key_idx] : static_cast(MAX_META); - const V* insert_value = values + key_idx * dim; - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, insert_key, bkt_idx, start_idx, - buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_full( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } - - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - copy_vector(g, insert_value, bucket->vectors + key_pos * dim, - dim); - if (g.thread_rank() == src_lane) { - update_meta(bucket, key_pos, metas, key_idx); - (bucket->keys(key_pos)) - ->store(insert_key, cuda::std::memory_order_relaxed); - } - } -} - -template -__global__ void upsert_and_evict_kernel_with_io_core( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - const V* __restrict values, const M* __restrict metas, - K* __restrict evicted_keys, V* __restrict evicted_values, - M* __restrict evicted_metas, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - const size_t key_idx = t / TILE_SIZE; - - const K insert_key = keys[key_idx]; - - if (IS_RESERVED_KEY(insert_key)) continue; - - const M insert_meta = - metas != nullptr ? 
metas[key_idx] : static_cast(MAX_META); - const V* insert_value = values + key_idx * dim; - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, insert_key, bkt_idx, start_idx, - buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_full( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) { - copy_vector(g, insert_value, evicted_values + key_idx * dim, - dim); - continue; - } - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - if (occupy_result == OccupyResult::EVICT) { - if (g.thread_rank() == src_lane) { - evicted_keys[key_idx] = evicted_key; - } - if (metas != nullptr) { - evicted_metas[key_idx] = metas[key_idx]; - } - copy_vector(g, bucket->vectors + key_pos * dim, - evicted_values + key_idx * dim, dim); - } - - copy_vector(g, insert_value, bucket->vectors + key_pos * dim, - dim); - if (g.thread_rank() == src_lane) { - update_meta(bucket, key_pos, metas, key_idx); - (bucket->keys(key_pos)) - ->store(insert_key, cuda::std::memory_order_relaxed); - } - } -} - -template -struct SelectUpsertKernelWithIO { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* __restrict keys, - const V* __restrict values, - const M* __restrict metas) { - if (load_factor <= 0.5) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - upsert_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - - } else if (load_factor <= 0.875) { - const unsigned int tile_size = 8; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - upsert_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } else { - const unsigned int tile_size = 32; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - upsert_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } - return; - } -}; - -template -struct SelectUpsertAndEvictKernelWithIO { - static void execute_kernel( - const float& load_factor, const int& block_size, - const size_t bucket_max_size, const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, const K* __restrict keys, - const V* __restrict values, const M* __restrict metas, - K* __restrict evicted_keys, V* __restrict evicted_values, - M* __restrict evicted_metas) { - if (load_factor <= 0.5) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = 
SAFE_GET_GRID_SIZE(N, block_size); - upsert_and_evict_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, - evicted_keys, evicted_values, evicted_metas, N); - - } else if (load_factor <= 0.875) { - const unsigned int tile_size = 8; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - upsert_and_evict_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, - evicted_keys, evicted_values, evicted_metas, N); - - } else { - const unsigned int tile_size = 32; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - upsert_and_evict_kernel_with_io_core - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, - evicted_keys, evicted_values, evicted_metas, N); - } - return; - } -}; - -/* Upsert with the end-user specified meta. - */ -template -__global__ void upsert_kernel(const Table* __restrict table, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - const K* __restrict keys, V** __restrict vectors, - const M* __restrict metas, - int* __restrict src_offset, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K insert_key = keys[key_idx]; - if (IS_RESERVED_KEY(insert_key)) continue; - - const M insert_meta = - metas != nullptr ? metas[key_idx] : static_cast(MAX_META); - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, insert_key, bkt_idx, start_idx, - buckets_num, bucket_max_size); - - if (src_offset != nullptr && g.thread_rank() == 0) { - *(src_offset + key_idx) = key_idx; - } - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_vacant( - g, bucket, insert_key, insert_meta, evicted_key, start_idx, key_pos, - src_lane, bucket_max_size); - } - - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - if (g.thread_rank() == src_lane) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - update_meta(bucket, key_pos, metas, key_idx); - (bucket->keys(key_pos)) - ->store(insert_key, cuda::std::memory_order_relaxed); - } - } -} - -/* Accum kernel with customized metas. 
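The `SelectUpsertKernelWithIO` / `SelectUpsertAndEvictKernelWithIO` helpers above pick a cooperative-group tile size from the current load factor before launching. A condensed host-side sketch of that dispatch policy; the struct and function names are mine, and `SAFE_GET_GRID_SIZE` is assumed to be essentially a guarded ceiling division.

```cpp
// Illustrative sketch only: the load-factor -> tile-size policy used by the
// Select*KernelWithIO helpers above.
#include <cstddef>

struct LaunchPlan {
  unsigned tile_size;  // lanes cooperating on one key
  size_t grid_size;    // blocks to launch
  size_t n_threads;    // N = n * tile_size, the value passed to the kernel
};

inline LaunchPlan plan_upsert(float load_factor, size_t n, int block_size) {
  // Sparse tables get small tiles (more keys in flight per block); near-full
  // tables scan whole buckets, so a full 32-lane tile is used.
  unsigned tile =
      (load_factor <= 0.5f) ? 4u : (load_factor <= 0.875f) ? 8u : 32u;
  size_t N = n * tile;
  size_t grid = (N + block_size - 1) / block_size;  // assumed SAFE_GET_GRID_SIZE
  return {tile, grid, N};
}
```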
- */ -template -__global__ void accum_kernel( - const Table* __restrict table, const K* __restrict keys, - V** __restrict vectors, const M* __restrict metas, - const bool* __restrict existed, Bucket* __restrict buckets, - int* __restrict buckets_size, const size_t bucket_max_size, - const size_t buckets_num, int* __restrict src_offset, - bool* __restrict status, size_t N) { - const size_t dim = table->dim; - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int key_pos = -1; - int local_size = 0; - bool local_found = false; - unsigned found_or_empty_vote = 0; - - size_t key_idx = t / TILE_SIZE; - K insert_key = *(keys + key_idx); - - if (IS_RESERVED_KEY(insert_key)) continue; - - K hashed_key = Murmur3HashDevice(insert_key); - size_t global_idx = hashed_key % (buckets_num * bucket_max_size); - size_t bkt_idx = global_idx / bucket_max_size; - size_t start_idx = global_idx % bucket_max_size; - - int src_lane = -1; - - Bucket* bucket = buckets + bkt_idx; - lock(g, table->locks[bkt_idx]); - if (rank == 0 && src_offset != nullptr) { - *(src_offset + key_idx) = key_idx; - } - - for (uint32_t tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - size_t key_offset = - (start_idx + tile_offset + rank) & (bucket_max_size - 1); - K current_key = - bucket->keys(key_offset)->load(cuda::std::memory_order_relaxed); - found_or_empty_vote = g.ballot(current_key == static_cast(EMPTY_KEY) || - insert_key == current_key); - if (found_or_empty_vote) { - src_lane = __ffs(found_or_empty_vote) - 1; - key_pos = (start_idx + tile_offset + src_lane) & (bucket_max_size - 1); - local_size = buckets_size[bkt_idx]; - if (rank == src_lane) { - if (current_key == insert_key) { - local_found = true; - *(status + key_idx) = local_found; - } - if (local_found == existed[key_idx]) { - (bucket->keys(key_pos)) - ->store(insert_key, cuda::std::memory_order_relaxed); - if (!local_found) { - buckets_size[bkt_idx]++; - local_size++; - } - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - update_meta(bucket, key_pos, metas, key_idx); - } - } - local_size = g.shfl(local_size, src_lane); - if (local_size >= bucket_max_size) { - refresh_bucket_meta(g, bucket, bucket_max_size); - } - break; - } - } - if (!found_or_empty_vote) { - if (rank == (bucket->min_pos % TILE_SIZE)) { - key_pos = bucket->min_pos; - (bucket->keys(key_pos)) - ->store(insert_key, cuda::std::memory_order_relaxed); - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - update_meta(bucket, key_pos, metas, key_idx); - } - refresh_bucket_meta(g, bucket, bucket_max_size); - } - unlock(g, table->locks[bkt_idx]); - } -} - -/* lookup with IO operation. This kernel is - * usually used for the pure HBM mode for better performance. 
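The `existed`/`status` pair recorded by `accum_kernel` above is later consumed by `write_with_accum_kernel`: a value is only touched when the key's actual presence matches the caller's expectation, and it is either accumulated or assigned depending on that presence. A small host-side emulation of that gating (dim collapsed to 1, arrays and numbers hypothetical):

```cpp
// Illustrative sketch only: host emulation of the existed/status gating used
// by accum_kernel and write_with_accum_kernel above.
#include <cstdio>

int main() {
  const int n = 3;
  bool  existed[n] = {true,  false, true };  // caller's expectation per key
  bool  status[n]  = {true,  false, false};  // what the table actually found
  float table[n]   = {10.f,  0.f,   7.f  };  // value slot behind each key
  float delta[n]   = {1.f,   5.f,   9.f  };

  for (int i = 0; i < n; ++i) {
    if (existed[i] != status[i]) continue;   // expectation mismatch: skip key
    if (status[i]) table[i] += delta[i];     // key existed: accumulate delta
    else           table[i]  = delta[i];     // key was new: assign the value
  }
  std::printf("%g %g %g\n", table[0], table[1], table[2]);  // 11 5 7
}
```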
- */ -template -__global__ void lookup_kernel_with_io(const Table* __restrict table, - const size_t bucket_max_size, - const size_t buckets_num, - const size_t dim, - const K* __restrict keys, - V* __restrict values, M* __restrict metas, - bool* __restrict found, size_t N) { - int* buckets_size = table->buckets_size; - Bucket* buckets = table->buckets; - - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_idx = t / TILE_SIZE; - - const K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; - - V* find_value = values + key_idx * dim; - - int key_pos = -1; - int src_lane = -1; - size_t bkt_idx = 0; - size_t start_idx = 0; - - Bucket* bucket = get_key_position( - buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); - - const int bucket_size = buckets_size[bkt_idx]; - if (bucket_size >= bucket_max_size) { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - } - - OccupyResult occupy_result{OccupyResult::INITIAL}; - occupy_result = find_without_lock( - g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); - - if (occupy_result == OccupyResult::DUPLICATE) { - copy_vector(g, bucket->vectors + key_pos * dim, find_value, - dim); - if (rank == src_lane) { - if (metas != nullptr) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - if (found != nullptr) { - *(found + key_idx) = true; - } - } - } - } -} - -template -struct SelectLookupKernelWithIO { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* __restrict keys, V* __restrict values, - M* __restrict metas, bool* __restrict found) { - if (load_factor <= 0.75) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - lookup_kernel_with_io - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } else { - const unsigned int tile_size = 16; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - lookup_kernel_with_io - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } - return; - } -}; - -/* lookup kernel. 
- */ -template -__global__ void lookup_kernel(const Table* __restrict table, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - const K* __restrict keys, V** __restrict values, - M* __restrict metas, bool* __restrict found, - int* __restrict dst_offset, size_t N) { - int* buckets_size = table->buckets_size; - Bucket* buckets = table->buckets; - - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_idx = t / TILE_SIZE; - - const K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; - - int key_pos = -1; - int src_lane = -1; - size_t bkt_idx = 0; - size_t start_idx = 0; - - Bucket* bucket = get_key_position( - buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); - - const int bucket_size = buckets_size[bkt_idx]; - if (bucket_size >= bucket_max_size) { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - } - - if (dst_offset != nullptr && rank == 0) { - *(dst_offset + key_idx) = key_idx; - } - - OccupyResult occupy_result{OccupyResult::INITIAL}; - occupy_result = find_without_lock( - g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); - - if (occupy_result == OccupyResult::DUPLICATE) { - if (rank == src_lane) { - *(values + key_idx) = (bucket->vectors + key_pos * dim); - if (metas != nullptr) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - if (found != nullptr) { - *(found + key_idx) = true; - } - } - } else { - if (rank == 0) { - *(values + key_idx) = nullptr; - } - } - } -} - -/* lookup with IO operation. This kernel is - * usually used for the pure HBM mode for better performance. - */ -template -__global__ void lookup_ptr_kernel(const Table* __restrict table, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - const K* __restrict keys, - V** __restrict values, M* __restrict metas, - bool* __restrict found, size_t N) { - int* buckets_size = table->buckets_size; - Bucket* buckets = table->buckets; - - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_idx = t / TILE_SIZE; - - const K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; - - int key_pos = -1; - int src_lane = -1; - size_t bkt_idx = 0; - size_t start_idx = 0; - - Bucket* bucket = get_key_position( - buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); - - const int bucket_size = buckets_size[bkt_idx]; - if (bucket_size >= bucket_max_size) { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - } - - OccupyResult occupy_result{OccupyResult::INITIAL}; - occupy_result = find_without_lock( - g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size); - - if (occupy_result == OccupyResult::DUPLICATE) { - if (rank == src_lane) { - values[key_idx] = bucket->vectors + key_pos * dim; - if (metas != nullptr) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - if (found != nullptr) { - *(found + key_idx) = true; - } - } - } - } -} - -template -struct SelectLookupPtrKernel { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* 
__restrict keys, V** __restrict values, - M* __restrict metas, bool* __restrict found) { - if (load_factor <= 0.75) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - lookup_ptr_kernel - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } else { - const unsigned int tile_size = 16; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - lookup_ptr_kernel - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } - return; - } -}; - -/* Clear all key-value in the table. */ -template -__global__ void clear_kernel(Table* __restrict table, size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - const size_t bucket_max_size = table->bucket_max_size; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int key_idx = t % bucket_max_size; - int bkt_idx = t / bucket_max_size; - Bucket* bucket = &(table->buckets[bkt_idx]); - - (bucket->keys(key_idx)) - ->store(static_cast(EMPTY_KEY), cuda::std::memory_order_relaxed); - if (key_idx == 0) { - table->buckets_size[bkt_idx] = 0; - } - } -} - -/* Remove specified keys. */ -template -__global__ void remove_kernel(const Table* __restrict table, - const K* __restrict keys, - Bucket* __restrict buckets, - int* __restrict buckets_size, - const size_t bucket_max_size, - const size_t buckets_num, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int rank = g.thread_rank(); - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_idx = t / TILE_SIZE; - K find_key = keys[key_idx]; - if (IS_RESERVED_KEY(find_key)) continue; - - int key_pos = -1; - - size_t bkt_idx = 0; - size_t start_idx = 0; - uint32_t tile_offset = 0; - - Bucket* bucket = get_key_position( - buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size); - - unsigned found_vote = 0; -#pragma unroll - for (tile_offset = 0; tile_offset < bucket_max_size; - tile_offset += TILE_SIZE) { - key_pos = (start_idx + tile_offset + rank) & (bucket_max_size - 1); - - const K current_key = - (bucket->keys(key_pos))->load(cuda::std::memory_order_relaxed); - - found_vote = g.ballot(find_key == current_key); - if (found_vote) { - break; - } - - if (g.any(current_key == static_cast(EMPTY_KEY))) { - break; - } - } - - if (found_vote) { - const int src_lane = __ffs(found_vote) - 1; - - if (g.thread_rank() == src_lane) { - const int key_pos = - (start_idx + tile_offset + src_lane) & (bucket_max_size - 1); - (bucket->keys(key_pos)) - ->store(static_cast(RECLAIM_KEY), - cuda::std::memory_order_relaxed); - (bucket->metas(key_pos)) - ->store(static_cast(EMPTY_META), - cuda::std::memory_order_relaxed); - atomicSub(&buckets_size[bkt_idx], 1); - } - break; - } - } -} - -/* Remove specified keys which match the Predict. 
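The remove overload that follows erases keys matching a caller-supplied predicate; its call site, `pred(current_key, current_meta, pattern, threshold)`, suggests `EraseIfPredictInternal` is a device predicate over (key, meta, pattern, threshold). A hedged sketch of one possible predicate, evicting entries whose meta (e.g., a timestamp or frequency) falls below the threshold; the exact typedef is not shown in this hunk.

```cpp
// Illustrative sketch only: one possible predicate matching the call shape
// pred(key, meta, pattern, threshold) used by the erase-if kernel below.
template <class K, class M>
__device__ __forceinline__ bool erase_if_stale(const K& /*key*/, const M& meta,
                                               const K& /*pattern*/,
                                               const M& threshold) {
  // Treat meta as a timestamp/frequency: drop entries below the threshold,
  // regardless of the key pattern.
  return meta < threshold;
}
```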
*/ -template -__global__ void remove_kernel(const Table* __restrict table, - const EraseIfPredictInternal pred, - const K pattern, const M threshold, - size_t* __restrict count, - Bucket* __restrict buckets, - int* __restrict buckets_size, - const size_t bucket_max_size, - const size_t buckets_num, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - uint32_t bkt_idx = t; - uint32_t key_pos = 0; - - Bucket* bucket = buckets + bkt_idx; - - K current_key = 0; - M current_meta = 0; - uint32_t key_offset = 0; - while (key_offset < bucket_max_size) { - current_key = - bucket->keys(key_offset)->load(cuda::std::memory_order_relaxed); - current_meta = - bucket->metas(key_offset)->load(cuda::std::memory_order_relaxed); - if (!IS_RESERVED_KEY(current_key)) { - if (pred(current_key, current_meta, pattern, threshold)) { - atomicAdd(count, 1); - key_pos = key_offset; - (bucket->keys(key_pos)) - ->store(static_cast(RECLAIM_KEY), - cuda::std::memory_order_relaxed); - (bucket->metas(key_pos)) - ->store(static_cast(EMPTY_META), - cuda::std::memory_order_relaxed); - atomicSub(&buckets_size[bkt_idx], 1); - } else { - key_offset++; - } - } else { - key_offset++; - } - } - } -} - -/* Dump with meta. */ -template -inline std::tuple dump_kernel_shared_memory_size( - const size_t available_shared_memory) { - const size_t block_size{std::min( - available_shared_memory / 2 / sizeof(KVM), UINT64_C(1024))}; - MERLIN_CHECK( - block_size > 0, - "[HierarchicalKV] block_size <= 0, the K-V-M size may be too large!"); - - return {block_size * sizeof(KVM), block_size}; -} - -template -__global__ void dump_kernel(const Table* __restrict table, K* d_key, - V* __restrict d_val, M* __restrict d_meta, - const size_t offset, const size_t search_length, - size_t* d_dump_counter) { - extern __shared__ unsigned char s[]; - KVM* const block_tuples{reinterpret_cast*>(s)}; - - const size_t bucket_max_size{table->bucket_max_size}; - const size_t dim{table->dim}; - - __shared__ size_t block_acc; - __shared__ size_t global_acc; - - const size_t tid{blockIdx.x * blockDim.x + threadIdx.x}; - - if (threadIdx.x == 0) { - block_acc = 0; - } - __syncthreads(); - - if (tid < search_length) { - Bucket* const bucket{ - &table->buckets[(tid + offset) / bucket_max_size]}; - - const int key_idx{static_cast((tid + offset) % bucket_max_size)}; - const K key{(bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed)}; - - if (!IS_RESERVED_KEY(key)) { - size_t local_index{atomicAdd(&block_acc, 1)}; - block_tuples[local_index] = { - key, &bucket->vectors[key_idx * dim], - bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed)}; - } - } - __syncthreads(); - - if (threadIdx.x == 0) { - global_acc = atomicAdd(d_dump_counter, block_acc); - } - __syncthreads(); - - if (threadIdx.x < block_acc) { - const KVM& tuple{block_tuples[threadIdx.x]}; - - const size_t j{global_acc + threadIdx.x}; - d_key[j] = tuple.key; - for (int i{0}; i < dim; ++i) { - d_val[j * dim + i] = tuple.value[i]; - } - if (d_meta != nullptr) { - d_meta[j] = tuple.meta; - } - } -} - -/* Dump with meta. 
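`dump_kernel_shared_memory_size` above bounds the block size by how many tuples fit in half of the available shared memory. A host-side sketch of turning that into a launch configuration; the `KVM` layout `{K key; V* value; M meta;}` is assumed from the kernel's usage, and the shared-memory figure is hypothetical.

```cpp
// Illustrative sketch only: compute a dump launch configuration the same way
// dump_kernel_shared_memory_size above does. KVM layout is assumed.
#include <algorithm>
#include <cstdint>
#include <cstdio>

template <class K, class V, class M>
struct KVM { K key; V* value; M meta; };

int main() {
  const size_t available_shared_memory = 48 << 10;  // e.g. 48 KB per block
  using Tuple = KVM<uint64_t, float, uint64_t>;
  const size_t block_size =
      std::min(available_shared_memory / 2 / sizeof(Tuple), size_t{1024});
  const size_t shared_bytes = block_size * sizeof(Tuple);
  const size_t search_length = 1 << 20;              // slots to scan
  const size_t grid_size = (search_length + block_size - 1) / block_size;
  std::printf("block=%zu shared=%zuB grid=%zu\n",
              block_size, shared_bytes, grid_size);
}
```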
*/ -template class PredFunctor> -__global__ void dump_kernel(const Table* __restrict table, - const K pattern, const M threshold, K* d_key, - V* __restrict d_val, M* __restrict d_meta, - const size_t offset, const size_t search_length, - size_t* d_dump_counter) { - extern __shared__ unsigned char s[]; - const size_t bucket_max_size = table->bucket_max_size; - const size_t dim = table->dim; - K* smem = (K*)s; - K* block_result_key = smem; - V* block_result_val = (V*)&(smem[blockDim.x]); - M* block_result_meta = (M*)&(block_result_val[blockDim.x * dim]); - __shared__ size_t block_acc; - __shared__ size_t global_acc; - PredFunctor fn; - - const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (threadIdx.x == 0) { - block_acc = 0; - } - __syncthreads(); - - if (tid < search_length) { - int bkt_idx = (tid + offset) / bucket_max_size; - int key_idx = (tid + offset) % bucket_max_size; - Bucket* bucket = &(table->buckets[bkt_idx]); - - const K key = - (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed); - M meta = bucket->metas(key_idx)->load(cuda::std::memory_order_relaxed); - - if (key != static_cast(EMPTY_KEY) && - fn(key, meta, pattern, threshold)) { - size_t local_index = atomicAdd(&block_acc, 1); - block_result_key[local_index] = key; - for (int i = 0; i < dim; i++) { - atomicExch(&(block_result_val[local_index * dim + i]), - bucket->vectors[key_idx * dim + i]); - } - if (d_meta != nullptr) { - block_result_meta[local_index] = meta; - } - } - } - __syncthreads(); - - if (threadIdx.x == 0) { - global_acc = atomicAdd(d_dump_counter, block_acc); - } - __syncthreads(); - - if (threadIdx.x < block_acc) { - d_key[global_acc + threadIdx.x] = block_result_key[threadIdx.x]; - for (int i = 0; i < dim; i++) { - d_val[(global_acc + threadIdx.x) * dim + i] = - block_result_val[threadIdx.x * dim + i]; - } - if (d_meta != nullptr) { - d_meta[global_acc + threadIdx.x] = block_result_meta[threadIdx.x]; - } - } -} - -/* If founds[i] = true, read data from corresponding address of - * table_value_addrs and write to param_values; if founds[i] = false, write data - * from param_values to corresponding address of table_value_addrs. usually - * called by find_or_insert kernel. - */ -template -void read_or_write_by_cpu(V** __restrict table_value_addrs, - V* __restrict param_values, - const int* __restrict offset, const bool* founds, - size_t dim, int N, int n_worker = 16) { - std::vector thds; - if (n_worker < 1) n_worker = 1; - - auto functor = [founds, dim](V** __restrict table_value_addrs, - V* __restrict param_values, - const int* __restrict offset, int handled_size, - int trunk_size) -> void { - for (int i = handled_size; i < handled_size + trunk_size; i++) { - if (table_value_addrs[i] != nullptr) { - if (founds[offset[i]]) { - memcpy(param_values + offset[i] * dim, table_value_addrs[i], - sizeof(V) * dim); - } else { - memcpy(table_value_addrs[i], param_values + offset[i] * dim, - sizeof(V) * dim); - } - } - } - }; - - int32_t trunk_size_floor = N / n_worker; - int32_t trunk_size_remain = N % n_worker; - int32_t n_worker_used = trunk_size_floor == 0 ? 
trunk_size_remain : n_worker; - - size_t handled_size = 0; - for (int i = 0; i < n_worker_used; i++) { - int32_t cur_trunk_size = trunk_size_floor; - if (trunk_size_remain != 0) { - cur_trunk_size += 1; - trunk_size_remain--; - } - thds.push_back(std::thread(functor, table_value_addrs, param_values, offset, - handled_size, cur_trunk_size)); - handled_size += cur_trunk_size; - } - - for (int i = 0; i < n_worker_used; i++) { - thds[i].join(); - } -} - -/* - * find or insert with IO operation. This kernel is - * usually used for the pure HBM mode for better performance. - */ -template -__global__ void find_or_insert_kernel_with_io( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - V* __restrict values, M* __restrict metas, const size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - const size_t key_idx = t / TILE_SIZE; - - const K find_or_insert_key = keys[key_idx]; - - if (IS_RESERVED_KEY(find_or_insert_key)) continue; - - const M find_or_insert_meta = - metas != nullptr ? metas[key_idx] : static_cast(MAX_META); - V* find_or_insert_value = values + key_idx * dim; - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, find_or_insert_key, bkt_idx, - start_idx, buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_full( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } - - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - if (occupy_result == OccupyResult::DUPLICATE) { - copy_vector(g, bucket->vectors + key_pos * dim, - find_or_insert_value, dim); - if (metas != nullptr && g.thread_rank() == src_lane) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - } else { - copy_vector(g, find_or_insert_value, - bucket->vectors + key_pos * dim, dim); - if (g.thread_rank() == src_lane) { - update_meta(bucket, key_pos, metas, key_idx); - } - } - - if (g.thread_rank() == src_lane) { - (bucket->keys(key_pos)) - ->store(find_or_insert_key, cuda::std::memory_order_relaxed); - } - } -} - -template -struct SelectFindOrInsertKernelWithIO { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* __restrict keys, V* __restrict values, - M* __restrict metas) { - if (load_factor <= 0.75) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = 
SAFE_GET_GRID_SIZE(N, block_size); - find_or_insert_kernel_with_io - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } else { - const unsigned int tile_size = 32; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - find_or_insert_kernel_with_io - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } - return; - } -}; - -/* find or insert with the end-user specified meta. - */ -template -__global__ void find_or_insert_kernel( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - V** __restrict vectors, M* __restrict metas, bool* __restrict found, - int* __restrict keys_index, const size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K find_or_insert_key = keys[key_idx]; - - if (IS_RESERVED_KEY(find_or_insert_key)) continue; - - const M find_or_insert_meta = - metas != nullptr ? metas[key_idx] : static_cast(MAX_META); - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, find_or_insert_key, bkt_idx, - start_idx, buckets_num, bucket_max_size); - - if (g.thread_rank() == 0) { - *(keys_index + key_idx) = key_idx; - } - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_full( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } - - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - if (occupy_result == OccupyResult::DUPLICATE) { - if (g.thread_rank() == src_lane) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - - if (found != nullptr) { - *(found + key_idx) = true; - } - - if (metas != nullptr) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - } - } else { - if (g.thread_rank() == src_lane) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - update_meta(bucket, key_pos, metas, key_idx); - } - } - - if (g.thread_rank() == src_lane) { - (bucket->keys(key_pos)) - ->store(find_or_insert_key, cuda::std::memory_order_relaxed); - } - } -} - -/* find or insert with the end-user specified meta. 
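`find_or_insert_kernel` above only resolves a per-key value address and a `found` flag; the actual value traffic happens afterwards in `read_or_write_by_cpu` / `read_or_write_kernel`. A minimal host-side sketch of that second phase (plain pointers, no CUDA, values hypothetical): found keys are read back into the parameter buffer, new keys are written into the table.

```cpp
// Illustrative sketch only: the read-or-write phase that follows
// find_or_insert_kernel, emulated on host memory (dim = 2).
#include <cstdio>

int main() {
  const int n = 2, dim = 2;
  float slot0[dim] = {1.f, 2.f};             // value already in the table
  float slot1[dim] = {0.f, 0.f};             // freshly inserted, still empty
  float* table_value_addrs[n] = {slot0, slot1};
  bool   founds[n] = {true, false};
  float  param_values[n * dim] = {9.f, 9.f, 3.f, 4.f};

  for (int i = 0; i < n; ++i) {
    for (int d = 0; d < dim; ++d) {
      if (founds[i])  // found: copy table -> parameters (a lookup)
        param_values[i * dim + d] = table_value_addrs[i][d];
      else            // not found: copy parameters -> table (an insert)
        table_value_addrs[i][d] = param_values[i * dim + d];
    }
  }
  std::printf("param[0]={%g,%g} slot1={%g,%g}\n",
              param_values[0], param_values[1], slot1[0], slot1[1]);  // {1,2} {3,4}
}
```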
- */ -template -__global__ void find_ptr_or_insert_kernel( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - V** __restrict vectors, M* __restrict metas, bool* __restrict found, - const size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K find_or_insert_key = keys[key_idx]; - - if (IS_RESERVED_KEY(find_or_insert_key)) continue; - - const M find_or_insert_meta = - metas != nullptr ? metas[key_idx] : static_cast(MAX_META); - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - K evicted_key; - - Bucket* bucket = - get_key_position(table->buckets, find_or_insert_key, bkt_idx, - start_idx, buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - do { - if (bucket_size < bucket_max_size) { - occupy_result = find_and_lock_when_vacant( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } else { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - occupy_result = find_and_lock_when_full( - g, bucket, find_or_insert_key, find_or_insert_meta, evicted_key, - start_idx, key_pos, src_lane, bucket_max_size); - } - - occupy_result = g.shfl(occupy_result, src_lane); - } while (occupy_result == OccupyResult::CONTINUE); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if ((occupy_result == OccupyResult::OCCUPIED_EMPTY || - occupy_result == OccupyResult::OCCUPIED_RECLAIMED) && - g.thread_rank() == src_lane) { - atomicAdd(&(buckets_size[bkt_idx]), 1); - } - - if (occupy_result == OccupyResult::DUPLICATE) { - if (g.thread_rank() == src_lane) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - *(found + key_idx) = true; - if (metas != nullptr) { - *(metas + key_idx) = - bucket->metas(key_pos)->load(cuda::std::memory_order_relaxed); - } - } - } else { - if (g.thread_rank() == src_lane) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - *(found + key_idx) = false; - update_meta(bucket, key_pos, metas, key_idx); - } - } - - if (g.thread_rank() == src_lane) { - (bucket->keys(key_pos)) - ->store(find_or_insert_key, cuda::std::memory_order_relaxed); - } - } -} - -template -struct SelectFindOrInsertPtrKernel { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* __restrict keys, V** __restrict values, - M* __restrict metas, bool* __restrict found) { - if (load_factor <= 0.5) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - find_ptr_or_insert_kernel - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } else if (load_factor <= 0.875) { - const unsigned int tile_size = 8; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - find_ptr_or_insert_kernel - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } else { - const unsigned int tile_size = 32; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, 
block_size); - find_ptr_or_insert_kernel - <<>>(table, bucket_max_size, - buckets_num, dim, keys, values, - metas, found, N); - } - return; - } -}; - -/* Read the data from address of table_value_addrs to corresponding position - in param_value if mask[i] is true, otherwise write data to table_value_addrs - form param_value, - usually called by find_or_insert kernel. - - `table_value_addrs`: A pointer of pointer of V which should be on HBM, - but each value (a pointer of V) could point to a - memory on HBM or HMEM. - `param_value`: A continue memory pointer with Vector - which should be HBM. - `mask`: One for each `param_value`. If true, reading from table_value_addrs, - or false writing table_value_addrs from param_value. - `param_key_index`: N values from address of table_value_addrs are mapped to - param_values according to param_key_index. - `dim`: the dim of value. - `N`: The number of vectors needed to be read. -*/ -template -__global__ void read_or_write_kernel(V** __restrict table_value_addrs, - V* __restrict param_values, - const bool* mask, - const int* __restrict param_key_index, - const size_t dim, const size_t N) { - size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x; - - for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) { - int vec_index = int(t / dim); - int dim_index = t % dim; - int real_key_index = - param_key_index != nullptr ? param_key_index[vec_index] : vec_index; - - /// if found, read the value form table, otherwise write it - if (table_value_addrs[vec_index] != nullptr) { - /// find - if (mask[real_key_index]) { - param_values[real_key_index * dim + dim_index] = - table_value_addrs[vec_index][dim_index]; - } - /// insert - else { - table_value_addrs[vec_index][dim_index] = - param_values[real_key_index * dim + dim_index]; - } - } - } -} - -/* - * update with IO operation. This kernel is - * usually used for the pure HBM mode for better performance. 
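The gather/scatter rule documented for read_or_write_kernel can be restated as a plain CPU loop. The following reference sketch (illustrative names, no CUDA) mirrors the kernel's per-element indexing:

#include <cstddef>
#include <vector>

// CPU reference for the read-or-write rule above: when the key was found
// (mask true) the value is copied table -> params; otherwise the new value
// is copied params -> table. A null table address means no slot was
// assigned, so the element is skipped. Names are illustrative.
template <typename V>
void read_or_write_reference(const std::vector<V*>& table_value_addrs,
                             std::vector<V>& param_values,
                             const std::vector<bool>& mask,
                             const std::vector<int>& param_key_index,
                             std::size_t dim) {
  for (std::size_t vec = 0; vec < table_value_addrs.size(); ++vec) {
    if (table_value_addrs[vec] == nullptr) continue;
    const int key = param_key_index.empty() ? static_cast<int>(vec)
                                            : param_key_index[vec];
    for (std::size_t d = 0; d < dim; ++d) {
      if (mask[key]) {
        param_values[key * dim + d] = table_value_addrs[vec][d];  // find
      } else {
        table_value_addrs[vec][d] = param_values[key * dim + d];  // insert
      }
    }
  }
}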
- */ -template -__global__ void update_kernel_with_io( - const Table* __restrict table, const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, const K* __restrict keys, - const V* __restrict values, const M* __restrict metas, const size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K update_key = keys[key_idx]; - - if (IS_RESERVED_KEY(update_key)) continue; - - const V* update_value = values + key_idx * dim; - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - - Bucket* bucket = - get_key_position(table->buckets, update_key, bkt_idx, start_idx, - buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - - if (bucket_size >= bucket_max_size) { - start_idx = (start_idx / TILE_SIZE) * TILE_SIZE; - } - occupy_result = find_and_lock_for_update( - g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size); - - occupy_result = g.shfl(occupy_result, src_lane); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if (occupy_result == OccupyResult::DUPLICATE) { - copy_vector(g, update_value, - bucket->vectors + key_pos * dim, dim); - if (src_lane == g.thread_rank()) { - update_meta(bucket, key_pos, metas, key_idx); - } - } - - if (g.thread_rank() == src_lane) { - (bucket->keys(key_pos)) - ->store(update_key, cuda::std::memory_order_relaxed); - } - } -} - -template -struct SelectUpdateKernelWithIO { - static void execute_kernel(const float& load_factor, const int& block_size, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - cudaStream_t& stream, const size_t& n, - const Table* __restrict table, - const K* __restrict keys, - const V* __restrict values, - const M* __restrict metas) { - if (load_factor <= 0.75) { - const unsigned int tile_size = 4; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - update_kernel_with_io - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } else { - const unsigned int tile_size = 32; - const size_t N = n * tile_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - update_kernel_with_io - <<>>( - table, bucket_max_size, buckets_num, dim, keys, values, metas, N); - } - return; - } -}; - -template -__global__ void update_kernel(const Table* __restrict table, - const size_t bucket_max_size, - const size_t buckets_num, const size_t dim, - const K* __restrict keys, V** __restrict vectors, - const M* __restrict metas, - int* __restrict src_offset, size_t N) { - auto g = cg::tiled_partition(cg::this_thread_block()); - int* buckets_size = table->buckets_size; - - for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; - t += blockDim.x * gridDim.x) { - int key_pos = -1; - size_t key_idx = t / TILE_SIZE; - - const K update_key = keys[key_idx]; - - if (IS_RESERVED_KEY(update_key)) continue; - - size_t bkt_idx = 0; - size_t start_idx = 0; - int src_lane = -1; - - Bucket* bucket = - get_key_position(table->buckets, update_key, bkt_idx, start_idx, - buckets_num, bucket_max_size); - - OccupyResult occupy_result{OccupyResult::INITIAL}; - const int bucket_size = buckets_size[bkt_idx]; - *(src_offset + key_idx) = key_idx; - - if (bucket_size >= bucket_max_size) { - start_idx = (start_idx / TILE_SIZE) * 
TILE_SIZE; - } - occupy_result = find_and_lock_for_update( - g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size); - - occupy_result = g.shfl(occupy_result, src_lane); - - if (occupy_result == OccupyResult::REFUSED) continue; - - if (g.thread_rank() == src_lane) { - if (occupy_result == OccupyResult::DUPLICATE) { - *(vectors + key_idx) = (bucket->vectors + key_pos * dim); - update_meta(bucket, key_pos, metas, key_idx); - } else { - *(vectors + key_idx) = nullptr; - } - } - - if (g.thread_rank() == src_lane) { - (bucket->keys(key_pos)) - ->store(update_key, cuda::std::memory_order_relaxed); - } - } -} - -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp deleted file mode 100644 index 2d9ebfe8b..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/debug.hpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" - -namespace nv { -namespace merlin { - -class CudaException : public std::runtime_error { - public: - CudaException(const std::string& what) : runtime_error(what) {} -}; - -inline void cuda_check_(cudaError_t val, const char* file, int line) { - if (val != cudaSuccess) { - std::ostringstream os; - os << file << ':' << line << ": CUDA error " << cudaGetErrorName(val) - << " (#" << val << "): " << cudaGetErrorString(val); - throw CudaException(os.str()); - } -} - -//#ifdef CUDA_CHECK -//#error Unexpected redfinition of CUDA_CHECK! Something is wrong. -//#endif - -#ifndef CUDA_CHECK -#define CUDA_CHECK(val) \ - do { \ - nv::merlin::cuda_check_((val), __FILE__, __LINE__); \ - } while (0) -#endif // CUDA_CHECK - -class MerlinException : public std::runtime_error { - public: - MerlinException(const std::string& what) : runtime_error(what) {} -}; - -template -inline void merlin_check_(bool cond, const Msg& msg, const char* file, - int line) { - if (!cond) { - std::ostringstream os; - os << file << ':' << line << ": HierarchicalKV error " << msg; - throw MerlinException(os.str()); - } -} - -#ifdef MERLIN_CHECK -#error Unexpected redfinition of MERLIN_CHECK! Something is wrong. 
-#endif - -#define MERLIN_CHECK(cond, msg) \ - do { \ - nv::merlin::merlin_check_((cond), (msg), __FILE__, __LINE__); \ - } while (0) - -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh deleted file mode 100644 index d01dda6d1..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/flexible_buffer.cuh +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include "utils.cuh" - -using std::cerr; -using std::endl; - -namespace nv { -namespace merlin { - -template -class FlexPinnedBuffer { - public: - FlexPinnedBuffer(const size_t size = 1) : ptr_(nullptr) { - if (!ptr_) { - size_ = size; - CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size_)); - } - } - ~FlexPinnedBuffer() { - try { - if (!ptr_) CUDA_CHECK(cudaFreeHost(ptr_)); - } catch (const nv::merlin::CudaException& e) { - cerr << "[HierarchicalKV] Failed to free FlexPinnedBuffer!" << endl; - } - } - - __inline__ T* alloc_or_reuse(const size_t size = 0) { - if (size > size_) { - CUDA_CHECK(cudaFreeHost(ptr_)); - size_ = size; - CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size_)); - } - return ptr_; - } - - private: - T* ptr_; - size_t size_; -}; - -} // namespace merlin -} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp deleted file mode 100644 index 2cfdc6ed3..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/group_lock.hpp +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http:///www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Implementing a group mutex and relative lock guard for better E2E performance: - * - Allow multiple writers (like `insert_or_assign` `assign` `insert_and_evict` etc.) - * The CUDA kernels guarantee the data consistency in this situation. - * - Allow multiple readers (like `find` 'size` etc.) - * - Not allow readers and writers to run concurrently - * - The `write_read_lock` is used for special APIs (like `reserve` `erase` `clear` etc.) 
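A usage sketch for the guards declared in this header (assuming it is reachable as merlin/group_lock.hpp): read-style APIs take a reader_shared_lock, concurrency-safe write APIs take a writer_shared_lock, and exclusive operations such as reserve/erase/clear take a write_read_lock. Class and method names below are illustrative:

// Sketch only: assumes the group_shared_mutex and lock guards defined in
// this header are visible.
#include "merlin/group_lock.hpp"

class ToyTable {
 public:
  void find_like_api() {
    // Readers run concurrently with other readers.
    nv::merlin::reader_shared_lock guard(mutex_);
    // ... launch read-only kernels ...
  }

  void insert_like_api() {
    // Writers run concurrently with other writers, never with readers;
    // the CUDA kernels guarantee data consistency among writers.
    nv::merlin::writer_shared_lock guard(mutex_);
    // ... launch update kernels ...
  }

  void reserve_like_api() {
    // Exclusive: blocks both readers and writers (reserve/erase/clear).
    nv::merlin::write_read_lock guard(mutex_);
    // ... rehash or clear ...
  }

 private:
  nv::merlin::group_shared_mutex mutex_;
};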
- */ -#include -#include -#include -#include -#include - -namespace nv { -namespace merlin { - -class group_shared_mutex { - public: - group_shared_mutex(const group_shared_mutex&) = delete; - group_shared_mutex& operator=(const group_shared_mutex&) = delete; - - group_shared_mutex() noexcept - : writer_count_(0), reader_count_(0), unique_flag_(false) {} - - void lock_read() { - for (;;) { - while (writer_count_.load(std::memory_order_acquire)) { - } - reader_count_.fetch_add(1, std::memory_order_acq_rel); - if (writer_count_.load(std::memory_order_acquire) == 0) { - break; - } - reader_count_.fetch_sub(1, std::memory_order_acq_rel); - } - } - - void unlock_read() { reader_count_.fetch_sub(1, std::memory_order_release); } - - void lock_write() { - for (;;) { - while (reader_count_.load(std::memory_order_acquire)) { - } - writer_count_.fetch_add(1, std::memory_order_acq_rel); - if (reader_count_.load(std::memory_order_acquire) == 0) { - break; - } - writer_count_.fetch_sub(1, std::memory_order_acq_rel); - } - } - - void unlock_write() { writer_count_.fetch_sub(1, std::memory_order_release); } - - void lock_write_read() { - /* Lock unique flag */ - bool expected = false; - while (!unique_flag_.compare_exchange_weak(expected, true, - std::memory_order_acq_rel)) { - expected = false; - } - - /* Ban writer */ - for (;;) { - while (writer_count_.load(std::memory_order_acquire)) { - } - reader_count_.fetch_add(1, std::memory_order_acq_rel); - if (writer_count_.load(std::memory_order_acquire) == 0) { - break; - } - reader_count_.fetch_sub(1, std::memory_order_acq_rel); - } - - /* Ban reader */ - for (;;) { - while (reader_count_.load(std::memory_order_acquire) > 1) { - } - writer_count_.fetch_add(1, std::memory_order_acq_rel); - if (reader_count_.load(std::memory_order_acquire) == 1) { - break; - } - writer_count_.fetch_sub(1, std::memory_order_acq_rel); - } - } - - void unlock_write_read() noexcept { - reader_count_.fetch_sub(1, std::memory_order_release); - writer_count_.fetch_sub(1, std::memory_order_release); - unique_flag_.store(false, std::memory_order_release); - } - - int writer_count() noexcept { - return writer_count_.load(std::memory_order_relaxed); - } - - int reader_count() noexcept { - return reader_count_.load(std::memory_order_relaxed); - } - - private: - std::atomic writer_count_; - std::atomic reader_count_; - std::atomic unique_flag_; -}; - -class reader_shared_lock { - public: - reader_shared_lock(const reader_shared_lock&) = delete; - reader_shared_lock(reader_shared_lock&&) = delete; - - reader_shared_lock& operator=(const reader_shared_lock&) = delete; - reader_shared_lock& operator=(reader_shared_lock&&) = delete; - - explicit reader_shared_lock(group_shared_mutex& mutex) : mutex_(&mutex) { - mutex_->lock_read(); - owns_ = true; - } - - explicit reader_shared_lock(group_shared_mutex& mutex, std::defer_lock_t) - : mutex_(&mutex), owns_(false) {} - - ~reader_shared_lock() { - if (owns_) { - mutex_->unlock_read(); - } - } - - void lock() noexcept { - if (!owns_) { - mutex_->lock_read(); - owns_ = true; - } - } - - bool owns_lock() const noexcept { return owns_; } - - private: - group_shared_mutex* const mutex_; - bool owns_; -}; - -class writer_shared_lock { - public: - writer_shared_lock(const writer_shared_lock&) = delete; - writer_shared_lock(writer_shared_lock&&) = delete; - - writer_shared_lock& operator=(const writer_shared_lock&) = delete; - writer_shared_lock& operator=(writer_shared_lock&&) = delete; - - explicit writer_shared_lock(group_shared_mutex& mutex) : 
mutex_(&mutex) { - mutex_->lock_write(); - owns_ = true; - } - - explicit writer_shared_lock(group_shared_mutex& mutex, std::defer_lock_t) - : mutex_(&mutex), owns_(false) {} - - ~writer_shared_lock() { - if (owns_) { - mutex_->unlock_write(); - } - } - - void lock() noexcept { - if (!owns_) { - mutex_->lock_write(); - owns_ = true; - } - } - - bool owns_lock() const noexcept { return owns_; } - - private: - group_shared_mutex* const mutex_; - bool owns_; -}; - -class write_read_lock { - public: - write_read_lock(const write_read_lock&) = delete; - write_read_lock(write_read_lock&&) = delete; - - write_read_lock& operator=(const write_read_lock&) = delete; - write_read_lock& operator=(write_read_lock&&) = delete; - - explicit write_read_lock(group_shared_mutex& mutex) : mutex_(&mutex) { - mutex_->lock_write_read(); - owns_ = true; - } - - explicit write_read_lock(group_shared_mutex& mutex, std::defer_lock_t) noexcept - : mutex_(&mutex), owns_(false) {} - - ~write_read_lock() { - if (owns_) { - mutex_->unlock_write_read(); - } - } - - void lock() { - assert(!owns_ && "[write_read_lock] trying to lock twice!"); - mutex_->lock_write_read(); - owns_ = true; - } - - bool owns_lock() const noexcept { return owns_; } - - private: - group_shared_mutex* const mutex_; - bool owns_; -}; - -} // namespace merlin -} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh deleted file mode 100644 index 6df875688..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/initializers.cuh +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include "curand_philox4x32_x.h" -#include "types.cuh" -#include "utils.cuh" - -namespace nv { -namespace merlin { -namespace initializers { - -inline void cuda_rand_check_(curandStatus_t val, const char* file, int line) { - if (val != CURAND_STATUS_SUCCESS) { - throw CudaException(std::string(file) + ":" + std::to_string(line) + - ": CURAND error " + std::to_string(val)); - } -} - -#define CURAND_CHECK(val) \ - { nv::merlin::initializers::cuda_rand_check_((val), __FILE__, __LINE__); } - -template -void zeros(T* d_data, const size_t len, cudaStream_t stream) { - CUDA_CHECK(cudaMemsetAsync(d_data, 0, len, stream)); -} - -template -void random_normal(T* d_data, const size_t len, cudaStream_t stream, - const T mean = 0.0, const T stddev = 0.05, - const unsigned long long seed = 2022ULL) { - curandGenerator_t generator; - CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); - CURAND_CHECK(curandGenerateNormal(generator, d_data, len, mean, stddev)); -} - -template -__global__ void adjust_max_min(T* d_data, const T minval, const T maxval, - const size_t N) { - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < N) { - d_data[tid] = - d_data[tid] * (maxval - minval) + (0.5 * (maxval + minval) - 0.5); - } -} - -template -void random_uniform(T* d_data, const size_t len, cudaStream_t stream, - const T minval = 0.0, const T maxval = 1.0, - const unsigned long long seed = 2022ULL) { - curandGenerator_t generator; - - CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); - - int N = len; - int block_size = 256; - int grid_size = (N + block_size - 1) / block_size; - CURAND_CHECK(curandGenerateUniform(generator, d_data, N)); - adjust_max_min - <<>>(d_data, minval, maxval, N); -} - -template -__global__ void init_states(curandStatePhilox4_32_10_t* states, - const unsigned long long seed, const size_t N) { - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < N) { - curand_init(seed, tid, 0, &states[tid]); - } -} - -template -__global__ void make_truncated_normal(T* d_data, - curandStatePhilox4_32_10_t* states, - const size_t N) { - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < N) { - constexpr T truncated_val = T(2.0); - while (fabsf(d_data[tid]) > truncated_val) { - d_data[tid] = curand_normal(&states[tid]); - } - } -} - -template -void truncated_normal(T* d_data, const size_t len, cudaStream_t stream, - const T minval = 0.0, const T maxval = 1.0, - const unsigned long long seed = 2022ULL) { - curandGenerator_t generator; - - CURAND_CHECK(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(generator, seed)); - - int N = len; - int block_size = 256; - int grid_size = (N + block_size - 1) / block_size; - curandStatePhilox4_32_10_t* d_states; - CUDA_CHECK(cudaMallocAsync(&d_states, N, stream)); - - init_states<<>>(d_states, seed, N); - - make_truncated_normal - <<>>(d_data, d_states, N); - - adjust_max_min - <<>>(d_data, minval, maxval, N); - - CUDA_CHECK(cudaFreeAsync(d_states, stream)); -} - -template -class Initializer { - public: - virtual ~Initializer() {} - virtual void initialize(T* data, size_t len, cudaStream_t stream) {} -}; - -template -class Zeros final : public Initializer { - public: - void initialize(T* data, const size_t len, cudaStream_t stream) override { - 
zeros(data, len, stream); - } -}; - -} // namespace initializers -} // namespace merlin -} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh deleted file mode 100644 index 271676a6e..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/memory_pool.cuh +++ /dev/null @@ -1,619 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include "debug.hpp" - -namespace nv { -namespace merlin { - -/** - * Allocators are used by the memory pool (and maybe other classes) to create - * RAII complient containers for buffers allocated in different memory areas. - */ -template -struct AllocatorBase { - using type = T; - using sync_unique_ptr = std::unique_ptr; - using async_unique_ptr = std::unique_ptr>; - using shared_ptr = std::shared_ptr; - - inline static sync_unique_ptr make_unique(size_t n) { - return sync_unique_ptr(Allocator::alloc(n)); - } - - inline static async_unique_ptr make_unique(size_t n, cudaStream_t stream) { - return {Allocator::alloc(n, stream), - [stream](type* p) { Allocator::free(p); }}; - } - - inline static shared_ptr make_shared(size_t n, cudaStream_t stream = 0) { - return {Allocator::alloc(n, stream), - [stream](type* p) { Allocator::free(p, stream); }}; - } - - inline void operator()(type* ptr) { Allocator::free(ptr); } -}; - -/** - * Trivial fallback implementation using the standard C++ allocator. This mostly - * exists to ensure interface correctness, and as an illustration of what a - * proper allocator implementation should look like. - */ -template -struct StandardAllocator final : AllocatorBase> { - using type = typename AllocatorBase>::type; - - static constexpr const char* name{"StandardAllocator"}; - - inline static type* alloc(size_t n, cudaStream_t stream = 0) { - return new type[n]; - } - - inline static void free(type* ptr, cudaStream_t stream = 0) { delete[] ptr; } -}; - -/** - * Claim/release buffers in pinned host memory. - */ -template -struct HostAllocator final : AllocatorBase> { - using type = typename AllocatorBase>::type; - - static constexpr const char* name{"HostAllocator"}; - - inline static type* alloc(size_t n, cudaStream_t stream = 0) { - void* ptr; - CUDA_CHECK(cudaMallocHost(&ptr, sizeof(T) * n)); - return reinterpret_cast(ptr); - } - - inline static void free(type* ptr, cudaStream_t stream = 0) { - CUDA_CHECK(cudaFreeHost(ptr)); - } -}; - -/** - * Claim/release buffers in the active CUDA device. Will not test if the correct - * device was used, and throw if CUDA runtime API response is negative. 
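The allocators in this file are normally consumed through the RAII factories inherited from AllocatorBase rather than by calling alloc/free directly. A hedged sketch of that pattern, assuming the templated allocators take the element type as a parameter; buffer names and the stream argument are illustrative:

// Sketch only: buffers are released automatically when the smart pointers
// leave scope (cudaFreeHost for pinned host memory, cudaFree/cudaFreeAsync
// for device memory when a stream deleter was captured).
#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

void allocator_usage_sketch(std::size_t n, cudaStream_t stream) {
  // Pinned host staging buffer.
  auto h_keys = nv::merlin::HostAllocator<uint64_t>::make_unique(n);

  // Device buffer whose deleter captures the stream for deferred release.
  auto d_keys = nv::merlin::DeviceAllocator<uint64_t>::make_shared(n, stream);

  // ... fill h_keys.get(), then cudaMemcpyAsync(d_keys.get(), h_keys.get(),
  //     n * sizeof(uint64_t), cudaMemcpyHostToDevice, stream); ...
}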
- */ -template -struct DeviceAllocator final : AllocatorBase> { - using type = typename AllocatorBase>::type; - - static constexpr const char* name{"DeviceAllocator"}; - - inline static type* alloc(size_t n, cudaStream_t stream = 0) { - void* ptr; - cudaError_t res; - if (stream) { - res = cudaMallocAsync(&ptr, sizeof(T) * n, stream); - } else { - res = cudaMalloc(&ptr, sizeof(T) * n); - } - CUDA_CHECK(res); - return reinterpret_cast(ptr); - } - - inline static void free(type* ptr, cudaStream_t stream = 0) { - cudaError_t res; - if (stream) { - res = cudaFreeAsync(ptr, stream); - } else { - res = cudaFree(ptr); - } - CUDA_CHECK(res); - } -}; - -/** - * Helper structure to configure a memory pool. - */ -struct MemoryPoolOptions { - size_t max_stock{4}; ///< Amount of buffers to keep in reserve. - size_t max_pending{16}; ///< Maximum amount of awaitable buffers. If this - ///< limit is exceeded threads will start to block. -}; - -/** - * Forward declares required to make templated ostream overload work. - */ -template -class MemoryPool; - -template -std::ostream& operator<<(std::ostream&, const MemoryPool&); - -/** - * CUDA deferred execution aware memory pool implementation. As for every memory - * pool, the general idea is to have resuable buffers. All buffers have the same - * size. - * - * General behavior: - * - * This memory pool implementation attempts to avoid blocking before the fact, - * but also avoids relying on a background worker. - * - * Buffer borrow and return semantics tightly align with C++ RAII principles. - * That is, if a workspace is requested, any borrowed buffers will be returned - * automatically when leaving the scope. - * - * You can either borrow a single buffer, or a workspace (that is multiple - * buffers). We support dynamic and static workspaces. Static workspaces have - * the benefit that they will never require heap memory (no hidden allocations). - * - * - * Buffer borrowing: - * - * If buffers are requested, we take them from the stock, if available. If the - * stock is depleted, we check if any pending buffer has been used up by the GPU - * and adds them to the stock. If was also not successful, we allocate a new - * buffer. Buffers or workspaces (groups of buffers). - * - * When borrowing a buffer a streaming context can be specified. This context is - * relevant for allocation and during returns. It is assumed that the stream you - * provide as context will be the stream where you queue the workload. Not doing - * so may lead to undefined behavior. - * - * Buffer return: - * - * If no context is provided, we cannot make any assumptions regarding the usage - * one the device. So we sychronize the device first and then return the buffer - * to the stock. If a streaming context was provided, we queue an event and add - * the buffer to the `pending` pool. That means, the buffer has been - * reqlinquished by the CPU, but may still be used by the GPU. If no pending - * slot is available, we probe the currently pending buffers events for - * completion. Completed pending buffers are returned to the reserve. If so, we - * queue the buffer in the freed slot. If that was unsucessful (i.e., all - * currently pending buffers are still in use by the GPU), we have no choice but - * the free the buffer using the current stream. - * - * In either case, `max_reserve` represents the maxmum size of the stock. If - * returning a buffer would lead to the stock exeeding this quantity, the buffer - * is queued for destruction. 
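Putting the borrowing and return rules above together, a typical call site asks for a static workspace of same-sized buffers on a stream and relies on RAII to hand them back. A sketch under those assumptions; buffer names, sizes, and the char-typed device allocator are illustrative:

#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

void workspace_usage_sketch(cudaStream_t stream, std::size_t batch) {
  using Pool = nv::merlin::MemoryPool<nv::merlin::DeviceAllocator<char>>;

  nv::merlin::MemoryPoolOptions opts;  // keep default max_stock / max_pending
  Pool pool(opts);

  {
    // A static workspace of two same-sized device buffers, each able to hold
    // `batch` keys, bound to `stream` for allocation and deferred return.
    auto ws = pool.get_workspace<2>(batch * sizeof(uint64_t), stream);
    uint64_t* d_keys = ws.get<uint64_t*>(0);
    uint64_t* d_scratch = ws.get<uint64_t*>(1);
    // ... enqueue kernels on `stream` that use d_keys / d_scratch ...
  }  // RAII return: an event is recorded so the buffers stay "pending" until
     // the GPU work queued on `stream` has completed.
}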
- */ -template -class MemoryPool final { - public: - using pool_type = MemoryPool; - using alloc_type = typename Allocator::type; - template - class Workspace { - public: - inline Workspace() : pool_{nullptr}, buffer_size_{0}, stream_{0} {} - - inline Workspace(pool_type* pool, cudaStream_t stream) - : pool_{pool}, buffer_size_{0}, stream_{stream} {} - - Workspace(const Workspace&) = delete; - - Workspace& operator=(const Workspace&) = delete; - - inline Workspace(Workspace&& other) - : pool_{other.pool_}, - buffer_size_{other.buffer_size_}, - stream_{other.stream_}, - buffers_{std::move(other.buffers_)} {} - - inline Workspace& operator=(Workspace&& other) { - if (pool_) { - pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_); - } - pool_ = other.pool_; - buffer_size_ = other.buffer_size_; - stream_ = other.stream_; - buffers_ = std::move(other.buffers_); - other.pool_ = nullptr; - return *this; - } - - inline ~Workspace() { - if (pool_) { - pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_); - } - } - - template - constexpr void at(const size_t n, T* ptr) const { - *ptr = at(n); - } - - template - constexpr T at(const size_t n) const { - return reinterpret_cast(buffers_.at(n)); - } - - template - constexpr void get(const size_t n, T* ptr) const { - *ptr = get(n); - } - - template - constexpr T get(const size_t n) const { - return reinterpret_cast(buffers_[n]); - } - - constexpr alloc_type* operator[](const size_t n) const { - return buffers_[n]; - } - - protected: - pool_type* pool_; - size_t buffer_size_; - cudaStream_t stream_; - Container buffers_; - }; - - template - class StaticWorkspace final : public Workspace> { - public: - using base_type = Workspace>; - - friend class MemoryPool; - - inline StaticWorkspace() : base_type() {} - - StaticWorkspace(const StaticWorkspace&) = delete; - - StaticWorkspace& operator=(const StaticWorkspace&) = delete; - - inline StaticWorkspace(StaticWorkspace&& other) - : base_type(std::move(other)) {} - - inline StaticWorkspace& operator=(StaticWorkspace&& other) { - base_type::operator=(std::move(other)); - return *this; - } - - private: - inline StaticWorkspace(pool_type* pool, size_t requested_buffer_size, - cudaStream_t stream) - : base_type(pool, stream) { - auto& buffers{this->buffers_}; - this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(), - requested_buffer_size, stream); - } - }; - - class DynamicWorkspace final : public Workspace> { - public: - using base_type = Workspace>; - - friend class MemoryPool; - - inline DynamicWorkspace() : base_type() {} - - DynamicWorkspace(const DynamicWorkspace&) = delete; - - DynamicWorkspace& operator=(const DynamicWorkspace&) = delete; - - inline DynamicWorkspace(DynamicWorkspace&& other) - : base_type(std::move(other)) {} - - inline DynamicWorkspace& operator=(DynamicWorkspace&& other) { - base_type::operator=(std::move(other)); - return *this; - } - - private: - inline DynamicWorkspace(pool_type* pool, size_t n, - size_t requested_buffer_size, cudaStream_t stream) - : base_type(pool, stream) { - auto& buffers{this->buffers_}; - buffers.resize(n); - this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(), - requested_buffer_size, stream); - } - }; - - MemoryPool(const MemoryPoolOptions& options) : options_{options} { - // Create initial buffer stock. - stock_.reserve(options_.max_stock); - - // Create enough events, so we have one per potentially pending buffer. 
- ready_events_.resize(options_.max_pending); - for (auto& ready_event : ready_events_) { - CUDA_CHECK(cudaEventCreate(&ready_event)); - } - - // Preallocate pending. - pending_.reserve(options_.max_pending); - } - - ~MemoryPool() { - // Make sure all queued tasks are complete. - await_pending(); - - // Free event and buffer memory. - for (auto& ready_event : ready_events_) { - CUDA_CHECK(cudaEventDestroy(ready_event)); - } - - // Any remaining buffers need to be properly unallocated. - deplete_stock(); - } - - inline size_t buffer_size() const { return buffer_size_; } - - inline size_t max_batch_size(size_t max_item_size) const { - return buffer_size_ / max_item_size; - } - - template - inline size_t max_batch_size() const { - return max_batch_size(sizeof(T)); - } - - size_t current_stock() const { - std::lock_guard lock(mutex_); - return stock_.size(); - } - - size_t num_pending() const { - std::lock_guard lock(mutex_); - return pending_.size(); - } - - void await_pending(cudaStream_t stream = 0) { - std::lock_guard lock(mutex_); - while (!pending_.empty()) { - collect_pending_unsafe(stream); - if (pending_.empty()) { - break; - } - std::this_thread::yield(); - } - } - - void deplete_stock() { - std::lock_guard lock(mutex_); - for (auto& ptr : stock_) { - Allocator::free(ptr); - } - stock_.clear(); - } - - inline std::unique_ptr> - get_unique(size_t requested_buffer_size, cudaStream_t stream = 0) { - alloc_type* ptr; - const size_t allocation_size = - get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream); - return {ptr, [this, allocation_size, stream](alloc_type* p) { - put_raw(&p, (&p) + 1, allocation_size, stream); - }}; - } - - inline std::shared_ptr get_shared(size_t requested_buffer_size, - cudaStream_t stream = 0) { - alloc_type* ptr; - const size_t allocation_size = - get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream); - return {ptr, [this, allocation_size, stream](alloc_type* p) { - put_raw(&p, (&p) + 1, allocation_size, stream); - }}; - } - - template - inline StaticWorkspace get_workspace(size_t requested_buffer_size, - cudaStream_t stream = 0) { - return {this, requested_buffer_size, stream}; - } - - inline DynamicWorkspace get_workspace(size_t n, size_t requested_buffer_size, - cudaStream_t stream = 0) { - return {this, n, requested_buffer_size, stream}; - } - - friend std::ostream& operator<<(std::ostream&, const MemoryPool&); - - private: - inline void collect_pending_unsafe(cudaStream_t stream) { - auto it{std::remove_if( - pending_.begin(), pending_.end(), [this, stream](const auto& pending) { - const cudaError_t state{cudaEventQuery(std::get<2>(pending))}; - switch (state) { - case cudaSuccess: - // Stock buffers and destroy those that are no - // longer needed, but only if the allocation_size - // is still the same as the current buffer_size. - if (stock_.size() < options_.max_stock && - std::get<1>(pending) == buffer_size_) { - stock_.emplace_back(std::get<0>(pending)); - } else { - Allocator::free(std::get<0>(pending), stream); - } - ready_events_.emplace_back(std::get<2>(pending)); - return true; - case cudaErrorNotReady: - return false; - default: - CUDA_CHECK(state); - return false; - } - })}; - pending_.erase(it, pending_.end()); - } - - inline void clear_stock_unsafe(cudaStream_t stream) { - for (auto& ptr : stock_) { - Allocator::free(ptr, stream); - } - stock_.clear(); - } - - template - inline size_t get_raw(Iterator first, Iterator const last, - size_t requested_buffer_size, cudaStream_t stream) { - // Get pre-allocated buffers if stock available. 
- size_t allocation_size; - { - std::lock_guard lock(mutex_); - - // If requested_buffer_size is within current buffer_size margins can - // reuse current buffers. - if (requested_buffer_size <= buffer_size_) { - while (first != last) { - // If no buffers available, try to make some available. - if (stock_.empty()) { - collect_pending_unsafe(stream); - if (stock_.empty()) { - // No buffers available. - break; - } - } - - // Just take the next available buffer. - *first++ = stock_.back(); - stock_.pop_back(); - } - } else { - // Drop the stock because we need more memory and those buffers have - // become useless to that end. - clear_stock_unsafe(stream); - buffer_size_ = requested_buffer_size; - } - - allocation_size = buffer_size_; - } - - // Forge new buffers until request can be filled. - for (; first != last; ++first) { - *first = Allocator::alloc(allocation_size, stream); - } - - return allocation_size; - } - - template - inline void put_raw(Iterator first, Iterator const last, - size_t allocation_size, cudaStream_t stream) { - std::lock_guard lock(mutex_); - - // If allocation_size of the workspace differs from the current buffer_size - // (i.e., somebody else requested a larger buffer since the original request - // occured), the provided buffers are incompatible and have to be discarded. - if (allocation_size != buffer_size_) { - while (first != last) { - Allocator::free(*first++); - } - return; - } - - // If the workspace that borrowed a stream was moved out of the RAII scope - // where it was created, it could happen that the stream was destroyed when - // we return the buffer ownershup. This `cudaStreamQuery` will prevent that. - if (stream && cudaStreamQuery(stream) != cudaErrorInvalidResourceHandle) { - for (; first != last; ++first) { - // Avoid adding already deallocated buffers. - if (*first == nullptr) { - continue; - } - - // Spin lock if too many pending buffers (i.e., let CPU wait for GPU). - while (ready_events_.empty()) { - collect_pending_unsafe(stream); - if (!ready_events_.empty()) { - break; - } - std::this_thread::yield(); - } - - // Queue buffer. - cudaEvent_t ready_event{ready_events_.back()}; - ready_events_.pop_back(); - CUDA_CHECK(cudaEventRecord(ready_event, stream)); - pending_.emplace_back(*first, allocation_size, ready_event); - } - } else { - // Without stream context, we must force a hard sync with the GPU. - CUDA_CHECK(cudaDeviceSynchronize()); - - for (; first != last; ++first) { - // Avoid adding already deallocated buffers. - if (*first == nullptr) { - continue; - } - - // Stock buffers and destroy those that are no longer needed. - if (stock_.size() < options_.max_stock) { - stock_.emplace_back(*first); - } else { - Allocator::free(*first); - } - } - } - } - - const MemoryPoolOptions options_; - - mutable std::mutex mutex_; - size_t buffer_size_{1}; - std::vector stock_; - std::vector ready_events_; - - std::vector> pending_; -}; - -template -std::ostream& operator<<(std::ostream& os, const MemoryPool& pool) { - std::lock_guard lock(pool.mutex_); - - for (size_t i{0}; i < 80; ++i) { - os << '-'; - } - - // Current stock. - os << "\nStock =\n"; - for (size_t i{0}; i < pool.stock_.size(); ++i) { - os << "[ " << i << " ] buffer " << static_cast(pool.stock_[i]) - << ", size = " << pool.buffer_size_ << '\n'; - } - - // Pending buffers. 
- os << "\nPending =\n"; - for (size_t i{0}; i < pool.pending_.size(); ++i) { - os << "[ " << i - << " ] buffer = " << static_cast(std::get<0>(pool.pending_[i])) - << ", size = " << std::get<1>(pool.pending_[i]) << ", ready_event = " - << static_cast(std::get<2>(pool.pending_[i])) << '\n'; - } - - // Available ready events. - os << "\nReady Events =\n"; - for (size_t i{0}; i < pool.ready_events_.size(); ++i) { - os << "[ " << i << " ] " << static_cast(pool.ready_events_[i]) - << '\n'; - } - - for (size_t i{0}; i < 80; ++i) { - os << '-'; - } - - os << '\n'; - return os; -} - -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh deleted file mode 100644 index b3fc1cc70..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/optimizers.cuh +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include "types.cuh" -#include "utils.cuh" - -namespace nv { -namespace merlin { -namespace optimizers { - -template -__global__ void adam_update_kernel(int len, float* weight, T* m, T* v, - const T* wgrad, float alpha_t, float beta1, - float beta2, float epsilon, float scaler) { - const int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - float gi = TypeConvertFunc::convert(wgrad[i]) / scaler; - float mi = - beta1 * TypeConvertFunc::convert(m[i]) + (1.f - beta1) * gi; - float vi = beta2 * TypeConvertFunc::convert(v[i]) + - (1.f - beta2) * gi * gi; - m[i] = TypeConvertFunc::convert(mi); - v[i] = TypeConvertFunc::convert(vi); - weight[i] -= alpha_t * mi / (sqrt(vi) + epsilon); - } -} - -template -__global__ void ada_grad_update_kernel(int len, float* weight, const T* wgrad, - T* sum, float lr, const float epsilon, - float scaler) { - const int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - float gi = TypeConvertFunc::convert(wgrad[i]) / scaler; - float accum_ = TypeConvertFunc::convert(__ldg(&sum[i])); - accum_ += gi * gi; - float std_ = epsilon + sqrtf(accum_); - weight[i] -= lr * gi / std_; - sum[i] = TypeConvertFunc::convert(accum_); - } -} - -template -__global__ void momentum_sgd_update_kernel(int len, float* weight, T* momentum, - const T* wgrad, float lr, - float momentum_factor, - float scaler) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - if (idx < len) { - float mv = - momentum_factor * TypeConvertFunc::convert(momentum[idx]) - - lr * TypeConvertFunc::convert(wgrad[idx]) / scaler; - momentum[idx] = TypeConvertFunc::convert(mv); - weight[idx] += mv; - } - return; -} - -} // namespace optimizers -} // namespace merlin -} // namespace nv \ No newline at end of file diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh 
b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh deleted file mode 100644 index 210acd4a5..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/types.cuh +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace nv { -namespace merlin { - -/** - * Shorthand for a Key-Value-Meta tuple. - */ -template -struct KVM { - K key; - V* value; - M meta; -}; - -constexpr uint64_t EMPTY_KEY = UINT64_C(0xFFFFFFFFFFFFFFFF); -constexpr uint64_t RECLAIM_KEY = UINT64_C(0xFFFFFFFFFFFFFFFE); -constexpr uint64_t VACANT_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFE); -constexpr uint64_t LOCKED_KEY = UINT64_C(0xFFFFFFFFFFFFFFFD); -constexpr uint64_t RESERVED_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFC); -constexpr uint64_t MAX_META = UINT64_C(0xFFFFFFFFFFFFFFFF); -constexpr uint64_t EMPTY_META = UINT64_C(0); - -#define IS_RESERVED_KEY(key) ((RESERVED_KEY_MASK & (key)) == RESERVED_KEY_MASK) -#define IS_VACANT_KEY(key) ((VACANT_KEY_MASK & (key)) == VACANT_KEY_MASK) - -template -using AtomicKey = cuda::atomic; - -template -using AtomicMeta = cuda::atomic; - -template -using AtomicPos = cuda::atomic; - -template -struct Bucket { - AtomicKey* keys_; - AtomicMeta* metas_; - V* vectors; // Pinned memory or HBM - - /* For upsert_kernel without user specified metas - recording the current meta, the cur_meta will - increment by 1 when a new inserting happens. */ - AtomicMeta cur_meta; - - /* min_meta and min_pos is for or upsert_kernel - with user specified meta. They record the minimum - meta and its pos in the bucket. */ - AtomicMeta min_meta; - AtomicPos min_pos; - - __forceinline__ __device__ AtomicKey* keys(int index) const { - return keys_ + index; - } - - __forceinline__ __device__ AtomicMeta* metas(int index) const { - return metas_ + index; - } -}; - -template -class Lock { - mutable cuda::atomic _lock; - - public: - __device__ Lock() : _lock{1} {} - - template - __forceinline__ __device__ void acquire(CG const& g, - unsigned long long lane = 0) const { - if (g.thread_rank() == lane) { - T expected = 1; - while (!_lock.compare_exchange_weak(expected, 2, - cuda::std::memory_order_acquire)) { - expected = 1; - } - } - g.sync(); - } - - template - __forceinline__ __device__ void release(CG const& g, - unsigned long long lane = 0) const { - g.sync(); - if (g.thread_rank() == lane) { - _lock.store(1, cuda::std::memory_order_release); - } - } -}; - -using Mutex = Lock; - -template -struct Table { - Bucket* buckets; - Mutex* locks; // mutex for write buckets - int* buckets_size; // size of each buckets. - V** slices; // Handles of the HBM/ HMEM slices. - size_t dim; // Dimension of the `vectors`. - size_t bytes_per_slice; // Size by byte of one slice. - size_t num_of_memory_slices; // Number of vectors memory slices. - size_t capacity = 134217728; // Initial capacity. 
- size_t max_size = - std::numeric_limits::max(); // Up limit of the table capacity. - size_t buckets_num; // Number of the buckets. - size_t bucket_max_size = 128; // Volume of each buckets. - size_t max_hbm_for_vectors = 0; // Max HBM allocated for vectors - size_t remaining_hbm_for_vectors = 0; // Remaining HBM allocated for vectors - bool is_pure_hbm = true; // unused - bool primary = true; // unused - int slots_offset = 0; // unused - int slots_number = 0; // unused - int device_id = 0; // Device id - int tile_size; -}; - -template -using EraseIfPredictInternal = - bool (*)(const K& key, ///< iterated key in table - M& meta, ///< iterated meta in table - const K& pattern, ///< input key from caller - const M& threshold ///< input meta from caller - ); - -/** - * An abstract class provides interface between the nv::merlin::HashTable - * and a file, which enables the table to save to the file or load from - * the file, by overriding the `read` and `write` method. - * - * @tparam K The data type of the key. - * @tparam V The data type of the vector's elements. - * The item data type should be a basic data type of C++/CUDA. - * @tparam M The data type for `meta`. - * The currently supported data type is only `uint64_t`. - * - */ -template -class BaseKVFile { - public: - virtual ~BaseKVFile() {} - - /** - * Read from file and fill into the keys, values, and metas buffer. - * When calling save/load method from table, it can assume that the - * received buffer of keys, vectors, and metas are automatically - * pre-allocated. - * - * @param n The number of KV pairs expect to read. `int64_t` was used - * here to adapt to various filesytem and formats. - * @param dim The dimension of the `vectors`. - * @param keys The pointer to received buffer for keys. - * @param vectors The pointer to received buffer for vectors. - * @param metas The pointer to received buffer for metas. - * - * @return Number of KV pairs have been successfully read. - */ - virtual size_t read(const size_t n, const size_t dim, K* keys, V* vectors, - M* metas) = 0; - - /** - * Write keys, values, metas from table to the file. It defines - * an abstract method to get batch of KV pairs and write them into - * file. - * - * @param n The number of KV pairs to be written. `int64_t` was used - * here to adapt to various filesytem and formats. - * @param dim The dimension of the `vectors`. - * @param keys The keys will be written to file. - * @param vectors The vectors of values will be written to file. - * @param metas The metas will be written to file. - * - * @return Number of KV pairs have been successfully written. - */ - virtual size_t write(const size_t n, const size_t dim, const K* keys, - const V* vectors, const M* metas) = 0; -}; - -enum class OccupyResult { - INITIAL, ///< Initial status - CONTINUE, ///< Insert did not succeed, continue trying to insert - OCCUPIED_EMPTY, ///< New pair inserted successfully - OCCUPIED_RECLAIMED, - DUPLICATE, ///< Insert did not succeed, key is already present - EVICT, ///< Insert succeeded by evicting one key with minimum meta. - REFUSED, ///< Insert did not succeed, insert meta is too low. -}; - -enum class OverrideResult { - INITIAL, ///< Initial status - CONTINUE, ///< Override did not succeed, continue trying to override - SUCCESS, ///< Override successfully - REFUSED, ///< Override is refused. 
-}; - -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh deleted file mode 100644 index bd60f93fc..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin/utils.cuh +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include "cuda_fp16.h" -#include "cuda_runtime_api.h" -#include "debug.hpp" -#include "tensorflow_recommenders_addons/dynamic_embedding/core/lib/utils/cuda_utils.cuh" - -using namespace cooperative_groups; -namespace cg = cooperative_groups; - -/* -__inline__ __device__ uint64_t atomicCAS(uint64_t* address, uint64_t compare, - uint64_t val) { - return (uint64_t)atomicCAS((unsigned long long*)address, - (unsigned long long)compare, - (unsigned long long)val); -} - -__inline__ __device__ int64_t atomicCAS(int64_t* address, int64_t compare, - int64_t val) { - return (int64_t)atomicCAS((unsigned long long*)address, - (unsigned long long)compare, - (unsigned long long)val); -} -*/ - -__inline__ __device__ uint64_t atomicExch(uint64_t* address, uint64_t val) { - return (uint64_t)atomicExch((unsigned long long*)address, - (unsigned long long)val); -} - -__inline__ __device__ int64_t atomicExch(int64_t* address, int64_t val) { - return (int64_t)atomicExch((unsigned long long*)address, - (unsigned long long)val); -} - -__inline__ __device__ signed char atomicExch(signed char* address, - signed char val) { - signed char old = *address; - *address = val; - return old; -} - -/* -__inline__ __device__ int64_t atomicAdd(int64_t* address, const int64_t val) { - return (int64_t)atomicAdd((unsigned long long*)address, val); -} - -__inline__ __device__ uint64_t atomicAdd(uint64_t* address, - const uint64_t val) { - return (uint64_t)atomicAdd((unsigned long long*)address, val); -} -*/ - -namespace nv { -namespace merlin { - -inline void __cudaCheckError(const char* file, const int line) { -#ifdef CUDA_ERROR_CHECK - cudaError err = cudaGetLastError(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n", file, line, - cudaGetErrorString(err)); - exit(-1); - } - - // More careful checking. However, this will affect performance. - // Comment away if needed. - err = cudaDeviceSynchronize(); - if (cudaSuccess != err) { - fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", file, - line, cudaGetErrorString(err)); - exit(-1); - } -#endif - - return; -} -//#define CudaCheckError() nv::merlin::__cudaCheckError(__FILE__, __LINE__) -#define CudaCheckError() {} - -static inline size_t SAFE_GET_GRID_SIZE(size_t N, int block_size) { - return ((N) > std::numeric_limits::max()) - ? 
((1 << 30 - 1) / block_size + 1) - : (((N)-1) / block_size + 1); -} - -static inline int SAFE_GET_BLOCK_SIZE(int block_size, int device = -1) { - cudaDeviceProp prop; - int current_device = device; - if (current_device == -1) { - CUDA_CHECK(cudaGetDevice(¤t_device)); - } - CUDA_CHECK(cudaGetDeviceProperties(&prop, current_device)); - if (block_size > prop.maxThreadsPerBlock) { - fprintf(stdout, - "The requested block_size=%d exceeds the device limit, " - "the maxThreadsPerBlock=%d will be applied.\n", - block_size, prop.maxThreadsPerBlock); - } - return std::min(prop.maxThreadsPerBlock, block_size); -} - -inline uint64_t Murmur3HashHost(const uint64_t& key) { - uint64_t k = key; - k ^= k >> 33; - k *= UINT64_C(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= UINT64_C(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - return k; -} - -__inline__ __device__ uint64_t Murmur3HashDevice(uint64_t const& key) { - uint64_t k = key; - k ^= k >> 33; - k *= UINT64_C(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= UINT64_C(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - return k; -} - -__inline__ __device__ int64_t Murmur3HashDevice(int64_t const& key) { - uint64_t k = uint64_t(key); - k ^= k >> 33; - k *= UINT64_C(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= UINT64_C(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - return int64_t(k); -} - -__inline__ __device__ uint32_t Murmur3HashDevice(uint32_t const& key) { - uint32_t k = key; - k ^= k >> 16; - k *= UINT32_C(0x85ebca6b); - k ^= k >> 13; - k *= UINT32_C(0xc2b2ae35); - k ^= k >> 16; - - return k; -} - -__inline__ __device__ int32_t Murmur3HashDevice(int32_t const& key) { - uint32_t k = uint32_t(key); - k ^= k >> 16; - k *= UINT32_C(0x85ebca6b); - k ^= k >> 13; - k *= UINT32_C(0xc2b2ae35); - k ^= k >> 16; - - return int32_t(k); -} - -class CudaDeviceRestorer { - public: - CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); } - ~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); } - - private: - int dev_; -}; - -static inline int get_dev(const void* ptr) { - cudaPointerAttributes attr; - CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); - int dev = -1; - -#if CUDART_VERSION >= 10000 - if (attr.type == cudaMemoryTypeDevice) -#else - if (attr.memoryType == cudaMemoryTypeDevice) -#endif - { - dev = attr.device; - } - return dev; -} - -static inline void switch_to_dev(const void* ptr) { - int dev = get_dev(ptr); - if (dev >= 0) { - CUDA_CHECK(cudaSetDevice(dev)); - } -} - -static inline bool is_on_device(const void* ptr) { - cudaPointerAttributes attr; - CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); - -#if CUDART_VERSION >= 10000 - return (attr.type == cudaMemoryTypeDevice); -#else - return (attr.memoryType == cudaMemoryTypeDevice); -#endif -} - -template -struct TypeConvertFunc; - -template <> -struct TypeConvertFunc<__half, float> { - static __forceinline__ __device__ __half convert(float val) { - return __float2half(val); - } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ float convert(__half val) { - return __half2float(val); - } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ float convert(float val) { return val; } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ float convert(long long val) { - return static_cast(val); - } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ float convert(unsigned int val) { - return static_cast(val); - } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ int convert(long long val) { - 
return static_cast(val); - } -}; - -template <> -struct TypeConvertFunc { - static __forceinline__ __device__ int convert(unsigned int val) { - return static_cast(val); - } -}; - -template -void realloc(P* ptr, size_t old_size, size_t new_size) { - // Truncate old_size to limit dowstream copy ops. - old_size = std::min(old_size, new_size); - - // Alloc new buffer and copy at old data. - char* new_ptr; - CUDA_CHECK(cudaMalloc(&new_ptr, new_size)); - if (*ptr != nullptr) { - CUDA_CHECK(cudaMemcpy(new_ptr, *ptr, old_size, cudaMemcpyDefault)); - CUDA_CHECK(cudaFree(*ptr)); - } - - // Zero-fill remainder. - CUDA_CHECK(cudaMemset(new_ptr + old_size, 0, new_size - old_size)); - - // Switch to new pointer. - *ptr = reinterpret_cast
<P>
(new_ptr); - return; -} - -template -void realloc_managed(P* ptr, size_t old_size, size_t new_size) { - // Truncate old_size to limit dowstream copy ops. - old_size = std::min(old_size, new_size); - - // Alloc new buffer and copy at old data. - char* new_ptr; - CUDA_CHECK(cudaMallocManaged(&new_ptr, new_size)); - if (*ptr != nullptr) { - CUDA_CHECK(cudaMemcpy(new_ptr, *ptr, old_size, cudaMemcpyDefault)); - CUDA_CHECK(cudaFree(*ptr)); - } - - // Zero-fill remainder. - CUDA_CHECK(cudaMemset(new_ptr + old_size, 0, new_size - old_size)); - - // Switch to new pointer. - *ptr = reinterpret_cast
<P>
(new_ptr); - return; -} - -template -__forceinline__ __device__ void lock( - const cg::thread_block_tile& tile, mutex& set_mutex, - unsigned long long lane = 0) { - if (THREAD_SAFE) { - set_mutex.acquire(tile, lane); - } -} - -template -__forceinline__ __device__ void unlock( - const cg::thread_block_tile& tile, mutex& set_mutex, - unsigned long long lane = 0) { - if (THREAD_SAFE) { - set_mutex.release(tile, lane); - } -} - -inline void free_pointers(cudaStream_t stream, int n, ...) { - va_list args; - va_start(args, n); - void* ptr = nullptr; - for (int i = 0; i < n; i++) { - ptr = va_arg(args, void*); - if (ptr) { - cudaPointerAttributes attr; - memset(&attr, 0, sizeof(cudaPointerAttributes)); - try { - CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); - if (attr.devicePointer && (!attr.hostPointer)) { - CUDA_CHECK(cudaFreeAsync(ptr, stream)); - } else if (attr.devicePointer && attr.hostPointer) { - CUDA_CHECK(cudaFreeHost(ptr)); - } else { - free(ptr); - } - } catch (const nv::merlin::CudaException& e) { - va_end(args); - throw e; - } - } - } - va_end(args); -} - -#define CUDA_FREE_POINTERS(stream, ...) \ - nv::merlin::free_pointers( \ - stream, (sizeof((void*[]){__VA_ARGS__}) / sizeof(void*)), __VA_ARGS__); - -static inline size_t GB(size_t n) { return n << 30; } - -static inline size_t MB(size_t n) { return n << 20; } - -static inline size_t KB(size_t n) { return n << 10; } - -} // namespace merlin -} // namespace nv diff --git a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh b/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh deleted file mode 100644 index 19da172ea..000000000 --- a/tensorflow_recommenders_addons/dynamic_embedding/core/lib/merlin_inc/merlin_hashtable.cuh +++ /dev/null @@ -1,1643 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "merlin/array_kernels.cuh" -#include "merlin/core_kernels.cuh" -#include "merlin/flexible_buffer.cuh" -#include "merlin/group_lock.hpp" -#include "merlin/memory_pool.cuh" -#include "merlin/types.cuh" -#include "merlin/utils.cuh" - -namespace nv { -namespace merlin { - -/** - * @brief Enumeration of the eviction strategies. - * - * @note The `meta` is introduced to define the importance of each key, the - * larger, the more important, the less likely they will be evicted. On `kLru` - * mode, the `metas` parameter of the APIs should keep `nullptr`, the meta for - * each key is assigned internally in LRU(Least Recently Used) policy. On - * `kCustomized` mode, the `metas` should be provided by caller. - * - * @note Eviction occurs automatically when a bucket is full. The keys with the - * minimum `meta` value are evicted first. - * - */ -enum class EvictStrategy { - kLru = 0, ///< LRU mode. - kCustomized = 1 ///< Customized mode. 
-}; - -/** - * @brief The options struct of HierarchicalKV. - */ -struct HashTableOptions { - size_t init_capacity = 0; ///< The initial capacity of the hash table. - size_t max_capacity = 0; ///< The maximum capacity of the hash table. - size_t max_hbm_for_vectors = 0; ///< The maximum HBM for vectors, in bytes. - size_t max_bucket_size = 128; ///< The length of each bucket. - size_t dim = 64; ///< The dimension of the vectors. - float max_load_factor = 0.5f; ///< The max load factor before rehashing. - int block_size = 128; ///< The default block size for CUDA kernels. - int io_block_size = 1024; ///< The block size for IO CUDA kernels. - int device_id = -1; ///< The ID of device. - bool io_by_cpu = false; ///< The flag indicating if the CPU handles IO. - EvictStrategy evict_strategy = EvictStrategy::kLru; ///< The evict strategy. - bool use_constant_memory = false; ///< reserved - MemoryPoolOptions - device_memory_pool; ///< Configuration options for device memory pool. - MemoryPoolOptions - host_memory_pool; ///< Configuration options for host memory pool. -}; - -/** - * @brief A customizable template function indicates which keys should be - * erased from the hash table by returning `true`. - * - * @note The `erase_if` or `export_batch_if` API traverses all of the items by - * this function and the items that return `true` are removed or exported. - * - * Example for erase_if: - * - * ``` - * template - * __forceinline__ __device__ bool erase_if_pred(const K& key, - * M& meta, - * const K& pattern, - * const M& threshold) { - * return ((key & 0xFFFF000000000000 == pattern) && - * (meta < threshold)); - * } - * ``` - * - * Example for export_batch_if: - * ``` - * template - * __forceinline__ __device__ bool export_if_pred(const K& key, - * M& meta, - * const K& pattern, - * const M& threshold) { - * return meta >= threshold; - * } - * ``` - */ -template -using EraseIfPredict = bool (*)( - const K& key, ///< The traversed key in a hash table. - M& meta, ///< The traversed meta in a hash table. - const K& pattern, ///< The key pattern to compare with the `key` argument. - const M& threshold ///< The threshold to compare with the `meta` argument. -); - -/** - * A HierarchicalKV hash table is a concurrent and hierarchical hash table that - * is powered by GPUs and can use HBM and host memory as storage for key-value - * pairs. Support for SSD storage is a future consideration. - * - * The `meta` is introduced to define the importance of each key, the - * larger, the more important, the less likely they will be evicted. Eviction - * occurs automatically when a bucket is full. The keys with the minimum `meta` - * value are evicted first. In a customized eviction strategy, we recommend - * using the timestamp or frequency of the key occurrence as the `meta` value - * for each key. You can also assign a special value to the `meta` to - * perform a customized eviction strategy. - * - * @note By default configuration, this class is thread-safe. - * - * @tparam K The data type of the key. - * @tparam V The data type of the vector's item type. - * The item data type should be a basic data type of C++/CUDA. - * @tparam M The data type for `meta`. - * The currently supported data type is only `uint64_t`. 
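For reference, a minimal host-side sketch of constructing and initializing such a table. The `<uint64_t, float, uint64_t>` template arguments, capacities, and HBM budget below are illustrative assumptions, not values taken from this change:

```cpp
#include "merlin_hashtable.cuh"

// Assumed instantiation: key = uint64_t, value item = float, meta = uint64_t.
using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

int main() {
  nv::merlin::HashTableOptions options;
  options.init_capacity = 1 << 20;                  // start with ~1M keys
  options.max_capacity = 1 << 22;                   // never rehash beyond ~4M keys
  options.dim = 64;                                 // 64-float vector per key
  options.max_hbm_for_vectors = nv::merlin::GB(1);  // vectors beyond 1 GB spill to host memory
  options.evict_strategy = nv::merlin::EvictStrategy::kLru;  // metas stay nullptr in the APIs

  Table table;
  table.init(options);  // must precede any insert/find/erase call
  return 0;
}
```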
- * - */ -template -class HashTable { - public: - using size_type = size_t; - using key_type = K; - using value_type = V; - using meta_type = M; - using Pred = EraseIfPredict; - - private: - using TableCore = nv::merlin::Table; - static constexpr unsigned int TILE_SIZE = 4; - - using DeviceMemoryPool = MemoryPool>; - using HostMemoryPool = MemoryPool>; - -#if THRUST_VERSION >= 101600 - static constexpr auto thrust_par = thrust::cuda::par_nosync; -#else - static constexpr auto thrust_par = thrust::cuda::par; -#endif - - public: - /** - * @brief Default constructor for the hash table class. - */ - HashTable(){}; - - /** - * @brief Frees the resources used by the hash table and destroys the hash - * table object. - */ - ~HashTable() { - if (initialized_) { - CUDA_CHECK(cudaDeviceSynchronize()); - - initialized_ = false; - destroy_table(&table_); - CUDA_CHECK(cudaFree(d_table_)); - dev_mem_pool_.reset(); - host_mem_pool_.reset(); - } - } - - private: - HashTable(const HashTable&) = delete; - HashTable& operator=(const HashTable&) = delete; - HashTable(HashTable&&) = delete; - HashTable& operator=(HashTable&&) = delete; - - public: - /** - * @brief Initialize a merlin::HashTable. - * - * @param options The configuration options. - */ - void init(const HashTableOptions options) { - if (initialized_) { - return; - } - options_ = options; - - if (options_.device_id >= 0) { - CUDA_CHECK(cudaSetDevice(options_.device_id)); - } else { - CUDA_CHECK(cudaGetDevice(&(options_.device_id))); - } - - // Construct table. - cudaDeviceProp deviceProp; - CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, options_.device_id)); - shared_mem_size_ = deviceProp.sharedMemPerBlock; - create_table( - &table_, options_.dim, options_.init_capacity, options_.max_capacity, - options_.max_hbm_for_vectors, options_.max_bucket_size); - options_.block_size = SAFE_GET_BLOCK_SIZE(options_.block_size); - reach_max_capacity_ = (options_.init_capacity * 2 > options_.max_capacity); - MERLIN_CHECK((!(options_.io_by_cpu && options_.max_hbm_for_vectors != 0)), - "[HierarchicalKV] `io_by_cpu` should not be true when " - "`max_hbm_for_vectors` is not 0!"); - CUDA_CHECK(cudaMalloc((void**)&(d_table_), sizeof(TableCore))); - - sync_table_configuration(); - - // Create memory pools. - dev_mem_pool_ = std::make_unique>>( - options_.device_memory_pool); - host_mem_pool_ = std::make_unique>>( - options_.host_memory_pool); - - CUDA_CHECK(cudaDeviceSynchronize()); - initialized_ = true; - CudaCheckError(); - } - - /** - * @brief Insert new key-value-meta tuples into the hash table. - * If the key already exists, the values and metas are assigned new values. - * - * If the target bucket is full, the keys with minimum meta will be - * overwritten by new key unless the meta of the new key is even less than - * minimum meta of the target bucket. - * - * @param n Number of key-value-meta tuples to insert or assign. - * @param keys The keys to insert on GPU-accessible memory with shape - * (n). - * @param values The values to insert on GPU-accessible memory with - * shape (n, DIM). - * @param metas The metas to insert on GPU-accessible memory with shape - * (n). - * @parblock - * The metas should be a `uint64_t` value. You can specify a value that - * such as the timestamp of the key insertion, number of the key - * occurrences, or another value to perform a custom eviction strategy. - * - * The @p metas should be `nullptr`, when the LRU eviction strategy is - * applied. 
- * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. - * - * @param ignore_evict_strategy A boolean option indicating whether if - * the insert_or_assign ignores the evict strategy of table with current - * metas anyway. If true, it does not check whether the metas confroms to - * the evict strategy. If false, it requires the metas follow the evict - * strategy of table. - */ - void insert_or_assign(const size_type n, - const key_type* keys, // (n) - const value_type* values, // (n, DIM) - const meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0, - bool ignore_evict_strategy = false) { - if (n == 0) { - return; - } - - while (!reach_max_capacity_ && - fast_load_factor(n, stream) > options_.max_load_factor) { - reserve(capacity() * 2, stream); - } - - if (!ignore_evict_strategy) { - check_evict_strategy(metas); - } - - writer_shared_lock lock(mutex_); - - if (is_fast_mode()) { - using Selector = - SelectUpsertKernelWithIO; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - - Selector::execute_kernel( - load_factor, options_.block_size, options_.max_bucket_size, - table_->buckets_num, options_.dim, stream, n, d_table_, keys, - reinterpret_cast(values), metas); - } else { - const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto d_dst{dev_ws.get(0)}; - auto d_src_offset{reinterpret_cast(d_dst + n)}; - - CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - upsert_kernel - <<>>( - d_table_, options_.max_bucket_size, table_->buckets_num, - options_.dim, keys, d_dst, metas, d_src_offset, N); - } - - { - thrust::device_ptr d_dst_ptr( - reinterpret_cast(d_dst)); - thrust::device_ptr d_src_offset_ptr(d_src_offset); - - thrust::sort_by_key(thrust_par.on(stream), d_dst_ptr, d_dst_ptr + n, - d_src_offset_ptr, thrust::less()); - } - - if (options_.io_by_cpu) { - const size_type host_ws_size{dev_ws_size + - n * sizeof(value_type) * dim()}; - auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; - auto h_dst{host_ws.get(0)}; - auto h_src_offset{reinterpret_cast(h_dst + n)}; - auto h_values{reinterpret_cast(h_src_offset + n)}; - - CUDA_CHECK(cudaMemcpyAsync(h_dst, d_dst, dev_ws_size, - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(h_values, values, host_ws_size - dev_ws_size, - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - write_by_cpu(h_dst, h_values, h_src_offset, dim(), n); - } else { - const size_t block_size = options_.io_block_size; - const size_t N = n * dim(); - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - write_kernel - <<>>(values, d_dst, d_src_offset, - dim(), N); - } - } - - CudaCheckError(); - } - - /** - * @brief Insert new key-value-meta tuples into the hash table. - * If the key already exists, the values and metas are assigned new values. - * - * If the target bucket is full, the keys with minimum meta will be - * overwritten by new key unless the meta of the new key is even less than - * minimum meta of the target bucket. 
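As a usage sketch for the `insert_or_assign` API documented above (buffer names and sizes are hypothetical; in LRU mode `metas` is left as `nullptr`):

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

void upsert_batch(Table& table, size_t n, size_t dim, cudaStream_t stream) {
  uint64_t* d_keys = nullptr;
  float* d_values = nullptr;
  cudaMallocAsync(&d_keys, n * sizeof(uint64_t), stream);
  cudaMallocAsync(&d_values, n * dim * sizeof(float), stream);
  // ... fill d_keys and d_values on the device ...

  // LRU table: metas are assigned internally, so pass nullptr.
  table.insert_or_assign(n, d_keys, d_values, /*metas=*/nullptr, stream);

  cudaFreeAsync(d_keys, stream);
  cudaFreeAsync(d_values, stream);
  cudaStreamSynchronize(stream);
}
```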
The overwritten key with minimum - * meta will be evicted, with its values and meta, to evicted_keys, - * evicted_values, evcted_metas seperately in compact format. - * - * @param n Number of key-value-meta tuples to insert or assign. - * @param keys The keys to insert on GPU-accessible memory with shape - * (n). - * @param values The values to insert on GPU-accessible memory with - * shape (n, DIM). - * @param metas The metas to insert on GPU-accessible memory with shape - * (n). - * @param metas The metas to insert on GPU-accessible memory with shape - * (n). - * @params evicted_keys The output of keys replaced with minimum meta. - * @params evicted_values The output of values replaced with minimum meta on - * keys. - * @params evicted_metas The output of metas replaced with minimum meta on - * keys. - * @parblock - * The metas should be a `uint64_t` value. You can specify a value that - * such as the timestamp of the key insertion, number of the key - * occurrences, or another value to perform a custom eviction strategy. - * - * The @p metas should be `nullptr`, when the LRU eviction strategy is - * applied. - * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. - * - * @param ignore_evict_strategy A boolean option indicating whether if - * the insert_or_assign ignores the evict strategy of table with current - * metas anyway. If true, it does not check whether the metas confroms to - * the evict strategy. If false, it requires the metas follow the evict - * strategy of table. - */ - size_type insert_and_evict(const size_type n, - const key_type* keys, // (n) - const value_type* values, // (n, DIM) - const meta_type* metas, // (n) - key_type* evicted_keys, // (n) - value_type* evicted_values, // (n, DIM) - meta_type* evicted_metas, // (n) - cudaStream_t stream = 0) { - if (n == 0) { - return 0; - } - - while (!reach_max_capacity_ && - fast_load_factor(n, stream) > options_.max_load_factor) { - reserve(capacity() * 2, stream); - } - - writer_shared_lock lock(mutex_); - - // TODO: Currently only need eviction when using HashTable as HBM cache. - if (!is_fast_mode()) { - throw std::runtime_error("Only allow insert_and_evict in pure HBM mode."); - } - - using Selector = - SelectUpsertAndEvictKernelWithIO; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - - // always use max tile to avoid data-deps as possible. 
- const int TILE_SIZE = 32; - size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; - const size_type dev_ws_size = - n_offsets * sizeof(int64_t) + n * sizeof(bool) + sizeof(size_type); - - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto d_offsets{dev_ws.get(0)}; - auto dn_evicted = reinterpret_cast(d_offsets + n_offsets); - auto d_masks = reinterpret_cast(dn_evicted + 1); - - CUDA_CHECK( - cudaMemsetAsync(d_offsets, 0, n_offsets * sizeof(int64_t), stream)); - CUDA_CHECK(cudaMemsetAsync(dn_evicted, 0, sizeof(size_type), stream)); - CUDA_CHECK(cudaMemsetAsync(d_masks, 0, n * sizeof(bool), stream)); - - size_type block_size = options_.block_size; - size_type grid_size = SAFE_GET_GRID_SIZE(n, block_size); - CUDA_CHECK(cudaMemsetAsync(evicted_keys, static_cast(EMPTY_KEY), - n * sizeof(K), stream)); - - Selector::execute_kernel( - load_factor, options_.block_size, options_.max_bucket_size, - table_->buckets_num, options_.dim, stream, n, d_table_, keys, values, - metas, evicted_keys, evicted_values, evicted_metas); - - keys_not_empty - <<>>(evicted_keys, d_masks, n); - size_type n_evicted = 0; - gpu_pick_kvm_inplace( - grid_size, block_size, d_masks, true, n, dn_evicted, d_offsets, evicted_keys, - evicted_values, evicted_metas, dim(), stream); - CUDA_CHECK(cudaMemcpyAsync(&n_evicted, dn_evicted, sizeof(size_type), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - CudaCheckError(); - return n_evicted; - } - - /** - * Searches for each key in @p keys in the hash table. - * If the key is found and the corresponding value in @p accum_or_assigns is - * `true`, the @p vectors_or_deltas is treated as a delta to the old - * value, and the delta is added to the old value of the key. - * - * If the key is not found and the corresponding value in @p accum_or_assigns - * is `false`, the @p vectors_or_deltas is treated as a new value and the - * key-value pair is updated in the table directly. - * - * @note When the key is found and the value of @p accum_or_assigns is - * `false`, or when the key is not found and the value of @p accum_or_assigns - * is `true`, nothing is changed and this operation is ignored. - * The algorithm assumes these situations occur while the key was modified or - * removed by other processes just now. - * - * @param n The number of key-value-meta tuples to process. - * @param keys The keys to insert on GPU-accessible memory with shape (n). - * @param value_or_deltas The values or deltas to insert on GPU-accessible - * memory with shape (n, DIM). - * @param accum_or_assigns The operation type with shape (n). A value of - * `true` indicates to accum and `false` indicates to assign. - * @param metas The metas to insert on GPU-accessible memory with shape (n). - * @parblock - * The metas should be a `uint64_t` value. You can specify a value that - * such as the timestamp of the key insertion, number of the key - * occurrences, or another value to perform a custom eviction strategy. - * - * The @p metas should be `nullptr`, when the LRU eviction strategy is - * applied. - * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. - * - * @param ignore_evict_strategy A boolean option indicating whether if - * the accum_or_assign ignores the evict strategy of table with current - * metas anyway. If true, it does not check whether the metas confroms to - * the evict strategy. If false, it requires the metas follow the evict - * strategy of table. 
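A minimal sketch of the `accum_or_assign` call whose contract is described above; `d_keys`, `d_deltas`, and `d_accum_flags` are hypothetical device buffers:

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

// d_accum_flags[i] == true  -> d_deltas[i] is added to the stored vector (if the key exists).
// d_accum_flags[i] == false -> d_deltas[i] is written as a fresh value (if the key is new).
void apply_updates(Table& table, size_t n,
                   const uint64_t* d_keys,     // (n)
                   const float* d_deltas,      // (n, dim)
                   const bool* d_accum_flags,  // (n)
                   cudaStream_t stream) {
  table.accum_or_assign(n, d_keys, d_deltas, d_accum_flags,
                        /*metas=*/nullptr, stream);
}
```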
- * - */ - void accum_or_assign(const size_type n, - const key_type* keys, // (n) - const value_type* value_or_deltas, // (n, DIM) - const bool* accum_or_assigns, // (n) - const meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0, - bool ignore_evict_strategy = false) { - if (n == 0) { - return; - } - - while (!reach_max_capacity_ && - fast_load_factor(n, stream) > options_.max_load_factor) { - reserve(capacity() * 2, stream); - } - - if (!ignore_evict_strategy) { - check_evict_strategy(metas); - } - - writer_shared_lock lock(mutex_); - - const size_type dev_ws_size{ - n * (sizeof(value_type*) + sizeof(int) + sizeof(bool))}; - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto dst{dev_ws.get(0)}; - auto src_offset{reinterpret_cast(dst + n)}; - auto founds{reinterpret_cast(src_offset + n)}; - - CUDA_CHECK(cudaMemsetAsync(dst, 0, dev_ws_size, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - accum_kernel - <<>>( - table_, keys, dst, metas, accum_or_assigns, table_->buckets, - table_->buckets_size, table_->bucket_max_size, - table_->buckets_num, src_offset, founds, N); - } - - if (!is_fast_mode()) { - thrust::device_ptr dst_ptr(reinterpret_cast(dst)); - thrust::device_ptr src_offset_ptr(src_offset); - - thrust::sort_by_key(thrust_par.on(stream), dst_ptr, dst_ptr + n, - src_offset_ptr, thrust::less()); - } - - { - const size_t block_size = options_.io_block_size; - const size_t N = n * dim(); - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - write_with_accum_kernel - <<>>(value_or_deltas, dst, - accum_or_assigns, founds, - src_offset, dim(), N); - } - - CudaCheckError(); - } - - /** - * @brief Searches the hash table for the specified keys. - * When a key is missing, the value in @p values and @p metas will be - * inserted. - * - * @param n The number of key-value-meta tuples to search or insert. - * @param keys The keys to search on GPU-accessible memory with shape (n). - * @param values The values to search on GPU-accessible memory with - * shape (n, DIM). - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * @param stream The CUDA stream that is used to execute the operation. 
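A sketch of the copying `find_or_insert` overload documented above. Note that `d_values` is both input (initial values for missing keys) and output (found values); the names are hypothetical:

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

// Missing keys are inserted with whatever d_values already holds, so the
// caller pre-fills d_values with the default/initial embedding.
void lookup_or_create(Table& table, size_t n,
                      const uint64_t* d_keys,  // (n)
                      float* d_values,         // (n, dim), in/out
                      cudaStream_t stream) {
  table.find_or_insert(n, d_keys, d_values, /*metas=*/nullptr, stream);
}
```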
- * - */ - void find_or_insert(const size_type n, const key_type* keys, // (n) - value_type* values, // (n * DIM) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0, - bool ignore_evict_strategy = false) { - if (n == 0) { - return; - } - - while (!reach_max_capacity_ && - fast_load_factor(n, stream) > options_.max_load_factor) { - reserve(capacity() * 2, stream); - } - - if (!ignore_evict_strategy) { - check_evict_strategy(metas); - } - - writer_shared_lock lock(mutex_); - - if (is_fast_mode()) { - using Selector = - SelectFindOrInsertKernelWithIO; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - Selector::execute_kernel(load_factor, options_.block_size, - options_.max_bucket_size, table_->buckets_num, - options_.dim, stream, n, d_table_, keys, values, - metas); - } else { - const size_type dev_ws_size{ - n * (sizeof(value_type*) + sizeof(int) + sizeof(bool))}; - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto d_table_value_addrs{dev_ws.get(0)}; - auto param_key_index{reinterpret_cast(d_table_value_addrs + n)}; - auto founds{reinterpret_cast(param_key_index + n)}; - - CUDA_CHECK(cudaMemsetAsync(d_table_value_addrs, 0, dev_ws_size, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - find_or_insert_kernel - <<>>( - d_table_, options_.max_bucket_size, table_->buckets_num, - options_.dim, keys, d_table_value_addrs, metas, founds, - param_key_index, N); - } - - { - thrust::device_ptr table_value_ptr( - reinterpret_cast(d_table_value_addrs)); - thrust::device_ptr param_key_index_ptr(param_key_index); - - thrust::sort_by_key(thrust_par.on(stream), table_value_ptr, - table_value_ptr + n, param_key_index_ptr, - thrust::less()); - } - - if (options_.io_by_cpu) { - const size_type host_ws_size{ - dev_ws_size + n * (sizeof(bool) + sizeof(value_type) * dim())}; - auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; - auto h_table_value_addrs{host_ws.get(0)}; - auto h_param_key_index{reinterpret_cast(h_table_value_addrs + n)}; - auto h_founds{reinterpret_cast(h_param_key_index + n)}; - auto h_param_values{reinterpret_cast(h_founds + n)}; - - CUDA_CHECK(cudaMemcpyAsync(h_table_value_addrs, d_table_value_addrs, - dev_ws_size, cudaMemcpyDeviceToHost, - stream)); - CUDA_CHECK(cudaMemcpyAsync(h_founds, founds, n * sizeof(bool), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(h_param_values, values, - n * sizeof(value_type) * dim(), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - read_or_write_by_cpu(h_table_value_addrs, h_param_values, - h_param_key_index, h_founds, dim(), n); - CUDA_CHECK(cudaMemcpyAsync(values, h_param_values, - n * sizeof(value_type) * dim(), - cudaMemcpyHostToDevice, stream)); - } else { - const size_t block_size = options_.io_block_size; - const size_t N = n * dim(); - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - read_or_write_kernel - <<>>( - d_table_value_addrs, values, founds, param_key_index, dim(), N); - } - } - - CudaCheckError(); - } - - /** - * @brief Searches the hash table for the specified keys and returns address - * of the values. When a key is missing, the value in @p values and @p metas - * will be inserted. 
- * - * @warning This API returns internal addresses for high-performance but - * thread-unsafe. The caller is responsible for guaranteeing data consistency. - * - * @param n The number of key-value-meta tuples to search or insert. - * @param keys The keys to search on GPU-accessible memory with shape (n). - * @param values The addresses of values to search on GPU-accessible memory - * with shape (n). - * @param founds The status that indicates if the keys are found on - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * @param stream The CUDA stream that is used to execute the operation. - * - */ - void find_or_insert(const size_type n, const key_type* keys, // (n) - value_type** values, // (n) - bool* founds, // (n) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0, - bool ignore_evict_strategy = false) { - if (n == 0) { - return; - } - - while (!reach_max_capacity_ && - fast_load_factor(n, stream) > options_.max_load_factor) { - reserve(capacity() * 2, stream); - } - - if (!ignore_evict_strategy) { - check_evict_strategy(metas); - } - - writer_shared_lock lock(mutex_); - - using Selector = - SelectFindOrInsertPtrKernel; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - Selector::execute_kernel(load_factor, options_.block_size, - options_.max_bucket_size, table_->buckets_num, - options_.dim, stream, n, d_table_, keys, values, - metas, founds); - - CudaCheckError(); - } - /** - * @brief Assign new key-value-meta tuples into the hash table. - * If the key doesn't exist, the operation on the key will be ignored. - * - * @param n Number of key-value-meta tuples to insert or assign. - * @param keys The keys to insert on GPU-accessible memory with shape - * (n). - * @param values The values to insert on GPU-accessible memory with - * shape (n, DIM). - * @param metas The metas to insert on GPU-accessible memory with shape - * (n). - * @parblock - * The metas should be a `uint64_t` value. You can specify a value that - * such as the timestamp of the key insertion, number of the key - * occurrences, or another value to perform a custom eviction strategy. - * - * The @p metas should be `nullptr`, when the LRU eviction strategy is - * applied. - * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. 
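A sketch of `assign`, which only touches keys that are already present (hypothetical buffer names):

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

// Unknown keys are silently ignored, so this never grows the table.
void refresh_existing(Table& table, size_t n,
                      const uint64_t* d_keys,  // (n)
                      const float* d_values,   // (n, dim)
                      cudaStream_t stream) {
  table.assign(n, d_keys, d_values, /*metas=*/nullptr, stream);
}
```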
- */ - void assign(const size_type n, - const key_type* keys, // (n) - const value_type* values, // (n, DIM) - const meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0) { - if (n == 0) { - return; - } - - writer_shared_lock lock(mutex_); - - if (is_fast_mode()) { - using Selector = - SelectUpdateKernelWithIO; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - - Selector::execute_kernel(load_factor, options_.block_size, - options_.max_bucket_size, table_->buckets_num, - options_.dim, stream, n, d_table_, keys, values, - metas); - } else { - const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto d_dst{dev_ws.get(0)}; - auto d_src_offset{reinterpret_cast(d_dst + n)}; - - CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - update_kernel - <<>>( - d_table_, options_.max_bucket_size, table_->buckets_num, - options_.dim, keys, d_dst, metas, d_src_offset, N); - } - - { - thrust::device_ptr d_dst_ptr( - reinterpret_cast(d_dst)); - thrust::device_ptr d_src_offset_ptr(d_src_offset); - - thrust::sort_by_key(thrust_par.on(stream), d_dst_ptr, d_dst_ptr + n, - d_src_offset_ptr, thrust::less()); - } - - if (options_.io_by_cpu) { - const size_type host_ws_size{dev_ws_size + - n * sizeof(value_type) * dim()}; - auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; - auto h_dst{host_ws.get(0)}; - auto h_src_offset{reinterpret_cast(h_dst + n)}; - auto h_values{reinterpret_cast(h_src_offset + n)}; - - CUDA_CHECK(cudaMemcpyAsync(h_dst, d_dst, dev_ws_size, - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaMemcpyAsync(h_values, values, host_ws_size - dev_ws_size, - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - write_by_cpu(h_dst, h_values, h_src_offset, dim(), n); - } else { - const size_t block_size = options_.io_block_size; - const size_t N = n * dim(); - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - write_kernel - <<>>(values, d_dst, d_src_offset, - dim(), N); - } - } - - CudaCheckError(); - } - - /** - * @brief Searches the hash table for the specified keys. - * - * @note When a key is missing, the value in @p values is not changed. - * - * @param n The number of key-value-meta tuples to search. - * @param keys The keys to search on GPU-accessible memory with shape (n). - * @param values The values to search on GPU-accessible memory with - * shape (n, DIM). - * @param founds The status that indicates if the keys are found on - * GPU-accessible memory with shape (n). - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * @param stream The CUDA stream that is used to execute the operation. 
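A sketch of the copying `find` overload documented above, including the `founds` mask (allocation sizes and names are hypothetical):

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

void lookup_batch(const Table& table, size_t n, size_t dim,
                  const uint64_t* d_keys, cudaStream_t stream) {
  float* d_values = nullptr;
  bool* d_founds = nullptr;
  cudaMallocAsync(&d_values, n * dim * sizeof(float), stream);
  cudaMallocAsync(&d_founds, n * sizeof(bool), stream);

  // d_values[i] is written only where d_founds[i] ends up true.
  table.find(n, d_keys, d_values, d_founds, /*metas=*/nullptr, stream);

  cudaFreeAsync(d_values, stream);
  cudaFreeAsync(d_founds, stream);
  cudaStreamSynchronize(stream);
}
```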
- * - */ - void find(const size_type n, const key_type* keys, // (n) - value_type* values, // (n, DIM) - bool* founds, // (n) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0) const { - if (n == 0) { - return; - } - - CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream)); - - reader_shared_lock lock(mutex_); - - if (is_fast_mode()) { - using Selector = - SelectLookupKernelWithIO; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - Selector::execute_kernel(load_factor, options_.block_size, - options_.max_bucket_size, table_->buckets_num, - options_.dim, stream, n, d_table_, keys, values, - metas, founds); - } else { - const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; - auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; - auto src{dev_ws.get(0)}; - auto dst_offset{reinterpret_cast(src + n)}; - - CUDA_CHECK(cudaMemsetAsync(src, 0, dev_ws_size, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - lookup_kernel - <<>>( - d_table_, options_.max_bucket_size, table_->buckets_num, - options_.dim, keys, src, metas, founds, dst_offset, N); - } - - { - thrust::device_ptr src_ptr( - reinterpret_cast(src)); - thrust::device_ptr dst_offset_ptr(dst_offset); - - thrust::sort_by_key(thrust_par.on(stream), src_ptr, src_ptr + n, - dst_offset_ptr, thrust::less()); - } - - { - const size_t block_size = options_.io_block_size; - const size_t N = n * dim(); - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - read_kernel - <<>>(src, values, founds, - dst_offset, dim(), N); - } - } - - CudaCheckError(); - } - - /** - * @brief Searches the hash table for the specified keys and returns address - * of the values. - * - * @note When a key is missing, the data in @p values won't change. - * @warning This API returns internal addresses for high-performance but - * thread-unsafe. The caller is responsible for guaranteeing data consistency. - * - * @param n The number of key-value-meta tuples to search. - * @param keys The keys to search on GPU-accessible memory with shape (n). - * @param values The addresses of values to search on GPU-accessible memory - * with shape (n). - * @param founds The status that indicates if the keys are found on - * GPU-accessible memory with shape (n). - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * @param stream The CUDA stream that is used to execute the operation. 
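And a sketch of the pointer-returning `find` overload described above, which avoids the (n, dim) copy at the cost of the thread-safety caveat in the warning (names are hypothetical):

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using Table = nv::merlin::HashTable<uint64_t, float, uint64_t>;

// d_value_ptrs[i] points into the table's own storage when d_founds[i] is
// true; dereference the pointers only while no writer can rehash or evict.
void lookup_addresses(const Table& table, size_t n,
                      const uint64_t* d_keys,  // (n)
                      float** d_value_ptrs,    // (n), device array of device pointers
                      bool* d_founds,          // (n)
                      cudaStream_t stream) {
  table.find(n, d_keys, d_value_ptrs, d_founds, /*metas=*/nullptr, stream);
}
```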
- * - */ - void find(const size_type n, const key_type* keys, // (n) - value_type** values, // (n) - bool* founds, // (n) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0) const { - if (n == 0) { - return; - } - - CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream)); - - reader_shared_lock lock(mutex_); - - using Selector = SelectLookupPtrKernel; - static thread_local int step_counter = 0; - static thread_local float load_factor = 0.0; - - if (((step_counter++) % kernel_select_interval_) == 0) { - load_factor = fast_load_factor(0, stream, false); - } - Selector::execute_kernel(load_factor, options_.block_size, - options_.max_bucket_size, table_->buckets_num, - options_.dim, stream, n, d_table_, keys, values, - metas, founds); - - CudaCheckError(); - } - - /** - * @brief Removes specified elements from the hash table. - * - * @param n The number of keys to remove. - * @param keys The keys to remove on GPU-accessible memory. - * @param stream The CUDA stream that is used to execute the operation. - * - */ - void erase(const size_type n, const key_type* keys, cudaStream_t stream = 0) { - if (n == 0) { - return; - } - - write_read_lock lock(mutex_); - - { - const size_t block_size = options_.block_size; - const size_t N = n * TILE_SIZE; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - remove_kernel - <<>>( - table_, keys, table_->buckets, table_->buckets_size, - table_->bucket_max_size, table_->buckets_num, N); - } - - CudaCheckError(); - return; - } - - /** - * @brief Erases all elements that satisfy the predicate @p pred from the - * hash table. - * - * The value for @p pred should be a function with type `Pred` defined like - * the following example: - * - * ``` - * template - * __forceinline__ __device__ bool erase_if_pred(const K& key, - * const M& meta, - * const K& pattern, - * const M& threshold) { - * return ((key & 0x1 == pattern) && (meta < threshold)); - * } - * ``` - * - * @param pred The predicate function with type Pred that returns `true` if - * the element should be erased. - * @param pattern The third user-defined argument to @p pred with key_type - * type. - * @param threshold The fourth user-defined argument to @p pred with meta_type - * type. - * @param stream The CUDA stream that is used to execute the operation. - * - * @return The number of elements removed. - * - */ - size_type erase_if(const Pred& pred, const key_type& pattern, - const meta_type& threshold, cudaStream_t stream = 0) { - write_read_lock lock(mutex_); - - auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)}; - auto d_count{dev_ws.get(0)}; - - CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(size_type), stream)); - - Pred h_pred; - CUDA_CHECK(cudaMemcpyFromSymbolAsync(&h_pred, pred, sizeof(Pred), 0, - cudaMemcpyDeviceToHost, stream)); - - { - const size_t block_size = options_.block_size; - const size_t N = table_->buckets_num; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - remove_kernel - <<>>( - table_, h_pred, pattern, threshold, d_count, table_->buckets, - table_->buckets_size, table_->bucket_max_size, - table_->buckets_num, N); - } - - size_type count = 0; - CUDA_CHECK(cudaMemcpyAsync(&count, d_count, sizeof(size_type), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - CudaCheckError(); - return count; - } - - /** - * @brief Removes all of the elements in the hash table with no release - * object. 
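A sketch of wiring up `erase_if` (documented just above `clear`): the predicate is a `__device__` function whose pointer lives in a `__device__` symbol, because the implementation fetches it with `cudaMemcpyFromSymbolAsync`. The staleness policy below is an illustrative assumption:

```cpp
#include <cuda_runtime.h>
#include "merlin_hashtable.cuh"

using K = uint64_t;
using M = uint64_t;
using Table = nv::merlin::HashTable<K, float, M>;

// Drop keys whose meta (used here as a last-touch timestamp) is older than
// the threshold; the pattern argument is unused by this policy.
__device__ bool stale_pred(const K& key, M& meta, const K& pattern,
                           const M& threshold) {
  return meta < threshold;
}

// The function pointer itself must be a device symbol.
__device__ Table::Pred d_stale_pred = stale_pred;

size_t evict_stale(Table& table, M threshold, cudaStream_t stream) {
  return table.erase_if(d_stale_pred, /*pattern=*/0, threshold, stream);
}
```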
- */ - void clear(cudaStream_t stream = 0) { - write_read_lock lock(mutex_); - - const size_t block_size = options_.block_size; - const size_t N = table_->buckets_num * table_->bucket_max_size; - const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); - - clear_kernel - <<>>(table_, N); - - CudaCheckError(); - } - - public: - /** - * @brief Exports a certain number of the key-value-meta tuples from the - * hash table. - * - * @param n The maximum number of exported pairs. - * @param offset The position of the key to remove. - * @param counter Accumulates amount of successfully exported values. - * @param keys The keys to dump from GPU-accessible memory with shape (n). - * @param values The values to dump from GPU-accessible memory with shape - * (n, DIM). - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. - * - * @return The number of elements dumped. - * - * @throw CudaException If the key-value size is too large for GPU shared - * memory. Reducing the value for @p n is currently required if this exception - * occurs. - */ - void export_batch(size_type n, const size_type offset, - size_type* counter, // (1) - key_type* keys, // (n) - value_type* values, // (n, DIM) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0) const { - reader_shared_lock lock(mutex_); - - if (offset >= table_->capacity) { - return; - } - n = std::min(table_->capacity - offset, n); - - size_type shared_size; - size_type block_size; - std::tie(shared_size, block_size) = - dump_kernel_shared_memory_size(shared_mem_size_); - - const size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size); - - dump_kernel - <<>>( - table_, keys, values, metas, offset, n, counter); - - CudaCheckError(); - } - - size_type export_batch(const size_type n, const size_type offset, - key_type* keys, // (n) - value_type* values, // (n, DIM) - meta_type* metas = nullptr, // (n) - cudaStream_t stream = 0) const { - auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)}; - auto d_counter{dev_ws.get(0)}; - - CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream)); - export_batch(n, offset, d_counter, keys, values, metas, stream); - - size_type counter = 0; - CUDA_CHECK(cudaMemcpyAsync(&counter, d_counter, sizeof(size_type), - cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - return counter; - } - - /** - * @brief Exports a certain number of the key-value-meta tuples which match - * specified condition from the hash table. - * - * @param n The maximum number of exported pairs. - * The value for @p pred should be a function with type `Pred` defined like - * the following example: - * - * ``` - * template - * __forceinline__ __device__ bool export_if_pred(const K& key, - * M& meta, - * const K& pattern, - * const M& threshold) { - * - * return meta > threshold; - * } - * ``` - * - * @param pred The predicate function with type Pred that returns `true` if - * the element should be exported. - * @param pattern The third user-defined argument to @p pred with key_type - * type. - * @param threshold The fourth user-defined argument to @p pred with meta_type - * type. - * @param offset The position of the key to remove. - * @param keys The keys to dump from GPU-accessible memory with shape (n). 
- * @param values The values to dump from GPU-accessible memory with shape - * (n, DIM). - * @param metas The metas to search on GPU-accessible memory with shape (n). - * @parblock - * If @p metas is `nullptr`, the meta for each key will not be returned. - * @endparblock - * - * @param stream The CUDA stream that is used to execute the operation. - * - * @return The number of elements dumped. - * - * @throw CudaException If the key-value size is too large for GPU shared - * memory. Reducing the value for @p n is currently required if this exception - * occurs. - */ - template