From 8a3e5f1a7af6c638397fcabf17bea9192bd799d2 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 12 Dec 2024 17:40:20 -0800
Subject: [PATCH] Remove cudf._lib.nvtext in favor of inlining pylibcudf
 (#17535)

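Delete the Cython wrapper modules under cudf/_lib/nvtext and the
cudf/_lib/strings package that re-exported them. Callers in cudf/core
now invoke the equivalent methods on the column instead.

Contributes to https://github.com/rapidsai/cudf/issues/17317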

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17535
---
 python/cudf/cudf/_lib/CMakeLists.txt          |   2 -
 python/cudf/cudf/_lib/__init__.py             |   1 -
 python/cudf/cudf/_lib/nvtext/CMakeLists.txt   |  24 --
 python/cudf/cudf/_lib/nvtext/__init__.pxd     |   0
 python/cudf/cudf/_lib/nvtext/__init__.py      |   0
 .../cudf/_lib/nvtext/byte_pair_encode.pyx     |  24 --
 .../cudf/cudf/_lib/nvtext/edit_distance.pyx   |  24 --
 .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx |  35 --
 python/cudf/cudf/_lib/nvtext/jaccard.pyx      |  17 -
 python/cudf/cudf/_lib/nvtext/minhash.pyx      |  35 --
 .../cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx |  24 --
 python/cudf/cudf/_lib/nvtext/normalize.pyx    |  28 --
 python/cudf/cudf/_lib/nvtext/replace.pyx      |  52 ---
 python/cudf/cudf/_lib/nvtext/stemmer.pyx      |  55 ---
 .../cudf/_lib/nvtext/subword_tokenize.pyx     |  38 --
 python/cudf/cudf/_lib/nvtext/tokenize.pyx     |  86 ----
 python/cudf/cudf/_lib/strings/__init__.pxd    |   0
 python/cudf/cudf/_lib/strings/__init__.py     |  30 --
 python/cudf/cudf/core/byte_pair_encoding.py   |  13 +-
 python/cudf/cudf/core/column/string.py        | 388 ++++++++++++++----
 python/cudf/cudf/core/subword_tokenizer.py    |   7 +-
 python/cudf/cudf/core/tokenize_vocabulary.py  |   9 +-
 22 files changed, 328 insertions(+), 564 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/nvtext/CMakeLists.txt
 delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.pxd
 delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.py
 delete mode 100644 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
 delete mode 100644 python/cudf/cudf/_lib/nvtext/edit_distance.pyx
 delete mode 100644 python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
 delete mode 100644 python/cudf/cudf/_lib/nvtext/jaccard.pyx
 delete mode 100644 python/cudf/cudf/_lib/nvtext/minhash.pyx
 delete mode 100644 python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
 delete mode 100644 python/cudf/cudf/_lib/nvtext/normalize.pyx
 delete mode 100644 python/cudf/cudf/_lib/nvtext/replace.pyx
 delete mode 100644 python/cudf/cudf/_lib/nvtext/stemmer.pyx
 delete mode 100644 python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
 delete mode 100644 python/cudf/cudf/_lib/nvtext/tokenize.pyx
 delete mode 100644 python/cudf/cudf/_lib/strings/__init__.pxd
 delete mode 100644 python/cudf/cudf/_lib/strings/__init__.py

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index f422635d22a..c2677c6d88d 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -30,5 +30,3 @@ target_include_directories(interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DI
 include(${rapids-cmake-dir}/export/find_package_root.cmake)
 include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
 target_link_libraries(interop PUBLIC nanoarrow)
-
-add_subdirectory(nvtext)
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index cfdcec4cd3b..f86a15b932b 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -6,7 +6,6 @@
     csv,
     groupby,
     interop,
-    nvtext,
     reduce,
     sort,
     stream_compaction,
diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
deleted file mode 100644
index 22ec5d472f2..00000000000
--- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-set(cython_sources
-    byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
-    ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
-)
-set(linked_libraries cudf::cudf)
-rapids_cython_create_modules(
-  CXX
-  SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf
-)
diff --git a/python/cudf/cudf/_lib/nvtext/__init__.pxd b/python/cudf/cudf/_lib/nvtext/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/nvtext/__init__.py b/python/cudf/cudf/_lib/nvtext/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
deleted file mode 100644
index 2b2762eead2..00000000000
--- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-from pylibcudf.nvtext.byte_pair_encode import BPEMergePairs  # no-cython-lint
-
-
-@acquire_spill_lock()
-def byte_pair_encoding(
-    Column strings,
-    object merge_pairs,
-    object separator
-):
-    return Column.from_pylibcudf(
-        nvtext.byte_pair_encode.byte_pair_encoding(
-            strings.to_pylibcudf(mode="read"),
-            merge_pairs,
-            separator.device_value.c_value
-        )
-    )
diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
deleted file mode 100644
index 3dd99c42d76..00000000000
--- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf cimport nvtext
-
-from cudf._lib.column cimport Column
-
-
-@acquire_spill_lock()
-def edit_distance(Column strings, Column targets):
-    result = nvtext.edit_distance.edit_distance(
-        strings.to_pylibcudf(mode="read"),
-        targets.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(result)
-
-
-@acquire_spill_lock()
-def edit_distance_matrix(Column strings):
-    result = nvtext.edit_distance.edit_distance_matrix(
-        strings.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
deleted file mode 100644
index 7fdf9258b7f..00000000000
--- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def generate_ngrams(Column strings, int ngrams, object py_separator):
-    result = nvtext.generate_ngrams.generate_ngrams(
-        strings.to_pylibcudf(mode="read"),
-        ngrams,
-        py_separator.device_value.c_value
-    )
-    return Column.from_pylibcudf(result)
-
-
-@acquire_spill_lock()
-def generate_character_ngrams(Column strings, int ngrams):
-    result = nvtext.generate_ngrams.generate_character_ngrams(
-        strings.to_pylibcudf(mode="read"),
-        ngrams
-    )
-    return Column.from_pylibcudf(result)
-
-
-@acquire_spill_lock()
-def hash_character_ngrams(Column strings, int ngrams):
-    result = nvtext.generate_ngrams.hash_character_ngrams(
-        strings.to_pylibcudf(mode="read"),
-        ngrams
-    )
-    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
deleted file mode 100644
index c964d0206b7..00000000000
--- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def jaccard_index(Column input1, Column input2, int width):
-    result = nvtext.jaccard.jaccard_index(
-        input1.to_pylibcudf(mode="read"),
-        input2.to_pylibcudf(mode="read"),
-        width,
-    )
-    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
deleted file mode 100644
index 9f2b3f92502..00000000000
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-from libc.stdint cimport uint32_t, uint64_t
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def minhash(Column input, uint32_t seed, Column a, Column b, int width):
-    return Column.from_pylibcudf(
-        nvtext.minhash.minhash(
-            input.to_pylibcudf(mode="read"),
-            seed,
-            a.to_pylibcudf(mode="read"),
-            b.to_pylibcudf(mode="read"),
-            width,
-        )
-    )
-
-
-@acquire_spill_lock()
-def minhash64(Column input, uint64_t seed, Column a, Column b, int width):
-    return Column.from_pylibcudf(
-        nvtext.minhash.minhash64(
-            input.to_pylibcudf(mode="read"),
-            seed,
-            a.to_pylibcudf(mode="read"),
-            b.to_pylibcudf(mode="read"),
-            width,
-        )
-    )
diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
deleted file mode 100644
index c125d92a24e..00000000000
--- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def ngrams_tokenize(
-    Column input,
-    int ngrams,
-    object py_delimiter,
-    object py_separator
-):
-    return Column.from_pylibcudf(
-        nvtext.ngrams_tokenize.ngrams_tokenize(
-            input.to_pylibcudf(mode="read"),
-            ngrams,
-            py_delimiter.device_value.c_value,
-            py_separator.device_value.c_value
-        )
-    )
diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx
deleted file mode 100644
index cc45123dd0a..00000000000
--- a/python/cudf/cudf/_lib/nvtext/normalize.pyx
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp cimport bool
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def normalize_spaces(Column input):
-    return Column.from_pylibcudf(
-        nvtext.normalize.normalize_spaces(
-            input.to_pylibcudf(mode="read")
-        )
-    )
-
-
-@acquire_spill_lock()
-def normalize_characters(Column input, bool do_lower=True):
-    return Column.from_pylibcudf(
-        nvtext.normalize.normalize_characters(
-            input.to_pylibcudf(mode="read"),
-            do_lower,
-        )
-    )
diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx
deleted file mode 100644
index bec56ade83c..00000000000
--- a/python/cudf/cudf/_lib/nvtext/replace.pyx
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def replace_tokens(Column strings,
-                   Column targets,
-                   Column replacements,
-                   object py_delimiter):
-    """
-    The `targets` tokens are searched for within each `strings`
-    in the Column and replaced with the corresponding `replacements`
-    if found. Tokens are identified by the `py_delimiter` character
-    provided.
-    """
-
-    return Column.from_pylibcudf(
-        nvtext.replace.replace_tokens(
-            strings.to_pylibcudf(mode="read"),
-            targets.to_pylibcudf(mode="read"),
-            replacements.to_pylibcudf(mode="read"),
-            py_delimiter.device_value.c_value,
-        )
-    )
-
-
-@acquire_spill_lock()
-def filter_tokens(Column strings,
-                  size_type min_token_length,
-                  object py_replacement,
-                  object py_delimiter):
-    """
-    Tokens smaller than `min_token_length` are removed from `strings`
-    in the Column and optionally replaced with the corresponding
-    `py_replacement` string. Tokens are identified by the `py_delimiter`
-    character provided.
-    """
-
-    return Column.from_pylibcudf(
-        nvtext.replace.filter_tokens(
-            strings.to_pylibcudf(mode="read"),
-            min_token_length,
-            py_replacement.device_value.c_value,
-            py_delimiter.device_value.c_value,
-        )
-    )
diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx
deleted file mode 100644
index 63a389b64d5..00000000000
--- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from enum import IntEnum
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf.libcudf.nvtext.stemmer cimport (
-    letter_type,
-    underlying_type_t_letter_type,
-)
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-class LetterType(IntEnum):
-    CONSONANT = <underlying_type_t_letter_type> letter_type.CONSONANT
-    VOWEL = <underlying_type_t_letter_type> letter_type.VOWEL
-
-
-@acquire_spill_lock()
-def porter_stemmer_measure(Column strings):
-    return Column.from_pylibcudf(
-        nvtext.stemmer.porter_stemmer_measure(
-            strings.to_pylibcudf(mode="read"),
-        )
-    )
-
-
-@acquire_spill_lock()
-def is_letter(Column strings,
-              object ltype,
-              size_type index):
-    return Column.from_pylibcudf(
-        nvtext.stemmer.is_letter(
-            strings.to_pylibcudf(mode="read"),
-            ltype==LetterType.VOWEL,
-            index,
-        )
-    )
-
-
-@acquire_spill_lock()
-def is_letter_multi(Column strings,
-                    object ltype,
-                    Column indices):
-    return Column.from_pylibcudf(
-        nvtext.stemmer.is_letter(
-            strings.to_pylibcudf(mode="read"),
-            ltype==LetterType.VOWEL,
-            indices.to_pylibcudf(mode="read"),
-        )
-    )
diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
deleted file mode 100644
index 5e0bfb74705..00000000000
--- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libc.stdint cimport uint32_t
-
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp cimport bool
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def subword_tokenize_inmem_hash(
-    Column strings,
-    object hashed_vocabulary,
-    uint32_t max_sequence_length=64,
-    uint32_t stride=48,
-    bool do_lower=True,
-    bool do_truncate=False,
-):
-    """
-    Subword tokenizes text series by using the pre-loaded hashed vocabulary
-    """
-    result = nvtext.subword_tokenize.subword_tokenize(
-        strings.to_pylibcudf(mode="read"),
-        hashed_vocabulary,
-        max_sequence_length,
-        stride,
-        do_lower,
-        do_truncate,
-    )
-    # return the 3 tensor components
-    tokens = Column.from_pylibcudf(result[0])
-    masks = Column.from_pylibcudf(result[1])
-    metadata = Column.from_pylibcudf(result[2])
-    return tokens, masks, metadata
diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx
deleted file mode 100644
index f473c48e2f7..00000000000
--- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf.libcudf.types cimport size_type
-
-from pylibcudf.nvtext.tokenize import TokenizeVocabulary  # no-cython-lint
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def _tokenize_scalar(Column strings, object py_delimiter):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.tokenize_scalar(
-            strings.to_pylibcudf(mode="read"),
-            py_delimiter.device_value.c_value
-        )
-    )
-
-
-@acquire_spill_lock()
-def _tokenize_column(Column strings, Column delimiters):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.tokenize_column(
-            strings.to_pylibcudf(mode="read"),
-            delimiters.to_pylibcudf(mode="read"),
-        )
-    )
-
-
-@acquire_spill_lock()
-def _count_tokens_scalar(Column strings, object py_delimiter):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.count_tokens_scalar(
-            strings.to_pylibcudf(mode="read"),
-            py_delimiter.device_value.c_value
-        )
-    )
-
-
-@acquire_spill_lock()
-def _count_tokens_column(Column strings, Column delimiters):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.count_tokens_column(
-            strings.to_pylibcudf(mode="read"),
-            delimiters.to_pylibcudf(mode="read")
-        )
-    )
-
-
-@acquire_spill_lock()
-def character_tokenize(Column strings):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.character_tokenize(
-            strings.to_pylibcudf(mode="read")
-        )
-    )
-
-
-@acquire_spill_lock()
-def detokenize(Column strings, Column indices, object py_separator):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.detokenize(
-            strings.to_pylibcudf(mode="read"),
-            indices.to_pylibcudf(mode="read"),
-            py_separator.device_value.c_value
-        )
-    )
-
-
-@acquire_spill_lock()
-def tokenize_with_vocabulary(Column strings,
-                             object vocabulary,
-                             object py_delimiter,
-                             size_type default_id):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.tokenize_with_vocabulary(
-            strings.to_pylibcudf(mode="read"),
-            vocabulary,
-            py_delimiter.device_value.c_value,
-            default_id
-        )
-    )
diff --git a/python/cudf/cudf/_lib/strings/__init__.pxd b/python/cudf/cudf/_lib/strings/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
deleted file mode 100644
index b9095a22a42..00000000000
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix
-from cudf._lib.nvtext.generate_ngrams import (
-    generate_character_ngrams,
-    generate_ngrams,
-    hash_character_ngrams,
-)
-from cudf._lib.nvtext.jaccard import jaccard_index
-from cudf._lib.nvtext.minhash import (
-    minhash,
-    minhash64,
-)
-from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize
-from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces
-from cudf._lib.nvtext.replace import filter_tokens, replace_tokens
-from cudf._lib.nvtext.stemmer import (
-    LetterType,
-    is_letter,
-    is_letter_multi,
-    porter_stemmer_measure,
-)
-from cudf._lib.nvtext.tokenize import (
-    _count_tokens_column,
-    _count_tokens_scalar,
-    _tokenize_column,
-    _tokenize_scalar,
-    character_tokenize,
-    detokenize,
-    tokenize_with_vocabulary,
-)
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
index 8d38a5f2272..b49f5154697 100644
--- a/python/cudf/cudf/core/byte_pair_encoding.py
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -5,9 +5,6 @@
 import pylibcudf as plc
 
 import cudf
-from cudf._lib.nvtext.byte_pair_encode import (
-    byte_pair_encoding as cpp_byte_pair_encoding,
-)
 
 
 class BytePairEncoder:
@@ -25,12 +22,12 @@ class BytePairEncoder:
     BytePairEncoder
     """
 
-    def __init__(self, merges_pair: "cudf.Series"):
+    def __init__(self, merges_pair: cudf.Series) -> None:
         self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs(
             merges_pair._column.to_pylibcudf(mode="read")
         )
 
-    def __call__(self, text, separator: str = " ") -> cudf.Series:
+    def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series:
         """
 
         Parameters
@@ -57,6 +54,6 @@ def __call__(self, text, separator: str = " ") -> cudf.Series:
         dtype: object
         """
         sep = cudf.Scalar(separator, dtype="str")
-        result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep)
-
-        return cudf.Series._from_column(result)
+        return cudf.Series._from_column(
+            text._column.byte_pair_encoding(self.merge_pairs, sep)
+        )
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 06196717ce3..c021554f3bd 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -20,7 +20,7 @@
 import cudf.core.column.column as column
 import cudf.core.column.datetime as datetime
 from cudf import _lib as libcudf
-from cudf._lib import string_casting as str_cast, strings as libstrings
+from cudf._lib import string_casting as str_cast
 from cudf._lib.column import Column
 from cudf._lib.types import size_type_dtype
 from cudf.api.types import is_integer, is_scalar, is_string_dtype
@@ -45,6 +45,7 @@
         SeriesOrIndex,
     )
     from cudf.core.buffer import Buffer
+    from cudf.core.column.lists import ListColumn
     from cudf.core.column.numerical import NumericalColumn
 
 
@@ -624,7 +625,7 @@ def join(
 
     def _split_by_character(self):
         col = self._column.fillna("")  # sanitize nulls
-        result_col = libstrings.character_tokenize(col)
+        result_col = col.character_tokenize()
 
         offset_col = col.children[0]
 
@@ -4693,9 +4694,7 @@ def normalize_spaces(self) -> SeriesOrIndex:
         1    test string
         dtype: object
         """
-        return self._return_or_inplace(
-            libstrings.normalize_spaces(self._column)
-        )
+        return self._return_or_inplace(self._column.normalize_spaces())
 
     def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex:
         r"""
@@ -4743,7 +4742,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex:
         dtype: object
         """
         return self._return_or_inplace(
-            libstrings.normalize_characters(self._column, do_lower)
+            self._column.normalize_characters(do_lower)
         )
 
     def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
@@ -4775,16 +4774,16 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
         2    goodbye
         dtype: object
         """
-        delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
+        delim = _massage_string_arg(delimiter, "delimiter", allow_col=True)
 
-        if isinstance(delimiter, Column):
+        if isinstance(delim, Column):
             result = self._return_or_inplace(
-                libstrings._tokenize_column(self._column, delimiter),
+                self._column.tokenize_column(delim),
                 retain_index=False,
             )
-        elif isinstance(delimiter, cudf.Scalar):
+        elif isinstance(delim, cudf.Scalar):
             result = self._return_or_inplace(
-                libstrings._tokenize_scalar(self._column, delimiter),
+                self._column.tokenize_scalar(delim),
                 retain_index=False,
             )
         else:
@@ -4799,7 +4798,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
         return result
 
     def detokenize(
-        self, indices: "cudf.Series", separator: str = " "
+        self, indices: cudf.Series, separator: str = " "
     ) -> SeriesOrIndex:
         """
         Combines tokens into strings by concatenating them in the order
@@ -4829,9 +4828,9 @@ def detokenize(
         2          three
         dtype: object
         """
-        separator = _massage_string_arg(separator, "separator")
+        sep = _massage_string_arg(separator, "separator")
         return self._return_or_inplace(
-            libstrings.detokenize(self._column, indices._column, separator),
+            self._column.detokenize(indices._column, sep),  # type: ignore[arg-type]
             retain_index=False,
         )
 
@@ -4882,17 +4881,15 @@ def character_tokenize(self) -> SeriesOrIndex:
         2    .
         dtype: object
         """
-        result_col = libstrings.character_tokenize(self._column)
+        result_col = self._column.character_tokenize()
         if isinstance(self._parent, cudf.Series):
             lengths = self.len().fillna(0)
             index = self._parent.index.repeat(lengths)
-            return cudf.Series._from_column(
+            return type(self._parent)._from_column(
                 result_col, name=self._parent.name, index=index
             )
-        elif isinstance(self._parent, cudf.BaseIndex):
-            return cudf.Index._from_column(result_col, name=self._parent.name)
         else:
-            return result_col
+            return self._return_or_inplace(result_col)
 
     def token_count(self, delimiter: str = " ") -> SeriesOrIndex:
         """
@@ -4919,15 +4916,15 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex:
         2    0
         dtype: int32
         """
-        delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
-        if isinstance(delimiter, Column):
+        delim = _massage_string_arg(delimiter, "delimiter", allow_col=True)
+        if isinstance(delim, Column):
             return self._return_or_inplace(
-                libstrings._count_tokens_column(self._column, delimiter)
+                self._column.count_tokens_column(delim)
             )
 
-        elif isinstance(delimiter, cudf.Scalar):
+        elif isinstance(delim, cudf.Scalar):
             return self._return_or_inplace(
-                libstrings._count_tokens_scalar(self._column, delimiter)
+                self._column.count_tokens_scalar(delim)  # type: ignore[arg-type]
             )
         else:
             raise TypeError(
@@ -4966,9 +4963,9 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex:
         2    xyz_hhh
         dtype: object
         """
-        separator = _massage_string_arg(separator, "separator")
+        sep = _massage_string_arg(separator, "separator")
         return self._return_or_inplace(
-            libstrings.generate_ngrams(self._column, n, separator),
+            self._column.generate_ngrams(n, sep),  # type: ignore[arg-type]
             retain_index=False,
         )
 
@@ -5015,7 +5012,7 @@ def character_ngrams(
         dtype: list
         """
         result = self._return_or_inplace(
-            libstrings.generate_character_ngrams(self._column, n),
+            self._column.generate_character_ngrams(n),
             retain_index=True,
         )
         if isinstance(result, cudf.Series) and not as_list:
@@ -5060,7 +5057,7 @@ def hash_character_ngrams(
         """
 
         result = self._return_or_inplace(
-            libstrings.hash_character_ngrams(self._column, n),
+            self._column.hash_character_ngrams(n),
             retain_index=True,
         )
         if isinstance(result, cudf.Series) and not as_list:
@@ -5098,10 +5095,10 @@ def ngrams_tokenize(
         2    best_book
         dtype: object
         """
-        delimiter = _massage_string_arg(delimiter, "delimiter")
-        separator = _massage_string_arg(separator, "separator")
+        delim = _massage_string_arg(delimiter, "delimiter")
+        sep = _massage_string_arg(separator, "separator")
         return self._return_or_inplace(
-            libstrings.ngrams_tokenize(self._column, n, delimiter, separator),
+            self._column.ngrams_tokenize(n, delim, sep),  # type: ignore[arg-type]
             retain_index=False,
         )
 
@@ -5180,10 +5177,9 @@ def replace_tokens(
             )
 
         return self._return_or_inplace(
-            libstrings.replace_tokens(
-                self._column,
-                targets_column,
-                replacements_column,
+            self._column.replace_tokens(
+                targets_column,  # type: ignore[arg-type]
+                replacements_column,  # type: ignore[arg-type]
                 cudf.Scalar(delimiter, dtype="str"),
             ),
         )
@@ -5251,8 +5247,7 @@ def filter_tokens(
             )
 
         return self._return_or_inplace(
-            libstrings.filter_tokens(
-                self._column,
+            self._column.filter_tokens(
                 min_token_length,
                 cudf.Scalar(replacement, dtype="str"),
                 cudf.Scalar(delimiter, dtype="str"),
@@ -5278,9 +5273,7 @@ def porter_stemmer_measure(self) -> SeriesOrIndex:
         1    2
         dtype: int32
         """
-        return self._return_or_inplace(
-            libstrings.porter_stemmer_measure(self._column)
-        )
+        return self._return_or_inplace(self._column.porter_stemmer_measure())
 
     def is_consonant(self, position) -> SeriesOrIndex:
         """
@@ -5313,17 +5306,10 @@ def is_consonant(self, position) -> SeriesOrIndex:
         1    False
         dtype: bool
         """
-        ltype = libstrings.LetterType.CONSONANT
-
         if can_convert_to_column(position):
-            return self._return_or_inplace(
-                libstrings.is_letter_multi(
-                    self._column, ltype, column.as_column(position)
-                ),
-            )
-
+            position = column.as_column(position)
         return self._return_or_inplace(
-            libstrings.is_letter(self._column, ltype, position)
+            self._column.is_letter(False, position)  # type: ignore[arg-type]
         )
 
     def is_vowel(self, position) -> SeriesOrIndex:
@@ -5357,17 +5343,10 @@ def is_vowel(self, position) -> SeriesOrIndex:
         1     True
         dtype: bool
         """
-        ltype = libstrings.LetterType.VOWEL
-
         if can_convert_to_column(position):
-            return self._return_or_inplace(
-                libstrings.is_letter_multi(
-                    self._column, ltype, column.as_column(position)
-                ),
-            )
-
+            position = column.as_column(position)
         return self._return_or_inplace(
-            libstrings.is_letter(self._column, ltype, position)
+            self._column.is_letter(True, position)  # type: ignore[arg-type]
         )
 
     def edit_distance(self, targets) -> SeriesOrIndex:
@@ -5416,7 +5395,7 @@ def edit_distance(self, targets) -> SeriesOrIndex:
             )
 
         return self._return_or_inplace(
-            libstrings.edit_distance(self._column, targets_column)
+            self._column.edit_distance(targets_column)  # type: ignore[arg-type]
         )
 
     def edit_distance_matrix(self) -> SeriesOrIndex:
@@ -5456,9 +5435,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex:
                 "Cannot compute edit distance between null strings. "
                 "Consider removing them using `dropna` or fill with `fillna`."
             )
-        return self._return_or_inplace(
-            libstrings.edit_distance_matrix(self._column)
-        )
+        return self._return_or_inplace(self._column.edit_distance_matrix())
 
     def minhash(
         self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int
@@ -5508,7 +5485,7 @@ def minhash(
                 f"Expecting a Series with dtype uint32, got {type(b)}"
             )
         return self._return_or_inplace(
-            libstrings.minhash(self._column, seed, a_column, b_column, width)
+            self._column.minhash(seed, a_column, b_column, width)  # type: ignore[arg-type]
         )
 
     def minhash64(
@@ -5559,7 +5536,7 @@ def minhash64(
                 f"Expecting a Series with dtype uint64, got {type(b)}"
             )
         return self._return_or_inplace(
-            libstrings.minhash64(self._column, seed, a_column, b_column, width)
+            self._column.minhash64(seed, a_column, b_column, width)  # type: ignore[arg-type]
         )
 
     def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex:
@@ -5585,13 +5562,14 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex:
         1    0.307692
         dtype: float32
         """
-
         return self._return_or_inplace(
-            libstrings.jaccard_index(self._column, input._column, width),
+            self._column.jaccard_index(input._column, width)
         )
 
 
-def _massage_string_arg(value, name, allow_col=False):
+def _massage_string_arg(
+    value, name, allow_col: bool = False
+) -> StringColumn | cudf.Scalar:
     if isinstance(value, cudf.Scalar):
         return value
 
@@ -5602,9 +5580,9 @@ def _massage_string_arg(value, name, allow_col=False):
 
     if allow_col:
         if isinstance(value, list):
-            return column.as_column(value, dtype="str")
+            return column.as_column(value, dtype="str")  # type: ignore[return-value]
 
-        if isinstance(value, Column) and is_string_dtype(value.dtype):
+        if isinstance(value, StringColumn):
             return value
 
         allowed_types.append("Column")
@@ -6148,6 +6126,312 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase":
 
         return to_view.view(dtype)
 
+    @acquire_spill_lock()
+    def minhash(
+        self,
+        seed: np.uint32,
+        a: NumericalColumn,
+        b: NumericalColumn,
+        width: int,
+    ) -> ListColumn:
+        """Wrap ``plc.nvtext.minhash.minhash`` for this column."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.minhash.minhash(
+                self.to_pylibcudf(mode="read"),
+                seed,
+                a.to_pylibcudf(mode="read"),
+                b.to_pylibcudf(mode="read"),
+                width,
+            )
+        )
+
+    @acquire_spill_lock()
+    def minhash64(
+        self,
+        seed: np.uint64,
+        a: NumericalColumn,
+        b: NumericalColumn,
+        width: int,
+    ) -> ListColumn:
+        """Wrap ``plc.nvtext.minhash.minhash64`` for this column."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.minhash.minhash64(
+                self.to_pylibcudf(mode="read"),
+                seed,
+                a.to_pylibcudf(mode="read"),
+                b.to_pylibcudf(mode="read"),
+                width,
+            )
+        )
+
+    @acquire_spill_lock()
+    def jaccard_index(self, other: Self, width: int) -> NumericalColumn:
+        """Wrap ``plc.nvtext.jaccard.jaccard_index`` against ``other``."""
+        result = plc.nvtext.jaccard.jaccard_index(
+            self.to_pylibcudf(mode="read"),
+            other.to_pylibcudf(mode="read"),
+            width,
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self:
+        """Wrap ``plc.nvtext.generate_ngrams.generate_ngrams``."""
+        result = plc.nvtext.generate_ngrams.generate_ngrams(
+            self.to_pylibcudf(mode="read"),
+            ngrams,
+            separator.device_value.c_value,
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def generate_character_ngrams(self, ngrams: int) -> ListColumn:
+        """Wrap ``plc.nvtext.generate_ngrams.generate_character_ngrams``."""
+        result = plc.nvtext.generate_ngrams.generate_character_ngrams(
+            self.to_pylibcudf(mode="read"), ngrams
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def hash_character_ngrams(self, ngrams: int) -> ListColumn:
+        """Wrap ``plc.nvtext.generate_ngrams.hash_character_ngrams``."""
+        result = plc.nvtext.generate_ngrams.hash_character_ngrams(
+            self.to_pylibcudf(mode="read"), ngrams
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def edit_distance(self, targets: Self) -> NumericalColumn:
+        """Wrap ``plc.nvtext.edit_distance.edit_distance`` vs ``targets``."""
+        result = plc.nvtext.edit_distance.edit_distance(
+            self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def edit_distance_matrix(self) -> ListColumn:
+        """Wrap ``plc.nvtext.edit_distance.edit_distance_matrix``."""
+        result = plc.nvtext.edit_distance.edit_distance_matrix(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def byte_pair_encoding(
+        self,
+        merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs,
+        separator: cudf.Scalar,
+    ) -> Self:
+        """Wrap ``plc.nvtext.byte_pair_encode.byte_pair_encoding``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.byte_pair_encode.byte_pair_encoding(
+                self.to_pylibcudf(mode="read"),
+                merge_pairs,
+                separator.device_value.c_value,
+            )
+        )
+
+    @acquire_spill_lock()
+    def ngrams_tokenize(
+        self,
+        ngrams: int,
+        delimiter: cudf.Scalar,
+        separator: cudf.Scalar,
+    ) -> Self:
+        """Wrap ``plc.nvtext.ngrams_tokenize.ngrams_tokenize``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.ngrams_tokenize.ngrams_tokenize(
+                self.to_pylibcudf(mode="read"),
+                ngrams,
+                delimiter.device_value.c_value,
+                separator.device_value.c_value,
+            )
+        )
+
+    @acquire_spill_lock()
+    def normalize_spaces(self) -> Self:
+        """Wrap ``plc.nvtext.normalize.normalize_spaces``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.normalize.normalize_spaces(
+                self.to_pylibcudf(mode="read")
+            )
+        )
+
+    @acquire_spill_lock()
+    def normalize_characters(self, do_lower: bool = True) -> Self:
+        """Wrap ``plc.nvtext.normalize.normalize_characters``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.normalize.normalize_characters(
+                self.to_pylibcudf(mode="read"),
+                do_lower,
+            )
+        )
+
+    @acquire_spill_lock()
+    def replace_tokens(
+        self, targets: Self, replacements: Self, delimiter: cudf.Scalar
+    ) -> Self:
+        """Wrap ``plc.nvtext.replace.replace_tokens``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.replace.replace_tokens(
+                self.to_pylibcudf(mode="read"),
+                targets.to_pylibcudf(mode="read"),
+                replacements.to_pylibcudf(mode="read"),
+                delimiter.device_value.c_value,
+            )
+        )
+
+    @acquire_spill_lock()
+    def filter_tokens(
+        self,
+        min_token_length: int,
+        replacement: cudf.Scalar,
+        delimiter: cudf.Scalar,
+    ) -> Self:
+        """Wrap ``plc.nvtext.replace.filter_tokens``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.replace.filter_tokens(
+                self.to_pylibcudf(mode="read"),
+                min_token_length,
+                replacement.device_value.c_value,
+                delimiter.device_value.c_value,
+            )
+        )
+
+    @acquire_spill_lock()
+    def porter_stemmer_measure(self) -> NumericalColumn:
+        """Wrap ``plc.nvtext.stemmer.porter_stemmer_measure``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.stemmer.porter_stemmer_measure(
+                self.to_pylibcudf(mode="read")
+            )
+        )
+
+    @acquire_spill_lock()
+    def is_letter(
+        self, is_vowel: bool, index: int | NumericalColumn
+    ) -> NumericalColumn:
+        """
+        Wrap ``plc.nvtext.stemmer.is_letter``.
+
+        ``index`` is forwarded unchanged when it is an ``int``; otherwise
+        it is converted with ``to_pylibcudf(mode="read")`` first.
+        """
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.stemmer.is_letter(
+                self.to_pylibcudf(mode="read"),
+                is_vowel,
+                index
+                if isinstance(index, int)
+                else index.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def subword_tokenize(
+        self,
+        hashed_vocabulary: plc.nvtext.subword_tokenize.HashedVocabulary,
+        max_sequence_length: int = 64,
+        stride: int = 48,
+        do_lower: bool = True,
+        do_truncate: bool = False,
+    ) -> tuple[ColumnBase, ColumnBase, ColumnBase]:
+        """
+        Subword-tokenize this column using a pre-loaded hashed vocabulary.
+
+        Returns the three columns produced by
+        ``plc.nvtext.subword_tokenize.subword_tokenize`` as
+        ``(tokens, masks, metadata)``.
+        """
+        result = plc.nvtext.subword_tokenize.subword_tokenize(
+            self.to_pylibcudf(mode="read"),
+            hashed_vocabulary,
+            max_sequence_length,
+            stride,
+            do_lower,
+            do_truncate,
+        )
+        # return the 3 tensor components
+        tokens = type(self).from_pylibcudf(result[0])
+        masks = type(self).from_pylibcudf(result[1])
+        metadata = type(self).from_pylibcudf(result[2])
+        return tokens, masks, metadata
+
+    @acquire_spill_lock()
+    def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self:
+        """Wrap ``plc.nvtext.tokenize.tokenize_scalar``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.tokenize_scalar(
+                self.to_pylibcudf(mode="read"), delimiter.device_value.c_value
+            )
+        )
+
+    @acquire_spill_lock()
+    def tokenize_column(self, delimiters: Self) -> Self:
+        """Wrap ``plc.nvtext.tokenize.tokenize_column``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.tokenize_column(
+                self.to_pylibcudf(mode="read"),
+                delimiters.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn:
+        """Wrap ``plc.nvtext.tokenize.count_tokens_scalar``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.count_tokens_scalar(
+                self.to_pylibcudf(mode="read"), delimiter.device_value.c_value
+            )
+        )
+
+    @acquire_spill_lock()
+    def count_tokens_column(self, delimiters: Self) -> NumericalColumn:
+        """Wrap ``plc.nvtext.tokenize.count_tokens_column``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.count_tokens_column(
+                self.to_pylibcudf(mode="read"),
+                delimiters.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def character_tokenize(self) -> Self:
+        """Wrap ``plc.nvtext.tokenize.character_tokenize``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.character_tokenize(
+                self.to_pylibcudf(mode="read")
+            )
+        )
+
+    @acquire_spill_lock()
+    def tokenize_with_vocabulary(
+        self,
+        vocabulary: plc.nvtext.tokenize.TokenizeVocabulary,
+        delimiter: cudf.Scalar,
+        default_id: int,
+    ) -> Self:
+        """Wrap ``plc.nvtext.tokenize.tokenize_with_vocabulary``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.tokenize_with_vocabulary(
+                self.to_pylibcudf(mode="read"),
+                vocabulary,
+                delimiter.device_value.c_value,
+                default_id,
+            )
+        )
+
+    @acquire_spill_lock()
+    def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self:
+        """Wrap ``plc.nvtext.tokenize.detokenize`` using ``indices``."""
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.detokenize(
+                self.to_pylibcudf(mode="read"),
+                indices.to_pylibcudf(mode="read"),
+                separator.device_value.c_value,
+            )
+        )
+
     def _modify_characters(
         self, method: Callable[[plc.Column], plc.Column]
     ) -> Self:
diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py
index dda1f199078..479838ef2a8 100644
--- a/python/cudf/cudf/core/subword_tokenizer.py
+++ b/python/cudf/cudf/core/subword_tokenizer.py
@@ -8,10 +8,6 @@
 
 import pylibcudf as plc
 
-from cudf._lib.nvtext.subword_tokenize import (
-    subword_tokenize_inmem_hash as cpp_subword_tokenize,
-)
-
 
 def _cast_to_appropriate_type(ar, cast_type):
     if cast_type == "cp":
@@ -210,8 +206,7 @@ def __call__(
         stride = max_length - stride
         # behavior varies from subword_tokenize but maps with huggingface
 
-        input_ids, attention_mask, metadata = cpp_subword_tokenize(
-            text._column,
+        input_ids, attention_mask, metadata = text._column.subword_tokenize(
             self.vocab_file,
             max_sequence_length=max_length,
             stride=stride,
diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py
index 1e31376cce8..fb8b9b3131c 100644
--- a/python/cudf/cudf/core/tokenize_vocabulary.py
+++ b/python/cudf/cudf/core/tokenize_vocabulary.py
@@ -5,9 +5,6 @@
 import pylibcudf as plc
 
 import cudf
-from cudf._lib.nvtext.tokenize import (
-    tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
-)
 
 
 class TokenizeVocabulary:
@@ -20,7 +17,7 @@ class TokenizeVocabulary:
         Strings column of vocabulary terms
     """
 
-    def __init__(self, vocabulary: "cudf.Series"):
+    def __init__(self, vocabulary: cudf.Series) -> None:
         self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary(
             vocabulary._column.to_pylibcudf(mode="read")
         )
@@ -46,8 +43,8 @@ def tokenize(
         if delimiter is None:
             delimiter = ""
         delim = cudf.Scalar(delimiter, dtype="str")
-        result = cpp_tokenize_with_vocabulary(
-            text._column, self.vocabulary, delim, default_id
+        result = text._column.tokenize_with_vocabulary(
+            self.vocabulary, delim, default_id
         )
 
         return cudf.Series._from_column(result)