From 8a3e5f1a7af6c638397fcabf17bea9192bd799d2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:40:20 -0800 Subject: [PATCH] Remove cudf._lib.nvtext in favor of inlining pylibcudf (#17535) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17535 --- python/cudf/cudf/_lib/CMakeLists.txt | 2 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 24 -- python/cudf/cudf/_lib/nvtext/__init__.pxd | 0 python/cudf/cudf/_lib/nvtext/__init__.py | 0 .../cudf/_lib/nvtext/byte_pair_encode.pyx | 24 -- .../cudf/cudf/_lib/nvtext/edit_distance.pyx | 24 -- .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 35 -- python/cudf/cudf/_lib/nvtext/jaccard.pyx | 17 - python/cudf/cudf/_lib/nvtext/minhash.pyx | 35 -- .../cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx | 24 -- python/cudf/cudf/_lib/nvtext/normalize.pyx | 28 -- python/cudf/cudf/_lib/nvtext/replace.pyx | 52 --- python/cudf/cudf/_lib/nvtext/stemmer.pyx | 55 --- .../cudf/_lib/nvtext/subword_tokenize.pyx | 38 -- python/cudf/cudf/_lib/nvtext/tokenize.pyx | 86 ---- python/cudf/cudf/_lib/strings/__init__.pxd | 0 python/cudf/cudf/_lib/strings/__init__.py | 30 -- python/cudf/cudf/core/byte_pair_encoding.py | 13 +- python/cudf/cudf/core/column/string.py | 388 ++++++++++++++---- python/cudf/cudf/core/subword_tokenizer.py | 7 +- python/cudf/cudf/core/tokenize_vocabulary.py | 9 +- 22 files changed, 328 insertions(+), 564 deletions(-) delete mode 100644 python/cudf/cudf/_lib/nvtext/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.py delete mode 100644 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/edit_distance.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/jaccard.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/minhash.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/normalize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/replace.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/stemmer.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/strings/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/__init__.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index f422635d22a..c2677c6d88d 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -30,5 +30,3 @@ target_include_directories(interop PUBLIC "$ letter_type.CONSONANT - VOWEL = letter_type.VOWEL - - -@acquire_spill_lock() -def porter_stemmer_measure(Column strings): - return Column.from_pylibcudf( - nvtext.stemmer.porter_stemmer_measure( - strings.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def is_letter(Column strings, - object ltype, - size_type index): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - index, - ) - ) - - -@acquire_spill_lock() -def is_letter_multi(Column strings, - object ltype, - Column indices): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - indices.to_pylibcudf(mode="read"), - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx deleted file mode 100644 index 5e0bfb74705..00000000000 --- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def subword_tokenize_inmem_hash( - Column strings, - object hashed_vocabulary, - uint32_t max_sequence_length=64, - uint32_t stride=48, - bool do_lower=True, - bool do_truncate=False, -): - """ - Subword tokenizes text series by using the pre-loaded hashed vocabulary - """ - result = nvtext.subword_tokenize.subword_tokenize( - strings.to_pylibcudf(mode="read"), - hashed_vocabulary, - max_sequence_length, - stride, - do_lower, - do_truncate, - ) - # return the 3 tensor components - tokens = Column.from_pylibcudf(result[0]) - masks = Column.from_pylibcudf(result[1]) - metadata = Column.from_pylibcudf(result[2]) - return tokens, masks, metadata diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx deleted file mode 100644 index f473c48e2f7..00000000000 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from pylibcudf.nvtext.tokenize import TokenizeVocabulary # no-cython-lint - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def _tokenize_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _tokenize_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def _count_tokens_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _count_tokens_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def character_tokenize(Column strings): - return Column.from_pylibcudf( - nvtext.tokenize.character_tokenize( - strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def detokenize(Column strings, Column indices, object py_separator): - return Column.from_pylibcudf( - nvtext.tokenize.detokenize( - strings.to_pylibcudf(mode="read"), - indices.to_pylibcudf(mode="read"), - py_separator.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def tokenize_with_vocabulary(Column strings, - object vocabulary, - object py_delimiter, - size_type default_id): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_with_vocabulary( - strings.to_pylibcudf(mode="read"), - vocabulary, - py_delimiter.device_value.c_value, - default_id - ) - ) diff --git a/python/cudf/cudf/_lib/strings/__init__.pxd b/python/cudf/cudf/_lib/strings/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py deleted file mode 100644 index b9095a22a42..00000000000 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix -from cudf._lib.nvtext.generate_ngrams import ( - generate_character_ngrams, - generate_ngrams, - hash_character_ngrams, -) -from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import ( - minhash, - minhash64, -) -from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize -from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces -from cudf._lib.nvtext.replace import filter_tokens, replace_tokens -from cudf._lib.nvtext.stemmer import ( - LetterType, - is_letter, - is_letter_multi, - porter_stemmer_measure, -) -from cudf._lib.nvtext.tokenize import ( - _count_tokens_column, - _count_tokens_scalar, - _tokenize_column, - _tokenize_scalar, - character_tokenize, - detokenize, - tokenize_with_vocabulary, -) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index 8d38a5f2272..b49f5154697 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -5,9 +5,6 @@ import pylibcudf as plc import cudf -from cudf._lib.nvtext.byte_pair_encode import ( - byte_pair_encoding as cpp_byte_pair_encoding, -) class BytePairEncoder: @@ -25,12 +22,12 @@ class BytePairEncoder: BytePairEncoder """ - def __init__(self, merges_pair: "cudf.Series"): + def __init__(self, merges_pair: cudf.Series) -> None: self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs( merges_pair._column.to_pylibcudf(mode="read") ) - def __call__(self, text, separator: str = " ") -> cudf.Series: + def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series: """ Parameters @@ -57,6 +54,6 @@ def __call__(self, text, separator: str = " ") -> cudf.Series: dtype: object """ sep = cudf.Scalar(separator, dtype="str") - result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) - - return cudf.Series._from_column(result) + return cudf.Series._from_column( + text._column.byte_pair_encoding(self.merge_pairs, sep) + ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 06196717ce3..c021554f3bd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -20,7 +20,7 @@ import cudf.core.column.column as column import cudf.core.column.datetime as datetime from cudf import _lib as libcudf -from cudf._lib import string_casting as str_cast, strings as libstrings +from cudf._lib import string_casting as str_cast from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype @@ -45,6 +45,7 @@ SeriesOrIndex, ) from cudf.core.buffer import Buffer + from cudf.core.column.lists import ListColumn from cudf.core.column.numerical import NumericalColumn @@ -624,7 +625,7 @@ def join( def _split_by_character(self): col = self._column.fillna("") # sanitize nulls - result_col = libstrings.character_tokenize(col) + result_col = col.character_tokenize() offset_col = col.children[0] @@ -4693,9 +4694,7 @@ def normalize_spaces(self) -> SeriesOrIndex: 1 test string dtype: object """ - return self._return_or_inplace( - libstrings.normalize_spaces(self._column) - ) + return self._return_or_inplace(self._column.normalize_spaces()) def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" @@ -4743,7 +4742,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: dtype: object """ return self._return_or_inplace( - libstrings.normalize_characters(self._column, do_lower) + self._column.normalize_characters(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -4775,16 +4774,16 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: 2 goodbye dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + if isinstance(delim, Column): result = self._return_or_inplace( - libstrings._tokenize_column(self._column, delimiter), + self._column.tokenize_column(delim), retain_index=False, ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): result = self._return_or_inplace( - libstrings._tokenize_scalar(self._column, delimiter), + self._column.tokenize_scalar(delim), retain_index=False, ) else: @@ -4799,7 +4798,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: return result def detokenize( - self, indices: "cudf.Series", separator: str = " " + self, indices: cudf.Series, separator: str = " " ) -> SeriesOrIndex: """ Combines tokens into strings by concatenating them in the order @@ -4829,9 +4828,9 @@ def detokenize( 2 three dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.detokenize(self._column, indices._column, separator), + self._column.detokenize(indices._column, sep), # type: ignore[arg-type] retain_index=False, ) @@ -4882,17 +4881,15 @@ def character_tokenize(self) -> SeriesOrIndex: 2 . dtype: object """ - result_col = libstrings.character_tokenize(self._column) + result_col = self._column.character_tokenize() if isinstance(self._parent, cudf.Series): lengths = self.len().fillna(0) index = self._parent.index.repeat(lengths) - return cudf.Series._from_column( + return type(self._parent)._from_column( result_col, name=self._parent.name, index=index ) - elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index._from_column(result_col, name=self._parent.name) else: - return result_col + return self._return_or_inplace(result_col) def token_count(self, delimiter: str = " ") -> SeriesOrIndex: """ @@ -4919,15 +4916,15 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: 2 0 dtype: int32 """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) + if isinstance(delim, Column): return self._return_or_inplace( - libstrings._count_tokens_column(self._column, delimiter) + self._column.count_tokens_column(delim) ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): return self._return_or_inplace( - libstrings._count_tokens_scalar(self._column, delimiter) + self._column.count_tokens_scalar(delim) # type: ignore[arg-type] ) else: raise TypeError( @@ -4966,9 +4963,9 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: 2 xyz_hhh dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.generate_ngrams(self._column, n, separator), + self._column.generate_ngrams(n, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5015,7 +5012,7 @@ def character_ngrams( dtype: list """ result = self._return_or_inplace( - libstrings.generate_character_ngrams(self._column, n), + self._column.generate_character_ngrams(n), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -5060,7 +5057,7 @@ def hash_character_ngrams( """ result = self._return_or_inplace( - libstrings.hash_character_ngrams(self._column, n), + self._column.hash_character_ngrams(n), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -5098,10 +5095,10 @@ def ngrams_tokenize( 2 best_book dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter") - separator = _massage_string_arg(separator, "separator") + delim = _massage_string_arg(delimiter, "delimiter") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.ngrams_tokenize(self._column, n, delimiter, separator), + self._column.ngrams_tokenize(n, delim, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5180,10 +5177,9 @@ def replace_tokens( ) return self._return_or_inplace( - libstrings.replace_tokens( - self._column, - targets_column, - replacements_column, + self._column.replace_tokens( + targets_column, # type: ignore[arg-type] + replacements_column, # type: ignore[arg-type] cudf.Scalar(delimiter, dtype="str"), ), ) @@ -5251,8 +5247,7 @@ def filter_tokens( ) return self._return_or_inplace( - libstrings.filter_tokens( - self._column, + self._column.filter_tokens( min_token_length, cudf.Scalar(replacement, dtype="str"), cudf.Scalar(delimiter, dtype="str"), @@ -5278,9 +5273,7 @@ def porter_stemmer_measure(self) -> SeriesOrIndex: 1 2 dtype: int32 """ - return self._return_or_inplace( - libstrings.porter_stemmer_measure(self._column) - ) + return self._return_or_inplace(self._column.porter_stemmer_measure()) def is_consonant(self, position) -> SeriesOrIndex: """ @@ -5313,17 +5306,10 @@ def is_consonant(self, position) -> SeriesOrIndex: 1 False dtype: bool """ - ltype = libstrings.LetterType.CONSONANT - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(False, position) # type: ignore[arg-type] ) def is_vowel(self, position) -> SeriesOrIndex: @@ -5357,17 +5343,10 @@ def is_vowel(self, position) -> SeriesOrIndex: 1 True dtype: bool """ - ltype = libstrings.LetterType.VOWEL - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(True, position) # type: ignore[arg-type] ) def edit_distance(self, targets) -> SeriesOrIndex: @@ -5416,7 +5395,7 @@ def edit_distance(self, targets) -> SeriesOrIndex: ) return self._return_or_inplace( - libstrings.edit_distance(self._column, targets_column) + self._column.edit_distance(targets_column) # type: ignore[arg-type] ) def edit_distance_matrix(self) -> SeriesOrIndex: @@ -5456,9 +5435,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex: "Cannot compute edit distance between null strings. " "Consider removing them using `dropna` or fill with `fillna`." ) - return self._return_or_inplace( - libstrings.edit_distance_matrix(self._column) - ) + return self._return_or_inplace(self._column.edit_distance_matrix()) def minhash( self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int @@ -5508,7 +5485,7 @@ def minhash( f"Expecting a Series with dtype uint32, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash(self._column, seed, a_column, b_column, width) + self._column.minhash(seed, a_column, b_column, width) # type: ignore[arg-type] ) def minhash64( @@ -5559,7 +5536,7 @@ def minhash64( f"Expecting a Series with dtype uint64, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash64(self._column, seed, a_column, b_column, width) + self._column.minhash64(seed, a_column, b_column, width) # type: ignore[arg-type] ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: @@ -5585,13 +5562,14 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: 1 0.307692 dtype: float32 """ - return self._return_or_inplace( - libstrings.jaccard_index(self._column, input._column, width), + self._column.jaccard_index(input._column, width) ) -def _massage_string_arg(value, name, allow_col=False): +def _massage_string_arg( + value, name, allow_col: bool = False +) -> StringColumn | cudf.Scalar: if isinstance(value, cudf.Scalar): return value @@ -5602,9 +5580,9 @@ def _massage_string_arg(value, name, allow_col=False): if allow_col: if isinstance(value, list): - return column.as_column(value, dtype="str") + return column.as_column(value, dtype="str") # type: ignore[return-value] - if isinstance(value, Column) and is_string_dtype(value.dtype): + if isinstance(value, StringColumn): return value allowed_types.append("Column") @@ -6148,6 +6126,278 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": return to_view.view(dtype) + @acquire_spill_lock() + def minhash( + self, + seed: np.uint32, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() + def minhash64( + self, + seed: np.uint64, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash64( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() + def jaccard_index(self, other: Self, width: int) -> NumericalColumn: + result = plc.nvtext.jaccard.jaccard_index( + self.to_pylibcudf(mode="read"), + other.to_pylibcudf(mode="read"), + width, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self: + result = plc.nvtext.generate_ngrams.generate_ngrams( + self.to_pylibcudf(mode="read"), + ngrams, + separator.device_value.c_value, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_character_ngrams(self, ngrams: int) -> ListColumn: + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def hash_character_ngrams(self, ngrams: int) -> ListColumn: + result = plc.nvtext.generate_ngrams.hash_character_ngrams( + self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance(self, targets: Self) -> NumericalColumn: + result = plc.nvtext.edit_distance.edit_distance( + self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance_matrix(self) -> ListColumn: + result = plc.nvtext.edit_distance.edit_distance_matrix( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def byte_pair_encoding( + self, + merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.byte_pair_encode.byte_pair_encoding( + self.to_pylibcudf(mode="read"), + merge_pairs, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def ngrams_tokenize( + self, + ngrams: int, + delimiter: cudf.Scalar, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.ngrams_tokenize.ngrams_tokenize( + self.to_pylibcudf(mode="read"), + ngrams, + delimiter.device_value.c_value, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def normalize_spaces(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_spaces( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def normalize_characters(self, do_lower: bool = True) -> Self: + return Column.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + do_lower, + ) + ) + + @acquire_spill_lock() + def replace_tokens( + self, targets: Self, replacements: Self, delimiter: cudf.Scalar + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.replace_tokens( + self.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def filter_tokens( + self, + min_token_length: int, + replacement: cudf.Scalar, + delimiter: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.filter_tokens( + self.to_pylibcudf(mode="read"), + min_token_length, + replacement.device_value.c_value, + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def porter_stemmer_measure(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.stemmer.porter_stemmer_measure( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def is_letter(self, is_vowel: bool, index: int | NumericalColumn) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.stemmer.is_letter( + self.to_pylibcudf(mode="read"), + is_vowel, + index + if isinstance(index, int) + else index.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def subword_tokenize( + self, + hashed_vocabulary: plc.nvtext.subword_tokenize.HashedVocabulary, + max_sequence_length: int = 64, + stride: int = 48, + do_lower: bool = True, + do_truncate: bool = False, + ) -> tuple[ColumnBase, ColumnBase, ColumnBase]: + """ + Subword tokenizes text series by using the pre-loaded hashed vocabulary + """ + result = plc.nvtext.subword_tokenize.subword_tokenize( + self.to_pylibcudf(mode="read"), + hashed_vocabulary, + max_sequence_length, + stride, + do_lower, + do_truncate, + ) + # return the 3 tensor components + tokens = type(self).from_pylibcudf(result[0]) + masks = type(self).from_pylibcudf(result[1]) + metadata = type(self).from_pylibcudf(result[2]) + return tokens, masks, metadata + + @acquire_spill_lock() + def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def tokenize_column(self, delimiters: Self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def count_tokens_column(self, delimiters: Self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def character_tokenize(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.character_tokenize( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def tokenize_with_vocabulary( + self, + vocabulary: plc.nvtext.tokenize.TokenizeVocabulary, + delimiter: cudf.Scalar, + default_id: int, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_with_vocabulary( + self.to_pylibcudf(mode="read"), + vocabulary, + delimiter.device_value.c_value, + default_id, + ) + ) + + @acquire_spill_lock() + def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.detokenize( + self.to_pylibcudf(mode="read"), + indices.to_pylibcudf(mode="read"), + separator.device_value.c_value, + ) + ) + def _modify_characters( self, method: Callable[[plc.Column], plc.Column] ) -> Self: diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index dda1f199078..479838ef2a8 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -8,10 +8,6 @@ import pylibcudf as plc -from cudf._lib.nvtext.subword_tokenize import ( - subword_tokenize_inmem_hash as cpp_subword_tokenize, -) - def _cast_to_appropriate_type(ar, cast_type): if cast_type == "cp": @@ -210,8 +206,7 @@ def __call__( stride = max_length - stride # behavior varies from subword_tokenize but maps with huggingface - input_ids, attention_mask, metadata = cpp_subword_tokenize( - text._column, + input_ids, attention_mask, metadata = text._column.subword_tokenize( self.vocab_file, max_sequence_length=max_length, stride=stride, diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index 1e31376cce8..fb8b9b3131c 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -5,9 +5,6 @@ import pylibcudf as plc import cudf -from cudf._lib.nvtext.tokenize import ( - tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, -) class TokenizeVocabulary: @@ -20,7 +17,7 @@ class TokenizeVocabulary: Strings column of vocabulary terms """ - def __init__(self, vocabulary: "cudf.Series"): + def __init__(self, vocabulary: cudf.Series) -> None: self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary( vocabulary._column.to_pylibcudf(mode="read") ) @@ -46,8 +43,8 @@ def tokenize( if delimiter is None: delimiter = "" delim = cudf.Scalar(delimiter, dtype="str") - result = cpp_tokenize_with_vocabulary( - text._column, self.vocabulary, delim, default_id + result = text._column.tokenize_with_vocabulary( + self.vocabulary, delim, default_id ) return cudf.Series._from_column(result)