Migrate nvtext generate_ngrams APIs to pylibcudf (rapidsai#17006)

Part of rapidsai#15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: rapidsai#17006
madsbk · Oct 8, 2024 · 09ed210 · 09ed210
1 parent 2d02bdc
commit 09ed210
Show file tree

Hide file tree

Showing 9 changed files with 207 additions and 62 deletions.
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst
@@ -0,0 +1,6 @@
+===============
+generate_ngrams
+===============
+
+.. automodule:: pylibcudf.nvtext.generate_ngrams
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -5,3 +5,4 @@ nvtext
     :maxdepth: 1
 
     edit_distance
+    generate_ngrams
diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
@@ -2,75 +2,34 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.nvtext.generate_ngrams cimport (
-    generate_character_ngrams as cpp_generate_character_ngrams,
-    generate_ngrams as cpp_generate_ngrams,
-    hash_character_ngrams as cpp_hash_character_ngrams,
-)
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+from pylibcudf import nvtext
 
 
 @acquire_spill_lock()
 def generate_ngrams(Column strings, int ngrams, object py_separator):
-
-    cdef DeviceScalar separator = py_separator.device_value
-
-    cdef column_view c_strings = strings.view()
-    cdef size_type c_ngrams = ngrams
-    cdef const string_scalar* c_separator = <const string_scalar*>separator\
-        .get_raw_ptr()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_generate_ngrams(
-                c_strings,
-                c_ngrams,
-                c_separator[0]
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.generate_ngrams.generate_ngrams(
+        strings.to_pylibcudf(mode="read"),
+        ngrams,
+        py_separator.device_value.c_value
+    )
+    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def generate_character_ngrams(Column strings, int ngrams):
-    cdef column_view c_strings = strings.view()
-    cdef size_type c_ngrams = ngrams
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_generate_character_ngrams(
-                c_strings,
-                c_ngrams
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.generate_ngrams.generate_character_ngrams(
+        strings.to_pylibcudf(mode="read"),
+        ngrams
+    )
+    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
 def hash_character_ngrams(Column strings, int ngrams):
-    cdef column_view c_strings = strings.view()
-    cdef size_type c_ngrams = ngrams
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_hash_character_ngrams(
-                c_strings,
-                c_ngrams
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    result = nvtext.generate_ngrams.hash_character_ngrams(
+        strings.to_pylibcudf(mode="read"),
+        ngrams
+    )
+    return Column.from_pylibcudf(result)
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources edit_distance.pyx)
+set(cython_sources edit_distance.pyx generate_ngrams.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(

diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -1,7 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport edit_distance
+from . cimport edit_distance, generate_ngrams
 
 __all__ = [
     "edit_distance",
+    "generate_ngrams",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -1,7 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import edit_distance
+from . import edit_distance, generate_ngrams
 
 __all__ = [
     "edit_distance",
+    "generate_ngrams",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator)
+
+cpdef Column generate_character_ngrams(Column input, size_type ngrams=*)
+
+cpdef Column hash_character_ngrams(Column input, size_type ngrams=*)
diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx
@@ -0,0 +1,111 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.generate_ngrams cimport (
+    generate_character_ngrams as cpp_generate_character_ngrams,
+    generate_ngrams as cpp_generate_ngrams,
+    hash_character_ngrams as cpp_hash_character_ngrams,
+)
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+
+
+cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator):
+    """
+    Returns a single column of strings by generating ngrams from a strings column.
+
+    For details, see :cpp:func:`generate_ngrams`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    ngrams : size_type
+        Number of consecutive input strings joined into each ngram
+    separator : Scalar
+        String scalar placed between the tokens of each ngram
+
+    Returns
+    -------
+    Column
+        New strings column of tokens
+    """
+    cdef column_view c_strings = input.view()
+    cdef const string_scalar* c_separator = <const string_scalar*>separator.c_obj.get()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_generate_ngrams(
+                c_strings,
+                ngrams,
+                c_separator[0]
+            )
+        )
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2):
+    """
+    Returns a lists column of ngrams of characters within each string.
+
+    For details, see :cpp:func:`generate_character_ngrams`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    ngrams : size_type
+        Number of characters in each ngram (default 2)
+
+    Returns
+    -------
+    Column
+        Lists column of strings
+    """
+    cdef column_view c_strings = input.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_generate_character_ngrams(
+                c_strings,
+                ngrams,
+            )
+        )
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2):
+    """
+    Returns a lists column of hash values of the characters in each string
+
+    For details, see :cpp:func:`hash_character_ngrams`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    ngrams : size_type
+        Number of characters in each hashed ngram (default 2)
+
+    Returns
+    -------
+    Column
+        Lists column of hash values
+    """
+    cdef column_view c_strings = input.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_hash_character_ngrams(
+                c_strings,
+                ngrams,
+            )
+        )
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture(scope="module")
+def input_col():
+    arr = ["ab", "cde", "fgh"]
+    return pa.array(arr)
+
+
+@pytest.mark.parametrize("ngram", [2, 3])
+@pytest.mark.parametrize("sep", ["_", "**", ","])
+def test_generate_ngrams(input_col, ngram, sep):
+    result = plc.nvtext.generate_ngrams.generate_ngrams(
+        plc.interop.from_arrow(input_col),
+        ngram,
+        plc.interop.from_arrow(pa.scalar(sep)),
+    )
+    expected = pa.array([f"ab{sep}cde", f"cde{sep}fgh"])
+    if ngram == 3:
+        expected = pa.array([f"ab{sep}cde{sep}fgh"])
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("ngram", [2, 3])
+def test_generate_character_ngrams(input_col, ngram):
+    result = plc.nvtext.generate_ngrams.generate_character_ngrams(
+        plc.interop.from_arrow(input_col),
+        ngram,
+    )
+    expected = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]])
+    if ngram == 3:
+        expected = pa.array([[], ["cde"], ["fgh"]])
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("ngram", [2, 3])
+def test_hash_character_ngrams(input_col, ngram):
+    result = plc.nvtext.generate_ngrams.hash_character_ngrams(
+        plc.interop.from_arrow(input_col),
+        ngram,
+    )
+    pa_result = plc.interop.to_arrow(result)
+    assert all(
+        len(got) == max(0, len(s.as_py()) - ngram + 1)
+        for got, s in zip(pa_result, input_col)
+    )
+    assert pa_result.type == pa.list_(
+        pa.field("element", pa.uint32(), nullable=False)
+    )