From 8506ea6dd12cd1bde91550366d846737bc7fdb7c Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 11 Apr 2024 18:07:22 -0500 Subject: [PATCH] Migrate string `case` operations to `pylibcudf` (#15489) This PR creates `pylibcudf` `case` APIs and migrates the cuDF cython to leverage them. Part of https://github.com/rapidsai/cudf/issues/15162. Authors: - https://github.com/brandon-b-miller - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15489 --- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + .../_lib/pylibcudf/strings/CMakeLists.txt | 21 ++++++++ .../cudf/_lib/pylibcudf/strings/__init__.pxd | 3 ++ .../cudf/_lib/pylibcudf/strings/__init__.py | 3 ++ .../cudf/cudf/_lib/pylibcudf/strings/case.pxd | 8 +++ .../cudf/cudf/_lib/pylibcudf/strings/case.pyx | 30 +++++++++++ python/cudf/cudf/_lib/strings/case.pyx | 50 +++++++------------ .../cudf/pylibcudf_tests/test_string_case.py | 35 +++++++++++++ 10 files changed, 124 insertions(+), 32 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/__init__.py create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/case.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/case.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_case.py diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 81d15cf95b4..c2b7cb7ca3d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -44,3 +44,5 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX 
pylibcudf_ ASSOCIATED_TARGETS cudf ) link_to_pyarrow_headers(pylibcudf_interop) + +add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 48c23a9dd4c..5adefa5fd93 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -17,6 +17,7 @@ from . cimport ( search, sorting, stream_compaction, + strings, types, unary, ) @@ -48,6 +49,7 @@ __all__ = [ "rolling", "search", "stream_compaction", + "strings", "sorting", "types", "unary", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 8ccb0ecc341..89f874f5fa5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -17,6 +17,7 @@ search, sorting, stream_compaction, + strings, types, unary, ) @@ -48,6 +49,7 @@ "rolling", "search", "stream_compaction", + "strings", "sorting", "types", "unary", diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt new file mode 100644 index 00000000000..3a2a9e1e7eb --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -0,0 +1,21 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources case.pyx) +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd new file mode 100644 index 00000000000..ff87549b5b5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport case diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py new file mode 100644 index 00000000000..ff87549b5b5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import case diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd new file mode 100644 index 00000000000..225d566fe06 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column + + +cpdef Column to_lower(Column input) +cpdef Column to_upper(Column input) +cpdef Column swapcase(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx new file mode 100644 index 00000000000..69910fd8c50 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.strings cimport case as cpp_case +from cudf._lib.pylibcudf.column cimport Column + + +cpdef Column to_lower(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.to_lower(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column to_upper(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.to_upper(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column swapcase(Column input): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_case.swapcase(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/case.pyx b/python/cudf/cudf/_lib/strings/case.pyx index 09af1178946..38f242a67d6 100644 --- a/python/cudf/cudf/_lib/strings/case.pyx +++ b/python/cudf/cudf/_lib/strings/case.pyx @@ -1,48 +1,34 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.strings.case cimport ( - swapcase as cpp_swapcase, - to_lower as cpp_to_lower, - to_upper as cpp_to_upper, -) + +from cudf._lib.pylibcudf.strings import case @acquire_spill_lock() def to_upper(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_to_upper(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.to_upper( + source_strings.to_pylibcudf(mode='read') + ) + ) @acquire_spill_lock() def to_lower(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_to_lower(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.to_lower( + source_strings.to_pylibcudf(mode='read') + ) + ) @acquire_spill_lock() def swapcase(Column source_strings): - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_swapcase(source_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + case.swapcase( + source_strings.to_pylibcudf(mode='read') + ) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/cudf/cudf/pylibcudf_tests/test_string_case.py new file mode 100644 index 00000000000..ae01d953df5 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_case.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def string_col(): + return pa.array( + ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"] + ) + + +def test_to_upper(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = plc.strings.case.to_upper(plc_col) + expected = pa.compute.utf8_upper(string_col) + assert_column_eq(got, expected) + + +def test_to_lower(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = plc.strings.case.to_lower(plc_col) + expected = pa.compute.utf8_lower(string_col) + assert_column_eq(got, expected) + + +def test_swapcase(string_col): + plc_col = plc.interop.from_arrow(string_col) + got = plc.strings.case.swapcase(plc_col) + expected = pa.compute.utf8_swapcase(string_col) + assert_column_eq(got, expected)