From 469f2262c81d8e8505144f6b7208f6f2a1ee81ac Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 16 Dec 2024 11:48:12 -0800
Subject: [PATCH] Remove unused functionality in cudf._lib.utils.pyx (#17586)

Contributes to https://github.com/rapidsai/cudf/issues/17317

More can be removed once my other cudf._lib PRs are in

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17586
---
 python/cudf/cudf/_lib/utils.pxd    |  16 --
 python/cudf/cudf/_lib/utils.pyx    | 309 +----------------------------
 python/cudf/cudf/core/dataframe.py |  38 +++-
 python/cudf/cudf/io/parquet.py     |  12 +-
 4 files changed, 38 insertions(+), 337 deletions(-)

diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index 6cc52d046af..900be721c9a 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -1,22 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.vector cimport vector
-
-from pylibcudf.libcudf.column.column cimport column_view
-from pylibcudf.libcudf.table.table cimport table, table_view
-
-
-cdef data_from_unique_ptr(
-    unique_ptr[table] c_tbl, column_names, index_names=*)
 cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
 cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *)
-cdef data_from_table_view(
-    table_view tv, object owner, object column_names, object index_names=*)
-cdef table_view table_view_from_columns(columns) except *
-cdef table_view table_view_from_table(tbl, ignore_index=*) except*
-cdef columns_from_unique_ptr(unique_ptr[table] c_tbl)
-cdef columns_from_table_view(table_view tv, object owners)
 cpdef columns_from_pylibcudf_table(tbl)
 cpdef _data_from_columns(columns, column_names, index_names=*)
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index ff032656f80..975c9eb741c 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -1,233 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import numpy as np
-import pyarrow as pa
-
 import cudf
 
-from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
-
-from pylibcudf.libcudf.column.column cimport column, column_view
-from pylibcudf.libcudf.table.table cimport table
-from pylibcudf.libcudf.table.table_view cimport table_view
-from pylibcudf.libcudf.types cimport size_type
-
 from cudf._lib.column cimport Column
-from pylibcudf cimport Column as plc_Column
-try:
-    import ujson as json
-except ImportError:
-    import json
-
-from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype
-
-PARQUET_META_TYPE_MAP = {
-    str(cudf_dtype): str(pandas_dtype)
-    for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items()
-}
-
-cdef table_view table_view_from_columns(columns) except*:
-    """Create a cudf::table_view from an iterable of Columns."""
-    cdef vector[column_view] column_views
-
-    cdef Column col
-    for col in columns:
-        column_views.push_back(col.view())
-
-    return table_view(column_views)
-
-
-cdef table_view table_view_from_table(tbl, ignore_index=False) except*:
-    """Create a cudf::table_view from a Table.
-
-    Parameters
-    ----------
-    ignore_index : bool, default False
-        If True, don't include the index in the columns.
-    """
-    return table_view_from_columns(
-        tbl._index._columns + tbl._columns
-        if not ignore_index and tbl._index is not None
-        else tbl._columns
-    )
-
-
-cpdef generate_pandas_metadata(table, index):
-    col_names = []
-    types = []
-    index_levels = []
-    index_descriptors = []
-    columns_to_convert = list(table._columns)
-    # Columns
-    for name, col in table._column_labels_and_values:
-        if cudf.get_option("mode.pandas_compatible"):
-            # in pandas-compat mode, non-string column names are stringified.
-            col_names.append(str(name))
-        else:
-            col_names.append(name)
-
-        if isinstance(col.dtype, cudf.CategoricalDtype):
-            raise ValueError(
-                "'category' column dtypes are currently not "
-                + "supported by the gpu accelerated parquet writer"
-            )
-        elif isinstance(col.dtype, (
-            cudf.ListDtype,
-            cudf.StructDtype,
-            cudf.core.dtypes.DecimalDtype
-        )):
-            types.append(col.dtype.to_arrow())
-        else:
-            # A boolean element takes 8 bits in cudf and 1 bit in
-            # pyarrow. To make sure the cudf format is interperable
-            # in arrow, we use `int8` type when converting from a
-            # cudf boolean array.
-            if col.dtype.type == np.bool_:
-                types.append(pa.int8())
-            else:
-                types.append(np_to_pa_dtype(col.dtype))
-
-    # Indexes
-    materialize_index = False
-    if index is not False:
-        for level, name in enumerate(table._index.names):
-            if isinstance(table._index, cudf.MultiIndex):
-                idx = table.index.get_level_values(level)
-            else:
-                idx = table.index
-
-            if isinstance(idx, cudf.RangeIndex):
-                if index is None:
-                    descr = {
-                        "kind": "range",
-                        "name": table.index.name,
-                        "start": table.index.start,
-                        "stop": table.index.stop,
-                        "step": table.index.step,
-                    }
-                else:
-                    materialize_index = True
-                    # When `index=True`, RangeIndex needs to be materialized.
-                    materialized_idx = idx._as_int_index()
-                    descr = _index_level_name(
-                        index_name=materialized_idx.name,
-                        level=level,
-                        column_names=col_names
-                    )
-                    index_levels.append(materialized_idx)
-                    columns_to_convert.append(materialized_idx._values)
-                    col_names.append(descr)
-                    types.append(np_to_pa_dtype(materialized_idx.dtype))
-            else:
-                descr = _index_level_name(
-                    index_name=idx.name,
-                    level=level,
-                    column_names=col_names
-                )
-                columns_to_convert.append(idx._values)
-                col_names.append(descr)
-                if isinstance(idx.dtype, cudf.CategoricalDtype):
-                    raise ValueError(
-                        "'category' column dtypes are currently not "
-                        + "supported by the gpu accelerated parquet writer"
-                    )
-                elif isinstance(idx.dtype, cudf.ListDtype):
-                    types.append(col.dtype.to_arrow())
-                else:
-                    # A boolean element takes 8 bits in cudf and 1 bit in
-                    # pyarrow. To make sure the cudf format is interperable
-                    # in arrow, we use `int8` type when converting from a
-                    # cudf boolean array.
-                    if idx.dtype.type == np.bool_:
-                        types.append(pa.int8())
-                    else:
-                        types.append(np_to_pa_dtype(idx.dtype))
-
-                index_levels.append(idx)
-            index_descriptors.append(descr)
-
-    df_meta = table.head(0)
-    if materialize_index:
-        df_meta.index = df_meta.index._as_int_index()
-    metadata = pa.pandas_compat.construct_metadata(
-        columns_to_convert=columns_to_convert,
-        # It is OKAY to do `.head(0).to_pandas()` because
-        # this method will extract `.columns` metadata only
-        df=df_meta.to_pandas(),
-        column_names=col_names,
-        index_levels=index_levels,
-        index_descriptors=index_descriptors,
-        preserve_index=index,
-        types=types,
-    )
-
-    md_dict = json.loads(metadata[b"pandas"])
-
-    # correct metadata for list and struct and nullable numeric types
-    for col_meta in md_dict["columns"]:
-        if (
-            col_meta["name"] in table._column_names
-            and table._data[col_meta["name"]].nullable
-            and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP
-            and col_meta["pandas_type"] != "decimal"
-        ):
-            col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[
-                col_meta["numpy_type"]
-            ]
-        if col_meta["numpy_type"] in ("list", "struct"):
-            col_meta["numpy_type"] = "object"
-
-    return json.dumps(md_dict)
-
-
-def _index_level_name(index_name, level, column_names):
-    """
-    Return the name of an index level or a default name
-    if `index_name` is None or is already a column name.
-
-    Parameters
-    ----------
-    index_name : name of an Index object
-    level : level of the Index object
-
-    Returns
-    -------
-    name : str
-    """
-    if index_name is not None and index_name not in column_names:
-        return index_name
-    else:
-        return f"__index_level_{level}__"
-
-
-cdef columns_from_unique_ptr(
-    unique_ptr[table] c_tbl
-):
-    """Convert a libcudf table into list of columns.
-
-    Parameters
-    ----------
-    c_tbl : unique_ptr[cudf::table]
-        The libcudf table whose columns will be extracted
-
-    Returns
-    -------
-    list[Column]
-        A list of columns.
-    """
-    cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release())
-    cdef vector[unique_ptr[column]].iterator it = c_columns.begin()
-
-    cdef size_t i
-
-    return [
-        Column.from_pylibcudf(
-            plc_Column.from_libcudf(move(dereference(it+i)))
-        ) for i in range(c_columns.size())
-    ]
 
 
 cpdef columns_from_pylibcudf_table(tbl):
@@ -281,8 +55,7 @@ cpdef _data_from_columns(columns, column_names, index_names=None):
         # the data while actually constructing the Index object here (instead
         # of just returning a dict for that as well). As we clean up the
         # Frame factories we may want to look for a less dissonant approach
-        # that does not impose performance penalties. The same applies to
-        # data_from_table_view below.
+        # that does not impose performance penalties.
         cudf.core.index._index_from_data(
             {
                 name: columns[i]
@@ -300,16 +73,6 @@ cpdef _data_from_columns(columns, column_names, index_names=None):
     return data, index
 
 
-cdef data_from_unique_ptr(
-    unique_ptr[table] c_tbl, column_names, index_names=None
-):
-    return _data_from_columns(
-        columns_from_unique_ptr(move(c_tbl)),
-        column_names,
-        index_names
-    )
-
-
 cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
     return _data_from_columns(
         columns_from_pylibcudf_table(tbl),
@@ -329,73 +92,3 @@ cpdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None)
         column_names=column_names,
         index_names=index_names
     )
-
-cdef columns_from_table_view(
-    table_view tv,
-    object owners,
-):
-    """
-    Given a ``cudf::table_view``, constructs a list of columns from it,
-    along with referencing an owner Python object that owns the memory
-    lifetime. owner must be either None or a list of column. If owner
-    is a list of columns, the owner of the `i`th ``cudf::column_view``
-    in the table view is ``owners[i]``. For more about memory ownership,
-    see ``Column.from_column_view``.
-    """
-
-    return [
-        Column.from_column_view(
-            tv.column(i), owners[i] if isinstance(owners, list) else None
-        ) for i in range(tv.num_columns())
-    ]
-
-cdef data_from_table_view(
-    table_view tv,
-    object owner,
-    object column_names,
-    object index_names=None
-):
-    """
-    Given a ``cudf::table_view``, constructs a Frame from it,
-    along with referencing an ``owner`` Python object that owns the memory
-    lifetime. If ``owner`` is a Frame we reach inside of it and
-    reach inside of each ``cudf.Column`` to make the owner of each newly
-    created ``Buffer`` underneath the ``cudf.Column`` objects of the
-    created Frame the respective ``Buffer`` from the relevant
-    ``cudf.Column`` of the ``owner`` Frame
-    """
-    cdef size_type column_idx = 0
-    table_owner = isinstance(owner, cudf.core.frame.Frame)
-
-    # First construct the index, if any
-    index = None
-    if index_names is not None:
-        index_columns = []
-        for _ in index_names:
-            column_owner = owner
-            if table_owner:
-                column_owner = owner._index._columns[column_idx]
-            index_columns.append(
-                Column.from_column_view(
-                    tv.column(column_idx),
-                    column_owner
-                )
-            )
-            column_idx += 1
-        index = cudf.core.index._index_from_data(
-            dict(zip(index_names, index_columns)))
-
-    # Construct the data dict
-    cdef size_type source_column_idx = 0
-    data_columns = []
-    for _ in column_names:
-        column_owner = owner
-        if table_owner:
-            column_owner = owner._columns[source_column_idx]
-        data_columns.append(
-            Column.from_column_view(tv.column(column_idx), column_owner)
-        )
-        column_idx += 1
-        source_column_idx += 1
-
-    return dict(zip(column_names, data_columns)), index
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index fce361e18ea..81b748d44fc 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1802,13 +1802,37 @@ def _concat(
                 )
                 for table in tables
             ]
-
-            concatted = libcudf.utils.data_from_pylibcudf_table(
-                plc.concatenate.concatenate(plc_tables),
-                column_names=column_names,
-                index_names=index_names,
-            )
-        out = cls._from_data(*concatted)
+            plc_result = plc.concatenate.concatenate(plc_tables)
+            if ignore:
+                index = None
+                data = {
+                    col_name: ColumnBase.from_pylibcudf(col)
+                    for col_name, col in zip(
+                        column_names, plc_result.columns(), strict=True
+                    )
+                }
+            else:
+                result_columns = [
+                    ColumnBase.from_pylibcudf(col)
+                    for col in plc_result.columns()
+                ]
+                index = _index_from_data(
+                    dict(
+                        zip(
+                            index_names,
+                            result_columns[: len(index_names)],
+                            strict=True,
+                        )
+                    )
+                )
+                data = dict(
+                    zip(
+                        column_names,
+                        result_columns[len(index_names) :],
+                        strict=True,
+                    )
+                )
+        out = cls._from_data(data=data, index=index)
 
         # If ignore_index is True, all input frames are empty, and at
         # least one input frame has an index, assign a new RangeIndex
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 153ee0fa01a..c13489630a3 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -25,9 +25,7 @@
 from cudf._lib.column import Column
 from cudf._lib.utils import (
     _data_from_columns,
-    _index_level_name,
     data_from_pylibcudf_io,
-    generate_pandas_metadata,
 )
 from cudf.api.types import is_list_like
 from cudf.core.buffer import acquire_spill_lock
@@ -128,7 +126,7 @@ def _plc_write_parquet(
         tbl_meta = plc.io.types.TableInputMetadata(plc_table)
         for level, idx_name in enumerate(table.index.names):
             tbl_meta.column_metadata[level].set_name(
-                _index_level_name(idx_name, level, table._column_names)
+                ioutils._index_level_name(idx_name, level, table._column_names)
             )
         num_index_cols_meta = len(table.index.names)
     else:
@@ -162,7 +160,7 @@ def _plc_write_parquet(
     if partitions_info is not None:
         user_data = [
             {
-                "pandas": generate_pandas_metadata(
+                "pandas": ioutils.generate_pandas_metadata(
                     table.iloc[start_row : start_row + num_row].copy(
                         deep=False
                     ),
@@ -172,7 +170,9 @@ def _plc_write_parquet(
             for start_row, num_row in partitions_info
         ]
     else:
-        user_data = [{"pandas": generate_pandas_metadata(table, index)}]
+        user_data = [
+            {"pandas": ioutils.generate_pandas_metadata(table, index)}
+        ]
 
     if header_version not in ("1.0", "2.0"):
         raise ValueError(
@@ -1737,7 +1737,7 @@ def _initialize_chunked_state(
             False if isinstance(table.index, cudf.RangeIndex) else self.index
         )
         user_data = [
-            {"pandas": generate_pandas_metadata(table, index)}
+            {"pandas": ioutils.generate_pandas_metadata(table, index)}
         ] * num_partitions
         comp_type = _get_comp_type(self.compression)
         stat_freq = _get_stat_freq(self.statistics)