Merge pull request #21 from lincc-frameworks/read_parquet
Initial read_parquet MVP implementation
Showing 4 changed files with 137 additions and 1 deletion.
nested_pandas/__init__.py
@@ -1,8 +1,9 @@
 from .example_module import greetings, meaning
 from .nestedframe import NestedFrame
+from .nestedframe.io import read_parquet

 # Import for registering
 from .series.accessor import NestSeriesAccessor  # noqa: F401
 from .series.dtype import NestedDtype

-__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame"]
+__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame", "read_parquet"]
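With this change the loader is part of the package's public API, so it can be imported straight from the package root (exactly as the tests below do):

from nested_pandas import read_parquet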
nested_pandas/nestedframe/__init__.py
@@ -1 +1,2 @@
 from .core import NestedFrame  # noqa
+from .io import read_parquet  # noqa
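A small side effect worth noting: because the subpackage __init__ re-exports the function, the longer import path resolves as well:

from nested_pandas.nestedframe import read_parquet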
nested_pandas/nestedframe/io.py (new file)
@@ -0,0 +1,75 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

import pandas as pd
from pandas._libs import lib
from pandas._typing import (
    DtypeBackend,
    FilePath,
    ReadBuffer,
)

from .core import NestedFrame


def read_parquet(
    data: FilePath | ReadBuffer[bytes],
    to_pack: dict,
    columns: list[str] | None = None,
    pack_columns: dict | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> NestedFrame:
    """
    Load a parquet object from a file path and load a set of other
    parquet objects to pack into the resulting NestedFrame.

    Docstring based on the Pandas equivalent.

    #TODO after MVP: Include full kwarg-set
    #TODO: Switch dtype backend default?

    Parameters
    ----------
    data : str, path object or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function.
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        gs, and file. For file URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.parquet``.
        A file URL can also be a path to a directory that contains multiple
        partitioned parquet files. Both pyarrow and fastparquet support
        paths to directories as well as file URLs. A directory path could be:
        ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
    to_pack : dict
        A dictionary of parquet data paths (same criteria as `data`), where
        each key reflects the desired column name to pack the data into and
        each value reflects the parquet data to pack.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    pack_columns : dict, default=None
        If not None, selects a set of columns from each keyed nested parquet
        object to read from the nested files.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

    Returns
    -------
    NestedFrame
    """
    # Load the base table, then pack each nested table onto it by column name
    df = NestedFrame(pd.read_parquet(data, engine="pyarrow", columns=columns, dtype_backend=dtype_backend))

    for pack_key in to_pack:
        # Restrict each nested table to its requested column subset, if any
        col_subset = pack_columns.get(pack_key, None) if pack_columns is not None else None
        packed = pd.read_parquet(
            to_pack[pack_key], engine="pyarrow", columns=col_subset, dtype_backend=dtype_backend
        )
        df = df.add_nested(packed, pack_key)

    return df
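For orientation, a minimal usage sketch of the new loader. The file and column names here (objects.parquet, lightcurves.parquet, ra, dec, time, flux) are hypothetical, not part of this commit; any source accepted by pandas.read_parquet should work:

from nested_pandas import read_parquet

# Hypothetical inputs: a base object table plus one parquet file per nested layer
nf = read_parquet(
    data="objects.parquet",
    to_pack={"lightcurves": "lightcurves.parquet"},
    columns=["ra", "dec"],  # optional: read only these base columns
    pack_columns={"lightcurves": ["time", "flux"]},  # optional: per-nested subsets
)

Each key of to_pack becomes a packed column on the result, with nested rows matched to base rows by shared index values, as the test below exercises.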
Test module for read_parquet (new file)
@@ -0,0 +1,59 @@
import os

import pandas as pd
import pytest
from nested_pandas import read_parquet


@pytest.mark.parametrize("columns", [["a"], None])
@pytest.mark.parametrize("pack_columns", [{"nested1": ["c"], "nested2": ["e"]}, {"nested1": ["d"]}, None])
def test_read_parquet(tmp_path, columns, pack_columns):
    """Test nested parquet loading"""
    # Setup a temporary directory for files
    save_path = os.path.join(tmp_path, ".")

    # Generate some test data
    base = pd.DataFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    nested1 = pd.DataFrame(
        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    nested2 = pd.DataFrame(
        data={"e": [0, 2, 4, 1, 4, 3, 1, 4, 1], "f": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    # Save to parquet
    base.to_parquet(os.path.join(save_path, "base.parquet"))
    nested1.to_parquet(os.path.join(save_path, "nested1.parquet"))
    nested2.to_parquet(os.path.join(save_path, "nested2.parquet"))

    # Read from parquet
    nf = read_parquet(
        data=os.path.join(save_path, "base.parquet"),
        to_pack={
            "nested1": os.path.join(save_path, "nested1.parquet"),
            "nested2": os.path.join(save_path, "nested2.parquet"),
        },
        columns=columns,
        pack_columns=pack_columns,
    )

    # Check Base Columns
    if columns is not None:
        assert nf.columns.tolist() == columns + ["nested1", "nested2"]
    else:
        assert nf.columns.tolist() == base.columns.tolist() + ["nested1", "nested2"]

    # Check Nested Columns
    if pack_columns is not None:
        for nested_col in pack_columns:
            assert nf[nested_col].nest.fields == pack_columns[nested_col]
    else:
        for nested_col in nf.nested_columns:
            if nested_col == "nested1":
                assert nf[nested_col].nest.fields == nested1.columns.tolist()
            elif nested_col == "nested2":
                assert nf[nested_col].nest.fields == nested2.columns.tolist()
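As a closing note, the structure these assertions check can be built directly in memory; a short sketch mirroring the NestedFrame(...) / add_nested(...) calls in io.py above (no parquet round-trip, one nested layer for brevity):

import pandas as pd
from nested_pandas import NestedFrame

# Same shapes as the test data: three base rows, three nested rows per base row
base = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
nested1 = pd.DataFrame(
    {"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
    index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)

nf = NestedFrame(base).add_nested(nested1, "nested1")
assert nf.columns.tolist() == ["a", "b", "nested1"]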