From b2832eadfd975a17034501d56d43015e4f5989d2 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Mon, 8 Apr 2024 12:59:39 -0700 Subject: [PATCH 1/4] Initial read_parquet MVP implementation --- src/nested_pandas/nestedframe/__init__.py | 1 + src/nested_pandas/nestedframe/io.py | 69 +++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 src/nested_pandas/nestedframe/io.py diff --git a/src/nested_pandas/nestedframe/__init__.py b/src/nested_pandas/nestedframe/__init__.py index 54af689..a656cf3 100644 --- a/src/nested_pandas/nestedframe/__init__.py +++ b/src/nested_pandas/nestedframe/__init__.py @@ -1 +1,2 @@ from .core import NestedFrame # noqa +from .io import read_parquet # noqa diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py new file mode 100644 index 0000000..b27b713 --- /dev/null +++ b/src/nested_pandas/nestedframe/io.py @@ -0,0 +1,69 @@ +# typing.Self and "|" union syntax don't exist in Python 3.9 +from __future__ import annotations + +import pandas as pd + +from .core import NestedFrame + + +def read_parquet( + data: str, + to_pack: dict, + engine: str = "auto", + columns: list[str] | None = None, + pack_columns: dict | None = None, +) -> NestedFrame: + """ + Load a parquet object from a file path and load a set of other + parquet objects to pack into the resulting NestedFrame. + + Docstring based on the Pandas equivalent. + + #TODO after MVP: Include full kwarg-set + + Parameters + ---------- + data : str, path object or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``read()`` function. + The string could be a URL. Valid URL schemes include http, ftp, s3, + gs, and file. For file URLs, a host is expected. A local file could be: + ``file://localhost/path/to/table.parquet``. + A file URL can also be a path to a directory that contains multiple + partitioned parquet files. Both pyarrow and fastparquet support + paths to directories as well as file URLs. A directory path could be: + ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``. + to_pack: dict, + A dictionary of parquet data paths (same criteria as `data`), where + each key reflects the desired column name to pack the data into and + each value reflects the parquet data to pack. + engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. + + When using the ``'pyarrow'`` engine and no storage options are provided + and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec`` + (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first. + Use the filesystem keyword with an instantiated fsspec filesystem + if you wish to use its implementation. + columns : list, default=None + If not None, only these columns will be read from the file. + pack_columns: dict, default=None + If not None, selects a set of columns from each keyed nested parquet + object to read from the nested files. 
+ + Returns + ------- + NestedFrame + """ + + df = NestedFrame(pd.read_parquet(data, engine, columns)) + + for pack_key in to_pack: + col_subset = pack_columns[pack_key] if pack_columns is not None else None + packed = pd.read_parquet(to_pack[pack_key], engine=engine, columns=col_subset) + df = df.add_nested(packed, pack_key) + + return df From 189c59676e0b1209ec575e02d96ea0b461946196 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 9 Apr 2024 11:48:33 -0700 Subject: [PATCH 2/4] add read_parquet test, tweak read_parquet --- src/nested_pandas/__init__.py | 3 +- src/nested_pandas/nestedframe/io.py | 2 +- tests/nested_pandas/nestedframe/test_io.py | 59 ++++++++++++++++++++++ 3 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 tests/nested_pandas/nestedframe/test_io.py diff --git a/src/nested_pandas/__init__.py b/src/nested_pandas/__init__.py index 613f651..e63bd1c 100644 --- a/src/nested_pandas/__init__.py +++ b/src/nested_pandas/__init__.py @@ -1,8 +1,9 @@ from .example_module import greetings, meaning from .nestedframe import NestedFrame +from .nestedframe.io import read_parquet # Import for registering from .series.accessor import NestSeriesAccessor # noqa: F401 from .series.dtype import NestedDtype -__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame"] +__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame", "read_parquet"] diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index b27b713..fab2da1 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -62,7 +62,7 @@ def read_parquet( df = NestedFrame(pd.read_parquet(data, engine, columns)) for pack_key in to_pack: - col_subset = pack_columns[pack_key] if pack_columns is not None else None + col_subset = pack_columns.get(pack_key, None) if pack_columns is not None else None packed = pd.read_parquet(to_pack[pack_key], engine=engine, columns=col_subset) df = df.add_nested(packed, pack_key) diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py new file mode 100644 index 0000000..8037755 --- /dev/null +++ b/tests/nested_pandas/nestedframe/test_io.py @@ -0,0 +1,59 @@ +import os + +import pandas as pd +import pytest +from nested_pandas import read_parquet + + +@pytest.mark.parametrize("columns", [["a"], None]) +@pytest.mark.parametrize("pack_columns", [{"nested1": ["c"], "nested2": ["e"]}, {"nested1": ["d"]}, None]) +def test_read_parquet(tmp_path, columns, pack_columns): + """Test nested parquet loading""" + # Setup a temporary directory for files + save_path = os.path.join(tmp_path, ".") + + # Generate some test data + base = pd.DataFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) + + nested1 = pd.DataFrame( + data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, + index=[0, 0, 0, 1, 1, 1, 2, 2, 2], + ) + + nested2 = pd.DataFrame( + data={"e": [0, 2, 4, 1, 4, 3, 1, 4, 1], "f": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, + index=[0, 0, 0, 1, 1, 1, 2, 2, 2], + ) + + # Save to parquet + base.to_parquet(os.path.join(save_path, "base.parquet")) + nested1.to_parquet(os.path.join(save_path, "nested1.parquet")) + nested2.to_parquet(os.path.join(save_path, "nested2.parquet")) + + # Read from parquet + nf = read_parquet( + data=os.path.join(save_path, "base.parquet"), + to_pack={ + "nested1": os.path.join(save_path, "nested1.parquet"), + "nested2": os.path.join(save_path, "nested2.parquet"), + }, + columns=columns, + pack_columns=pack_columns, + ) + + # Check Base Columns 
+ if columns is not None: + assert nf.columns.tolist() == columns + ["nested1", "nested2"] + else: + assert nf.columns.tolist() == base.columns.tolist() + ["nested1", "nested2"] + + # Check Nested Columns + if pack_columns is not None: + for nested_col in pack_columns: + assert nf[nested_col].nest.fields == pack_columns[nested_col] + else: + for nested_col in nf.nested_columns: + if nested_col == "nested1": + assert nf[nested_col].nest.fields == nested1.columns.tolist() + elif nested_col == "nested2": + assert nf[nested_col].nest.fields == nested2.columns.tolist() From 7ec4bb52b8c01e7840fa52b880d663cc56843b6a Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 9 Apr 2024 12:30:11 -0700 Subject: [PATCH 3/4] engine and dtypes --- src/nested_pandas/nestedframe/io.py | 35 ++++++++++++++++------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index fab2da1..40e1405 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -2,16 +2,22 @@ from __future__ import annotations import pandas as pd +from pandas._libs import lib +from pandas._typing import ( + DtypeBackend, + FilePath, + ReadBuffer, +) from .core import NestedFrame def read_parquet( - data: str, + data: FilePath | ReadBuffer[bytes], to_pack: dict, - engine: str = "auto", columns: list[str] | None = None, pack_columns: dict | None = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> NestedFrame: """ Load a parquet object from a file path and load a set of other @@ -37,33 +43,32 @@ def read_parquet( A dictionary of parquet data paths (same criteria as `data`), where each key reflects the desired column name to pack the data into and each value reflects the parquet data to pack. - engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' - Parquet library to use. If 'auto', then the option - ``io.parquet.engine`` is used. The default ``io.parquet.engine`` - behavior is to try 'pyarrow', falling back to 'fastparquet' if - 'pyarrow' is unavailable. - - When using the ``'pyarrow'`` engine and no storage options are provided - and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec`` - (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first. - Use the filesystem keyword with an instantiated fsspec filesystem - if you wish to use its implementation. columns : list, default=None If not None, only these columns will be read from the file. pack_columns: dict, default=None If not None, selects a set of columns from each keyed nested parquet object to read from the nested files. + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. 
Returns ------- NestedFrame """ - df = NestedFrame(pd.read_parquet(data, engine, columns)) + df = NestedFrame(pd.read_parquet(data, engine="pyarrow", columns=columns, dtype_backend=dtype_backend)) for pack_key in to_pack: col_subset = pack_columns.get(pack_key, None) if pack_columns is not None else None - packed = pd.read_parquet(to_pack[pack_key], engine=engine, columns=col_subset) + packed = pd.read_parquet( + to_pack[pack_key], engine="pyarrow", columns=col_subset, dtype_backend=dtype_backend + ) df = df.add_nested(packed, pack_key) return df From 041e513d40d35a3f60f4ad47dfec77e6ad5608c1 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 9 Apr 2024 12:32:39 -0700 Subject: [PATCH 4/4] add backend todo --- src/nested_pandas/nestedframe/io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index 40e1405..e0f773f 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -26,6 +26,7 @@ def read_parquet( Docstring based on the Pandas equivalent. #TODO after MVP: Include full kwarg-set + #TODO: Switch dtype backend default? Parameters ----------
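
A minimal end-to-end usage sketch of the read_parquet API introduced in this series, adapted from the test added in PATCH 2/4. The temporary directory, file names, and toy tables below are illustrative only, and the sketch assumes pyarrow is installed, since PATCH 3/4 pins engine="pyarrow" for both the base and nested reads.

import os
import tempfile

import pandas as pd
from nested_pandas import read_parquet

tmp_dir = tempfile.mkdtemp()

# Base table: one row per object.
base = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

# Nested table: several rows per object, aligned to the base index.
nested = pd.DataFrame(
    {"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
    index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)

base.to_parquet(os.path.join(tmp_dir, "base.parquet"))
nested.to_parquet(os.path.join(tmp_dir, "nested.parquet"))

# Read the base file and pack the nested file into a "nested" column,
# keeping only field "c" from the nested parquet.
nf = read_parquet(
    data=os.path.join(tmp_dir, "base.parquet"),
    to_pack={"nested": os.path.join(tmp_dir, "nested.parquet")},
    pack_columns={"nested": ["c"]},
)

print(nf.columns.tolist())       # ['a', 'b', 'nested']
print(nf["nested"].nest.fields)  # the packed sub-columns, i.e. ['c']

Passing columns=["a"] would restrict the base read in the same way, and dtype_backend="pyarrow" can be forwarded to obtain ArrowDtype-backed columns, per the docstring added in PATCH 3/4.
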