Skip to content

Commit

Permalink
Merge pull request #21 from lincc-frameworks/read_parquet
Browse files Browse the repository at this point in the history
Initial read_parquet MVP implementation
  • Loading branch information
dougbrn authored Apr 9, 2024
2 parents 00be464 + 041e513 commit 136f2c5
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 1 deletion.
3 changes: 2 additions & 1 deletion src/nested_pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from .example_module import greetings, meaning
from .nestedframe import NestedFrame
from .nestedframe.io import read_parquet

# Import for registering
from .series.accessor import NestSeriesAccessor # noqa: F401
from .series.dtype import NestedDtype

__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame"]
__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame", "read_parquet"]
1 change: 1 addition & 0 deletions src/nested_pandas/nestedframe/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .core import NestedFrame # noqa
from .io import read_parquet # noqa
75 changes: 75 additions & 0 deletions src/nested_pandas/nestedframe/io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

import pandas as pd
from pandas._libs import lib
from pandas._typing import (
DtypeBackend,
FilePath,
ReadBuffer,
)

from .core import NestedFrame


def read_parquet(
    data: FilePath | ReadBuffer[bytes],
    to_pack: dict,
    columns: list[str] | None = None,
    pack_columns: dict | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> NestedFrame:
    """
    Load a parquet object from a file path and load a set of other
    parquet objects to pack into the resulting NestedFrame.

    Docstring based on the Pandas equivalent.

    #TODO after MVP: Include full kwarg-set
    #TODO: Switch dtype backend default?

    Parameters
    ----------
    data : str, path object or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function.
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        gs, and file. For file URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.parquet``.
        A file URL can also be a path to a directory that contains multiple
        partitioned parquet files. Both pyarrow and fastparquet support
        paths to directories as well as file URLs. A directory path could be:
        ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
    to_pack: dict,
        A dictionary of parquet data paths (same criteria as `data`), where
        each key reflects the desired column name to pack the data into and
        each value reflects the parquet data to pack.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    pack_columns: dict, default=None
        If not None, selects a set of columns from each keyed nested parquet
        object to read from the nested files.
    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

    Returns
    -------
    NestedFrame
    """

    # Read the base (flat) table first, then nest each packed table onto it.
    df = NestedFrame(pd.read_parquet(data, engine="pyarrow", columns=columns, dtype_backend=dtype_backend))

    # Hoist the None-check out of the loop: an empty dict yields None for
    # every key via .get(), matching the original per-iteration behavior.
    if pack_columns is None:
        pack_columns = {}

    for pack_key, pack_path in to_pack.items():
        packed = pd.read_parquet(
            pack_path, engine="pyarrow", columns=pack_columns.get(pack_key), dtype_backend=dtype_backend
        )
        df = df.add_nested(packed, pack_key)

    return df
59 changes: 59 additions & 0 deletions tests/nested_pandas/nestedframe/test_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os

import pandas as pd
import pytest
from nested_pandas import read_parquet


@pytest.mark.parametrize("columns", [["a"], None])
@pytest.mark.parametrize("pack_columns", [{"nested1": ["c"], "nested2": ["e"]}, {"nested1": ["d"]}, None])
def test_read_parquet(tmp_path, columns, pack_columns):
    """Test nested parquet loading with base-column and nested-column subsets."""
    # tmp_path is a pathlib.Path fixture that already points at a fresh
    # temporary directory -- no need for os.path.join(tmp_path, ".").

    # Generate some test data
    base = pd.DataFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    nested1 = pd.DataFrame(
        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    nested2 = pd.DataFrame(
        data={"e": [0, 2, 4, 1, 4, 3, 1, 4, 1], "f": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    # Save to parquet
    base.to_parquet(tmp_path / "base.parquet")
    nested1.to_parquet(tmp_path / "nested1.parquet")
    nested2.to_parquet(tmp_path / "nested2.parquet")

    # Read from parquet
    nf = read_parquet(
        data=tmp_path / "base.parquet",
        to_pack={
            "nested1": tmp_path / "nested1.parquet",
            "nested2": tmp_path / "nested2.parquet",
        },
        columns=columns,
        pack_columns=pack_columns,
    )

    # Check Base Columns: requested subset (or all base columns) plus the
    # two nested columns appended in to_pack order.
    if columns is not None:
        assert nf.columns.tolist() == columns + ["nested1", "nested2"]
    else:
        assert nf.columns.tolist() == base.columns.tolist() + ["nested1", "nested2"]

    # Check Nested Columns: only the requested fields were packed; nested
    # tables absent from pack_columns keep all of their fields.
    if pack_columns is not None:
        for nested_col in pack_columns:
            assert nf[nested_col].nest.fields == pack_columns[nested_col]
    else:
        for nested_col in nf.nested_columns:
            if nested_col == "nested1":
                assert nf[nested_col].nest.fields == nested1.columns.tolist()
            elif nested_col == "nested2":
                assert nf[nested_col].nest.fields == nested2.columns.tolist()

0 comments on commit 136f2c5

Please sign in to comment.