Merge pull request #21 from lincc-frameworks/read_parquet
Initial read_parquet MVP implementation
Showing 4 changed files with 137 additions and 1 deletion.
nested_pandas/__init__.py
@@ -1,8 +1,9 @@
 from .example_module import greetings, meaning
 from .nestedframe import NestedFrame
+from .nestedframe.io import read_parquet

 # Import for registering
 from .series.accessor import NestSeriesAccessor  # noqa: F401
 from .series.dtype import NestedDtype

-__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame"]
+__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame", "read_parquet"]
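With this change the loader is part of the package's public API, so it can be imported straight from the package root (exactly as the tests below do):

from nested_pandas import read_parquet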
nested_pandas/nestedframe/__init__.py
@@ -1 +1,2 @@
 from .core import NestedFrame  # noqa
+from .io import read_parquet  # noqa
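A small side effect worth noting: because the subpackage __init__ re-exports the function, the longer import path resolves as well:

from nested_pandas.nestedframe import read_parquet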
nested_pandas/nestedframe/io.py (new file)
@@ -0,0 +1,75 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

import pandas as pd
from pandas._libs import lib
from pandas._typing import (
    DtypeBackend,
    FilePath,
    ReadBuffer,
)

from .core import NestedFrame


def read_parquet(
    data: FilePath | ReadBuffer[bytes],
    to_pack: dict,
    columns: list[str] | None = None,
    pack_columns: dict | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> NestedFrame:
    """
    Load a parquet object from a file path and load a set of other
    parquet objects to pack into the resulting NestedFrame.

    Docstring based on the Pandas equivalent.

    #TODO after MVP: Include full kwarg-set
    #TODO: Switch dtype backend default?

    Parameters
    ----------
    data : str, path object or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function.
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        gs, and file. For file URLs, a host is expected. A local file could be:
        ``file://localhost/path/to/table.parquet``.
        A file URL can also be a path to a directory that contains multiple
        partitioned parquet files. Both pyarrow and fastparquet support
        paths to directories as well as file URLs. A directory path could be:
        ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
    to_pack : dict
        A dictionary of parquet data paths (same criteria as `data`), where
        each key reflects the desired column name to pack the data into and
        each value reflects the parquet data to pack.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    pack_columns : dict, default=None
        If not None, selects a set of columns from each keyed nested parquet
        object to read from the nested files.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

    Returns
    -------
    NestedFrame
    """
    # Load the base table, then pack each nested table onto it by column name
    df = NestedFrame(pd.read_parquet(data, engine="pyarrow", columns=columns, dtype_backend=dtype_backend))

    for pack_key in to_pack:
        # Restrict each nested table to its requested column subset, if any
        col_subset = pack_columns.get(pack_key, None) if pack_columns is not None else None
        packed = pd.read_parquet(
            to_pack[pack_key], engine="pyarrow", columns=col_subset, dtype_backend=dtype_backend
        )
        df = df.add_nested(packed, pack_key)

    return df
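For orientation, a minimal usage sketch of the new loader. The file and column names here (objects.parquet, lightcurves.parquet, ra, dec, time, flux) are hypothetical, not part of this commit; any source accepted by pandas.read_parquet should work:

from nested_pandas import read_parquet

# Hypothetical inputs: a base object table plus one parquet file per nested layer
nf = read_parquet(
    data="objects.parquet",
    to_pack={"lightcurves": "lightcurves.parquet"},
    columns=["ra", "dec"],  # optional: read only these base columns
    pack_columns={"lightcurves": ["time", "flux"]},  # optional: per-nested subsets
)

Each key of to_pack becomes a packed column on the result, with nested rows matched to base rows by shared index values, as the test below exercises.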
Test module for read_parquet (new file)
@@ -0,0 +1,59 @@
import os

import pandas as pd
import pytest
from nested_pandas import read_parquet


@pytest.mark.parametrize("columns", [["a"], None])
@pytest.mark.parametrize("pack_columns", [{"nested1": ["c"], "nested2": ["e"]}, {"nested1": ["d"]}, None])
def test_read_parquet(tmp_path, columns, pack_columns):
    """Test nested parquet loading"""
    # Setup a temporary directory for files
    save_path = os.path.join(tmp_path, ".")

    # Generate some test data
    base = pd.DataFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    nested1 = pd.DataFrame(
        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    nested2 = pd.DataFrame(
        data={"e": [0, 2, 4, 1, 4, 3, 1, 4, 1], "f": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    # Save to parquet
    base.to_parquet(os.path.join(save_path, "base.parquet"))
    nested1.to_parquet(os.path.join(save_path, "nested1.parquet"))
    nested2.to_parquet(os.path.join(save_path, "nested2.parquet"))

    # Read from parquet
    nf = read_parquet(
        data=os.path.join(save_path, "base.parquet"),
        to_pack={
            "nested1": os.path.join(save_path, "nested1.parquet"),
            "nested2": os.path.join(save_path, "nested2.parquet"),
        },
        columns=columns,
        pack_columns=pack_columns,
    )

    # Check Base Columns
    if columns is not None:
        assert nf.columns.tolist() == columns + ["nested1", "nested2"]
    else:
        assert nf.columns.tolist() == base.columns.tolist() + ["nested1", "nested2"]

    # Check Nested Columns
    if pack_columns is not None:
        for nested_col in pack_columns:
            assert nf[nested_col].nest.fields == pack_columns[nested_col]
    else:
        for nested_col in nf.nested_columns:
            if nested_col == "nested1":
                assert nf[nested_col].nest.fields == nested1.columns.tolist()
            elif nested_col == "nested2":
                assert nf[nested_col].nest.fields == nested2.columns.tolist()
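As a closing note, the structure these assertions check can be built directly in memory; a short sketch mirroring the NestedFrame(...) / add_nested(...) calls in io.py above (no parquet round-trip, one nested layer for brevity):

import pandas as pd
from nested_pandas import NestedFrame

# Same shapes as the test data: three base rows, three nested rows per base row
base = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
nested1 = pd.DataFrame(
    {"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
    index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)

nf = NestedFrame(base).add_nested(nested1, "nested1")
assert nf.columns.tolist() == ["a", "b", "nested1"]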