Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch from_flat 'index' keyword to 'on' #69

Merged
merged 5 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ classifiers = [
dynamic = ["version"]
requires-python = ">=3.9"
dependencies = [
'nested-pandas>=0.2.1,<0.3',
'nested-pandas>=0.3.1,<0.4.0',
'numpy',
'dask>=2024.3.0',
'dask[distributed]>=2024.3.0',
Expand Down
8 changes: 4 additions & 4 deletions src/nested_dask/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ def from_map(
return NestedFrame.from_dask_dataframe(nf)

@classmethod
def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
def from_flat(cls, df, base_columns, nested_columns=None, on=None, name="nested"):
"""Creates a NestedFrame with base and nested columns from a flat
dataframe.

Expand All @@ -303,7 +303,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
in the list will attempt to be packed into a single nested column
with the name provided in `name`. If None, defaults to all
columns not in `base_columns` (excluding the column given by `on`).
index: str, or None
on: str or None
The name of a column to use as the new index. Typically, the index
should have a unique value per row for base columns, and should
repeat for nested columns. For example, a dataframe with two
Expand All @@ -323,15 +323,15 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
meta = npd.NestedFrame(df[base_columns]._meta)

if nested_columns is None:
nested_columns = [col for col in df.columns if (col not in base_columns) and col != index]
nested_columns = [col for col in df.columns if (col not in base_columns) and col != on]

if len(nested_columns) > 0:
nested_meta = pack(df[nested_columns]._meta, name)
meta = meta.join(nested_meta)

return df.map_partitions(
lambda x: npd.NestedFrame.from_flat(
df=x, base_columns=base_columns, nested_columns=nested_columns, index=index, name=name
df=x, base_columns=base_columns, nested_columns=nested_columns, on=on, name=name
),
meta=meta,
)
Expand Down
8 changes: 7 additions & 1 deletion tests/nested_dask/conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import nested_dask as nd
import nested_pandas as npd
import numpy as np
import pandas as pd
import pyarrow as pa
import pytest


Expand All @@ -18,7 +20,11 @@ def test_dataset():
layer_data = {
"t": randomstate.random(layer_size * n_base) * 20,
"flux": randomstate.random(layer_size * n_base) * 100,
"band": randomstate.choice(["r", "g"], size=layer_size * n_base),
# Ensure pyarrow[string] dtype, not large_string
# https://github.com/lincc-frameworks/nested-dask/issues/71
"band": pd.Series(
randomstate.choice(["r", "g"], size=layer_size * n_base), dtype=pd.ArrowDtype(pa.string())
),
"index": np.arange(layer_size * n_base) % n_base,
}
layer_nf = npd.NestedFrame(data=layer_data).set_index("index").sort_index()
Expand Down
2 changes: 1 addition & 1 deletion tests/nested_dask/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def test_from_flat():
assert len(ndf_comp) == 2

# Check using an index
ndf = nd.NestedFrame.from_flat(nf, base_columns=["b"], index="a")
ndf = nd.NestedFrame.from_flat(nf, base_columns=["b"], on="a")
assert list(ndf.columns) == ["b", "nested"]
assert list(ndf["nested"].nest.fields) == ["c", "d"]
ndf_comp = ndf.compute()
Expand Down
Loading