diff --git a/pyproject.toml b/pyproject.toml index f010207..6c03119 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ dynamic = ["version"] requires-python = ">=3.9" dependencies = [ - 'nested-pandas>=0.2.1,<0.3', + 'nested-pandas>=0.3.1,<0.4.0', 'numpy', 'dask>=2024.3.0', 'dask[distributed]>=2024.3.0', diff --git a/src/nested_dask/core.py b/src/nested_dask/core.py index e14a1e0..dd13bfe 100644 --- a/src/nested_dask/core.py +++ b/src/nested_dask/core.py @@ -287,7 +287,7 @@ def from_map( return NestedFrame.from_dask_dataframe(nf) @classmethod - def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"): + def from_flat(cls, df, base_columns, nested_columns=None, on=None, name="nested"): """Creates a NestedFrame with base and nested columns from a flat dataframe. @@ -303,7 +303,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest in the list will attempt to be packed into a single nested column with the name provided in `nested_name`. If None, is defined as all columns not in `base_columns`. - index: str, or None + on: str or None The name of a column to use as the new index. Typically, the index should have a unique value per row for base columns, and should repeat for nested columns. 
For example, a dataframe with two @@ -323,7 +323,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest meta = npd.NestedFrame(df[base_columns]._meta) if nested_columns is None: - nested_columns = [col for col in df.columns if (col not in base_columns) and col != index] + nested_columns = [col for col in df.columns if (col not in base_columns) and col != on] if len(nested_columns) > 0: nested_meta = pack(df[nested_columns]._meta, name) @@ -331,7 +331,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest return df.map_partitions( lambda x: npd.NestedFrame.from_flat( - df=x, base_columns=base_columns, nested_columns=nested_columns, index=index, name=name + df=x, base_columns=base_columns, nested_columns=nested_columns, on=on, name=name ), meta=meta, ) diff --git a/tests/nested_dask/conftest.py b/tests/nested_dask/conftest.py index bdcfad7..d2029bd 100644 --- a/tests/nested_dask/conftest.py +++ b/tests/nested_dask/conftest.py @@ -1,6 +1,8 @@ import nested_dask as nd import nested_pandas as npd import numpy as np +import pandas as pd +import pyarrow as pa import pytest @@ -18,7 +20,11 @@ def test_dataset(): layer_data = { "t": randomstate.random(layer_size * n_base) * 20, "flux": randomstate.random(layer_size * n_base) * 100, - "band": randomstate.choice(["r", "g"], size=layer_size * n_base), + # Ensure string[pyarrow] dtype, not large_string[pyarrow] + # https://github.com/lincc-frameworks/nested-dask/issues/71 + "band": pd.Series( + randomstate.choice(["r", "g"], size=layer_size * n_base), dtype=pd.ArrowDtype(pa.string()) + ), "index": np.arange(layer_size * n_base) % n_base, } layer_nf = npd.NestedFrame(data=layer_data).set_index("index").sort_index() diff --git a/tests/nested_dask/test_nestedframe.py b/tests/nested_dask/test_nestedframe.py index 15ae7a0..3546eab 100644 --- a/tests/nested_dask/test_nestedframe.py +++ b/tests/nested_dask/test_nestedframe.py @@ -173,7 +173,7 @@ def test_from_flat(): assert
len(ndf_comp) == 2 # Check using an index - ndf = nd.NestedFrame.from_flat(nf, base_columns=["b"], index="a") + ndf = nd.NestedFrame.from_flat(nf, base_columns=["b"], on="a") assert list(ndf.columns) == ["b", "nested"] assert list(ndf["nested"].nest.fields) == ["c", "d"] ndf_comp = ndf.compute()