From ab366d5c8021b9ff870e2894c47f605b52df502f Mon Sep 17 00:00:00 2001 From: Wilson Beebe Date: Wed, 6 Nov 2024 10:35:06 -0800 Subject: [PATCH 1/5] Switch from_flat 'index' keyword to 'on' --- src/nested_dask/core.py | 8 ++++---- tests/nested_dask/test_nestedframe.py | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/nested_dask/core.py b/src/nested_dask/core.py index e14a1e0..dd13bfe 100644 --- a/src/nested_dask/core.py +++ b/src/nested_dask/core.py @@ -287,7 +287,7 @@ def from_map( return NestedFrame.from_dask_dataframe(nf) @classmethod - def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"): + def from_flat(cls, df, base_columns, nested_columns=None, on=None, name="nested"): """Creates a NestedFrame with base and nested columns from a flat dataframe. @@ -303,7 +303,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest in the list will attempt to be packed into a single nested column with the name provided in `nested_name`. If None, is defined as all columns not in `base_columns`. - index: str, or None + on: str or None The name of a column to use as the new index. Typically, the index should have a unique value per row for base columns, and should repeat for nested columns. For example, a dataframe with two @@ -323,7 +323,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest meta = npd.NestedFrame(df[base_columns]._meta) if nested_columns is None: - nested_columns = [col for col in df.columns if (col not in base_columns) and col != index] + nested_columns = [col for col in df.columns if (col not in base_columns) and col != on] if len(nested_columns) > 0: nested_meta = pack(df[nested_columns]._meta, name) @@ -331,7 +331,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest return df.map_partitions( lambda x: npd.NestedFrame.from_flat( - df=x, base_columns=base_columns, nested_columns=nested_columns, index=index, name=name + df=x, base_columns=base_columns, nested_columns=nested_columns, on=on, name=name ), meta=meta, ) diff --git a/tests/nested_dask/test_nestedframe.py b/tests/nested_dask/test_nestedframe.py index 15ae7a0..d96d383 100644 --- a/tests/nested_dask/test_nestedframe.py +++ b/tests/nested_dask/test_nestedframe.py @@ -1,14 +1,15 @@ import dask import dask.dataframe as dd -import nested_dask as nd import nested_pandas as npd import numpy as np import pandas as pd import pyarrow as pa import pytest -from nested_dask.datasets import generate_data from nested_pandas.series.dtype import NestedDtype +import nested_dask as nd +from nested_dask.datasets import generate_data + dask.config.set({"dataframe.convert-string": False}) @@ -173,7 +174,7 @@ def test_from_flat(): assert len(ndf_comp) == 2 # Check using an index - ndf = nd.NestedFrame.from_flat(nf, base_columns=["b"], index="a") + ndf = nd.NestedFrame.from_flat(nf, base_columns=["b"], on="a") assert list(ndf.columns) == ["b", "nested"] assert list(ndf["nested"].nest.fields) == ["c", "d"] ndf_comp = ndf.compute() From cce604dedee071ab7730043df775696295fa4a55 Mon Sep 17 00:00:00 2001 From: Wilson Beebe Date: Wed, 6 Nov 2024 13:01:25 -0800 Subject: [PATCH 2/5] Update nested-pandas version requirements --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f010207..fba7c48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ dynamic = ["version"] requires-python = ">=3.9" dependencies = [ - 'nested-pandas>=0.2.1,<0.3', + 'nested-pandas>=0.3.0,<0.4.0', 'numpy', 'dask>=2024.3.0', 'dask[distributed]>=2024.3.0', From 4f97ce46c52f19fe15fcde859cd7fbf39de43733 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 6 Nov 2024 17:39:52 -0500 Subject: [PATCH 3/5] Change test_dataset band dtype --- tests/nested_dask/conftest.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/nested_dask/conftest.py b/tests/nested_dask/conftest.py index bdcfad7..d2029bd 100644 --- a/tests/nested_dask/conftest.py +++ b/tests/nested_dask/conftest.py @@ -1,6 +1,8 @@ import nested_dask as nd import nested_pandas as npd import numpy as np +import pandas as pd +import pyarrow as pa import pytest @@ -18,7 +20,11 @@ def test_dataset(): layer_data = { "t": randomstate.random(layer_size * n_base) * 20, "flux": randomstate.random(layer_size * n_base) * 100, - "band": randomstate.choice(["r", "g"], size=layer_size * n_base), + # Ensure pyarrow[string] dtype, not large_string + # https://github.com/lincc-frameworks/nested-dask/issues/71 + "band": pd.Series( + randomstate.choice(["r", "g"], size=layer_size * n_base), dtype=pd.ArrowDtype(pa.string()) + ), "index": np.arange(layer_size * n_base) % n_base, } layer_nf = npd.NestedFrame(data=layer_data).set_index("index").sort_index() From 5b60b0c1dbfcf8f8c5629b57379ce36cc4606a90 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 6 Nov 2024 17:46:50 -0500 Subject: [PATCH 4/5] Require nested-pandas>=0.3.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fba7c48..6c03119 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ dynamic = ["version"] requires-python = ">=3.9" dependencies = [ - 'nested-pandas>=0.3.0,<0.4.0', + 'nested-pandas>=0.3.1,<0.4.0', 'numpy', 'dask>=2024.3.0', 'dask[distributed]>=2024.3.0', From e4e09a7826936db2969cc516d4c16d559071510e Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Wed, 6 Nov 2024 17:51:30 -0500 Subject: [PATCH 5/5] Formatting --- tests/nested_dask/test_nestedframe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/nested_dask/test_nestedframe.py b/tests/nested_dask/test_nestedframe.py index d96d383..3546eab 100644 --- a/tests/nested_dask/test_nestedframe.py +++ b/tests/nested_dask/test_nestedframe.py @@ -1,14 +1,13 @@ import dask import dask.dataframe as dd +import nested_dask as nd import nested_pandas as npd import numpy as np import pandas as pd import pyarrow as pa import pytest -from nested_pandas.series.dtype import NestedDtype - -import nested_dask as nd from nested_dask.datasets import generate_data +from nested_pandas.series.dtype import NestedDtype dask.config.set({"dataframe.convert-string": False})