Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle non-unique index #168

Merged
merged 12 commits into from
Nov 6, 2024
94 changes: 56 additions & 38 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,12 @@
from pandas._libs import lib
from pandas._typing import Any, AnyAll, Axis, IndexLabel
from pandas.api.extensions import no_default
from pandas.api.types import is_bool_dtype
from pandas.core.computation.expr import PARSERS, PandasExprVisitor

from nested_pandas.series import packer
from nested_pandas.nestedframe.utils import extract_nest_names
from nested_pandas.series.dtype import NestedDtype

from ..series.packer import pack_sorted_df_into_struct
from .utils import extract_nest_names
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct


class NestedPandasExprVisitor(PandasExprVisitor):
Expand Down Expand Up @@ -219,10 +218,8 @@
"." in key and key.split(".")[0] in self.nested_columns
):
nested, col = key.split(".")
new_flat = self[nested].nest.to_flat()
new_flat[col] = value
packed = packer.pack(new_flat)
return super().__setitem__(nested, packed)
new_nested_series = self[nested].nest.with_flat_field(col, value)
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
return super().__setitem__(nested, new_nested_series)

# Adding a new nested structure from a column
# Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
Expand All @@ -231,8 +228,9 @@
if isinstance(value, pd.Series):
value.name = col
value = value.to_frame()
packed = packer.pack(value)
return super().__setitem__(new_nested, packed)
new_df = self.add_nested(value, name=new_nested)
self._update_inplace(new_df)
return None

return super().__setitem__(key, value)

Expand All @@ -242,6 +240,7 @@
name: str,
*,
how: str = "left",
on: None | str | list[str] = None,
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> Self: # type: ignore[name-defined] # noqa: F821
"""Packs input object to a nested column and adds it to the NestedFrame
Expand Down Expand Up @@ -272,6 +271,8 @@
index, and sort it lexicographically.
- inner: form intersection of calling frame's index with other
frame's index, preserving the order of the calling index.
on : str, list of str, default: None
Column name(s) of `obj` to join on. If None, the index of `obj` is used.
dtype : dtype or None
NestedDtype to use for the nested column; pd.ArrowDtype or
pa.DataType can also be used to specify the nested dtype. If None,
Expand All @@ -283,12 +284,12 @@
A new NestedFrame with the added nested column.
"""
# Add sources to objects
packed = packer.pack(obj, name=name, dtype=dtype)
packed = pack(obj, name=name, on=on, dtype=dtype)
new_df = self.copy()
return new_df.join(packed, how=how)

@classmethod
def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
"""Creates a NestedFrame with base and nested columns from a flat
dataframe.

Expand All @@ -304,7 +305,7 @@
in the list will attempt to be packed into a single nested column
with the name provided in `name`. If None, is defined as all
columns not in `base_columns`.
index: str, or None
on: str or None
The name of a column to use as the new index. Typically, the index
should have a unique value per row for base columns, and should
repeat for nested columns. For example, a dataframe with two
Expand All @@ -330,11 +331,11 @@
"""

# Resolve new index
if index is not None:
if on is not None:
# if a base column is chosen remove it
if index in base_columns:
base_columns = [col for col in base_columns if col != index]
df = df.set_index(index)
if on in base_columns:
base_columns = [col for col in base_columns if col != on]
df = df.set_index(on)

# drop duplicates on index
out_df = df[base_columns][~df.index.duplicated(keep="first")]
Expand Down Expand Up @@ -401,7 +402,7 @@
raise ValueError("No columns were assigned as list columns.")

# Pack list columns into a nested column
packed_df = packer.pack_lists(df[list_columns])
packed_df = pack_lists(df[list_columns])
packed_df.name = name

# join the nested column to the base_column df
Expand Down Expand Up @@ -518,18 +519,37 @@
# to the nest and repack. Otherwise, apply it to this instance as usual,
# since it operated on the base attributes.
if isinstance(result, _SeriesFromNest):
if not is_bool_dtype(result.dtype):
raise ValueError("Query condition must evaluate to a boolean Series")

Check warning on line 523 in src/nested_pandas/nestedframe/core.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/nestedframe/core.py#L523

Added line #L523 was not covered by tests
hombit marked this conversation as resolved.
Show resolved Hide resolved

nest_name, flat_nest = result.nest_name, result.flat_nest
new_flat_nest = flat_nest.loc[result]
result = self.copy()
result[nest_name] = pack_sorted_df_into_struct(new_flat_nest)

# Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
flat_nest = flat_nest.set_index(self[nest_name].array.list_index)
query_result = result.set_axis(self[nest_name].array.list_index)
hombit marked this conversation as resolved.
Show resolved Hide resolved
# Selecting flat values matching the query result
new_flat_nest = flat_nest[query_result]
new_df = self._set_filtered_flat_df(nest_name, new_flat_nest)
else:
result = self.loc[result]
new_df = self.loc[result]

if inplace:
self._update_inplace(result)
self._update_inplace(new_df)
return None
else:
return result
return new_df

def _set_filtered_flat_df(self, nest_name, flat_df):
    """Swap a nested column for a packed, already-filtered flat dataframe

    The frame is temporarily given a positional index so that the packed
    values are matched to rows by position rather than by label; the labels
    of ``self.index`` may be arbitrary and repeated (e.g. ["a", "b", "a"]).

    Parameters
    ----------
    nest_name : column label
        Name of the nested column to replace.
    flat_df : pd.DataFrame
        Filtered flat dataframe carrying the "ordinal" index, e.g.
        ``flat_df.index == [0, 2, 2, 2]`` when
        ``self[nest_name].array.list_index`` is
        ``[0, 0, 1, 1, 1, 2, 2, 2, 2]``.

    Returns
    -------
    A new frame with the index of `self` restored.
    """
    # Drop the (possibly non-unique) labels in favour of 0..n-1 positions,
    # which is what the ordinal index of flat_df refers to.
    positional = self.reset_index(drop=True)
    # NOTE(review): assumes the packed series is indexed by the ordinals
    # present in flat_df, so assignment aligns it to rows -- confirm against
    # pack_sorted_df_into_struct.
    repacked = pack_sorted_df_into_struct(flat_df, name=nest_name)
    positional[nest_name] = repacked
    # Put the caller's original labels back.
    return positional.set_index(self.index)

def _resolve_dropna_target(self, on_nested, subset):
"""resolves the target layer for a given set of dropna kwargs"""
Expand Down Expand Up @@ -654,34 +674,32 @@
return super().dropna(
axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
)
if ignore_index:
raise ValueError("ignore_index is not supported for nested columns")

Check warning on line 678 in src/nested_pandas/nestedframe/core.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/nestedframe/core.py#L678

Added line #L678 was not covered by tests
if subset is not None:
subset = [col.split(".")[-1] for col in subset]
target_flat = self[target].nest.to_flat()
target_flat = target_flat.set_index(self[target].array.list_index)
if inplace:
target_flat = self[target].nest.to_flat()
target_flat.dropna(
axis=axis,
how=how,
thresh=thresh,
subset=subset,
inplace=inplace,
ignore_index=ignore_index,
inplace=True,
)
self[target] = packer.pack_flat(target_flat)
return self
# Or if not inplace
new_df = self.copy()
new_df[target] = packer.pack_flat(
new_df[target]
.nest.to_flat()
.dropna(
else:
target_flat = target_flat.dropna(
axis=axis,
how=how,
thresh=thresh,
subset=subset,
inplace=inplace,
ignore_index=ignore_index,
inplace=False,
)
)
new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat)
if inplace:
self._update_inplace(new_df)
return None
return new_df

def reduce(self, func, *args, **kwargs) -> NestedFrame: # type: ignore[override]
Expand Down
6 changes: 6 additions & 0 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,12 @@ def num_chunks(self) -> int:
"""Number of chunks in underlying pyarrow.ChunkedArray"""
return self._chunked_array.num_chunks

@property
def list_index(self) -> np.ndarray:
    """Ordinal list position for every flat value

    Element ``i`` of the result is the position (0-based) of the list
    that the ``i``-th flat value belongs to; e.g. lists of lengths
    2, 1, 0, 3 give ``[0, 0, 1, 3, 3, 3]``.
    """
    # One ordinal per list ...
    ordinals = np.arange(len(self))
    # ... repeated by the list length, taken as the difference of
    # consecutive offsets (assumes list_offsets has len(self) + 1 entries).
    lengths = np.diff(self.list_offsets)
    return np.repeat(ordinals, lengths)

def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
"""Iterate over single field nested lists, as numpy arrays

Expand Down
15 changes: 11 additions & 4 deletions src/nested_pandas/series/packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
name: str | None = None,
*,
index=None,
on: None | str | list[str] = None,
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> pd.Series:
"""Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.
Expand All @@ -40,6 +41,8 @@
index : convertible to pd.Index, optional
Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index,
and this value is used to override the index after the nesting.
on: str or list of str, optional
Column name(s) to join on. If None, the index is used.
dtype : dtype or None
NestedDtype of the output series, or other type to derive from. If None,
the dtype is inferred from the first non-missing dataframe.
Expand All @@ -50,14 +53,14 @@
Output series.
"""
if isinstance(obj, pd.DataFrame):
nested = pack_flat(obj, name=name)
nested = pack_flat(obj, name=name, on=on)
if index is not None:
nested.index = index
return nested
return pack_seq(obj, name=name, index=index, dtype=dtype)


def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
def pack_flat(df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] = None) -> pd.Series:
"""Make a structure of lists representation of a "flat" dataframe.

For the input dataframe with repeated indexes, make a pandas.Series,
Expand All @@ -73,6 +76,8 @@
Input dataframe, with repeated indexes.
name : str, optional
Name of the pd.Series.
on : str or list of str, optional
Column name(s) to join on. If None, the df's index is used.

Returns
-------
Expand All @@ -86,9 +91,11 @@
nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
"""

if on is not None:
df = df.set_index(on)

Check warning on line 95 in src/nested_pandas/series/packer.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/series/packer.py#L95

Added line #L95 was not covered by tests
# pandas knows when index is pre-sorted, so it would do nothing if it is already sorted
flat = df.sort_index(kind="stable")
return pack_sorted_df_into_struct(flat, name=name)
sorted_flat = df.sort_index(kind="stable")
return pack_sorted_df_into_struct(sorted_flat, name=name)


def pack_seq(
Expand Down
12 changes: 6 additions & 6 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,8 +317,8 @@ def test_add_nested_for_empty_df():


@pytest.mark.parametrize("pandas", [False, True])
@pytest.mark.parametrize("index", [None, "a", "c"])
def test_from_flat(index, pandas):
@pytest.mark.parametrize("on", [None, "a", "c"])
def test_from_flat(on, pandas):
"""Test the NestedFrame.from_flat functionality"""

if pandas:
Expand All @@ -332,17 +332,17 @@ def test_from_flat(index, pandas):
index=[0, 0, 0, 1, 1],
)

out_nf = NestedFrame.from_flat(nf, base_columns=["a", "b"], index=index, name="new_nested")
out_nf = NestedFrame.from_flat(nf, base_columns=["a", "b"], on=on, name="new_nested")

if index is None:
if on is None:
assert list(out_nf.columns) == ["a", "b", "new_nested"]
assert list(out_nf.new_nested.nest.fields) == ["c", "d"]
assert len(out_nf) == 2
elif index == "a":
elif on == "a":
assert list(out_nf.columns) == ["b", "new_nested"]
assert list(out_nf.new_nested.nest.fields) == ["c", "d"]
assert len(out_nf) == 2
elif index == "c": # not what a user likely wants, but should still work
elif on == "c": # not what a user likely wants, but should still work
assert list(out_nf.columns) == ["a", "b", "new_nested"]
assert list(out_nf.new_nested.nest.fields) == ["d"]
assert len(out_nf) == 5
Expand Down