Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle non-unique index #168

Merged
merged 12 commits into from
Nov 6, 2024
95 changes: 56 additions & 39 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@
from pandas.api.extensions import no_default
from pandas.core.computation.expr import PARSERS, PandasExprVisitor

from nested_pandas.series import packer
from nested_pandas.nestedframe.utils import extract_nest_names
from nested_pandas.series.dtype import NestedDtype

from ..series.packer import pack_sorted_df_into_struct
from .utils import extract_nest_names
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct


class NestedPandasExprVisitor(PandasExprVisitor):
Expand Down Expand Up @@ -219,10 +217,8 @@
"." in key and key.split(".")[0] in self.nested_columns
):
nested, col = key.split(".")
new_flat = self[nested].nest.to_flat()
new_flat[col] = value
packed = packer.pack(new_flat)
return super().__setitem__(nested, packed)
new_nested_series = self[nested].nest.with_flat_field(col, value)
gitosaurus marked this conversation as resolved.
Show resolved Hide resolved
return super().__setitem__(nested, new_nested_series)

# Adding a new nested structure from a column
# Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
Expand All @@ -231,8 +227,9 @@
if isinstance(value, pd.Series):
value.name = col
value = value.to_frame()
packed = packer.pack(value)
return super().__setitem__(new_nested, packed)
new_df = self.add_nested(value, name=new_nested)
self._update_inplace(new_df)
return None

return super().__setitem__(key, value)

Expand All @@ -242,6 +239,7 @@
name: str,
*,
how: str = "left",
on: None | str | list[str] = None,
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> Self: # type: ignore[name-defined] # noqa: F821
"""Packs input object to a nested column and adds it to the NestedFrame
Expand Down Expand Up @@ -272,6 +270,8 @@
index, and sort it lexicographically.
- inner: form intersection of calling frame's index with other
frame's index, preserving the order of the calling index.
on : str, default: None
A column in the input dataframe to use as the join key instead of the
index; if None, the index is used.
dtype : dtype or None
NestedDtype to use for the nested column; pd.ArrowDtype or
pa.DataType can also be used to specify the nested dtype. If None,
Expand All @@ -282,13 +282,16 @@
NestedFrame
A new NestedFrame with the added nested column.
"""
if on is not None and not isinstance(on, str):
raise ValueError("Currently we only support a single column for 'on'")
# Add sources to objects
packed = packer.pack(obj, name=name, dtype=dtype)
packed = pack(obj, name=name, on=on, dtype=dtype)
new_df = self.copy()
return new_df.join(packed, how=how)
res = new_df.join(packed, how=how, on=on)
return res

@classmethod
def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
"""Creates a NestedFrame with base and nested columns from a flat
dataframe.

Expand All @@ -304,7 +307,7 @@
in the list will attempt to be packed into a single nested column
with the name provided in `nested_name`. If None, is defined as all
columns not in `base_columns`.
index: str, or None
on: str or None
The name of a column to use as the new index. Typically, the index
should have a unique value per row for base columns, and should
repeat for nested columns. For example, a dataframe with two
Expand All @@ -330,11 +333,11 @@
"""

# Resolve new index
if index is not None:
if on is not None:
# if a base column is chosen remove it
if index in base_columns:
base_columns = [col for col in base_columns if col != index]
df = df.set_index(index)
if on in base_columns:
base_columns = [col for col in base_columns if col != on]
df = df.set_index(on)

# drop duplicates on index
out_df = df[base_columns][~df.index.duplicated(keep="first")]
Expand Down Expand Up @@ -401,7 +404,7 @@
raise ValueError("No columns were assigned as list columns.")

# Pack list columns into a nested column
packed_df = packer.pack_lists(df[list_columns])
packed_df = pack_lists(df[list_columns])
packed_df.name = name

# join the nested column to the base_column df
Expand Down Expand Up @@ -519,17 +522,33 @@
# since it operated on the base attributes.
if isinstance(result, _SeriesFromNest):
nest_name, flat_nest = result.nest_name, result.flat_nest
new_flat_nest = flat_nest.loc[result]
result = self.copy()
result[nest_name] = pack_sorted_df_into_struct(new_flat_nest)
# Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
list_index = self[nest_name].array.get_list_index()
flat_nest = flat_nest.set_index(list_index)
query_result = result.set_axis(list_index)
# Selecting flat values matching the query result
new_flat_nest = flat_nest[query_result]
new_df = self._set_filtered_flat_df(nest_name, new_flat_nest)
else:
result = self.loc[result]
new_df = self.loc[result]

if inplace:
self._update_inplace(result)
self._update_inplace(new_df)
return None
else:
return result
return new_df

def _set_filtered_flat_df(self, nest_name, flat_df):
    """Re-pack a filtered flat dataframe into the nested column `nest_name`.

    Assumes `flat_df` carries a filtered "ordinal" (positional) index,
    e.g. flat_df.index == [0, 2, 2, 2], while self.index may hold
    arbitrary labels (e.g. ["a", "b", "a"]), and
    self[nest_name].array.list_index is [0, 0, 1, 1, 1, 2, 2, 2, 2].
    """
    # Remember the caller's labels so they can be restored after packing.
    original_index = self.index
    # Work on an ordinal-indexed copy so flat_df's positions line up with rows.
    result = self.reset_index(drop=True)
    result[nest_name] = pack_sorted_df_into_struct(flat_df, name=nest_name)
    return result.set_index(original_index)

def _resolve_dropna_target(self, on_nested, subset):
"""resolves the target layer for a given set of dropna kwargs"""
Expand Down Expand Up @@ -654,34 +673,32 @@
return super().dropna(
axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
)
if ignore_index:
raise ValueError("ignore_index is not supported for nested columns")

Check warning on line 677 in src/nested_pandas/nestedframe/core.py

View check run for this annotation

Codecov / codecov/patch

src/nested_pandas/nestedframe/core.py#L677

Added line #L677 was not covered by tests
if subset is not None:
subset = [col.split(".")[-1] for col in subset]
target_flat = self[target].nest.to_flat()
target_flat = target_flat.set_index(self[target].array.get_list_index())
if inplace:
target_flat = self[target].nest.to_flat()
target_flat.dropna(
axis=axis,
how=how,
thresh=thresh,
subset=subset,
inplace=inplace,
ignore_index=ignore_index,
inplace=True,
)
self[target] = packer.pack_flat(target_flat)
return self
# Or if not inplace
new_df = self.copy()
new_df[target] = packer.pack_flat(
new_df[target]
.nest.to_flat()
.dropna(
else:
target_flat = target_flat.dropna(
axis=axis,
how=how,
thresh=thresh,
subset=subset,
inplace=inplace,
ignore_index=ignore_index,
inplace=False,
)
)
new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat)
if inplace:
self._update_inplace(new_df)
return None
return new_df

def reduce(self, func, *args, **kwargs) -> NestedFrame: # type: ignore[override]
Expand Down
8 changes: 8 additions & 0 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,14 @@ def num_chunks(self) -> int:
"""Number of chunks in underlying pyarrow.ChunkedArray"""
return self._chunked_array.num_chunks

def get_list_index(self) -> np.ndarray:
    """Return, for each flat value, the ordinal of the list (row) it belongs to."""
    if len(self) == 0:
        # With no lists there are no offsets to difference; return empty keys.
        return np.array([], dtype=int)
    # Each list ordinal is repeated once per element it contains.
    lengths = np.diff(self.list_offsets)
    return np.repeat(np.arange(len(self)), lengths)

def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
"""Iterate over single field nested lists, as numpy arrays

Expand Down
15 changes: 11 additions & 4 deletions src/nested_pandas/series/packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def pack(
name: str | None = None,
*,
index=None,
on: None | str | list[str] = None,
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> pd.Series:
"""Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.
Expand All @@ -40,6 +41,8 @@ def pack(
index : convertable to pd.Index, optional
Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index,
and this value is used to override the index after the nesting.
on: str or list of str, optional
Column name(s) to join on. If None, the index is used.
dtype : dtype or None
NestedDtype of the output series, or other type to derive from. If None,
the dtype is inferred from the first non-missing dataframe.
Expand All @@ -50,14 +53,14 @@ def pack(
Output series.
"""
if isinstance(obj, pd.DataFrame):
nested = pack_flat(obj, name=name)
nested = pack_flat(obj, name=name, on=on)
if index is not None:
nested.index = index
return nested
return pack_seq(obj, name=name, index=index, dtype=dtype)


def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
def pack_flat(df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] = None) -> pd.Series:
"""Make a structure of lists representation of a "flat" dataframe.

For the input dataframe with repeated indexes, make a pandas.Series,
Expand All @@ -73,6 +76,8 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
Input dataframe, with repeated indexes.
name : str, optional
Name of the pd.Series.
on : str or list of str, optional
Column name(s) to join on. If None, the df's index is used.

Returns
-------
Expand All @@ -86,9 +91,11 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
"""

if on is not None:
df = df.set_index(on)
# pandas knows when index is pre-sorted, so it would do nothing if it is already sorted
flat = df.sort_index(kind="stable")
return pack_sorted_df_into_struct(flat, name=name)
sorted_flat = df.sort_index(kind="stable")
return pack_sorted_df_into_struct(sorted_flat, name=name)


def pack_seq(
Expand Down
Loading