Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wrap dropna #26

Merged
merged 9 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 156 additions & 0 deletions src/nested_pandas/nestedframe/core.py
dougbrn marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

import numpy as np
import pandas as pd
from pandas._libs import lib
dougbrn marked this conversation as resolved.
Show resolved Hide resolved
from pandas._typing import AnyAll, Axis, IndexLabel
from pandas.api.extensions import no_default

from nested_pandas.series import packer
from nested_pandas.series.dtype import NestedDtype
Expand Down Expand Up @@ -154,3 +158,155 @@ def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821
# TODO: does not work with queries that empty the dataframe
result[expr] = result[expr].nest.query_flat(exprs_to_use[expr])
return result

def _resolve_dropna_target(self, on_nested, subset):
    """Resolve which layer a set of dropna kwargs is aimed at.

    Parameters
    ----------
    on_nested : str or bool
        Label of the nested column to operate on, or a falsy value when no
        nested layer was requested explicitly.
    subset : column label, sequence of labels, or None
        Columns the caller wants considered. String labels may use the
        ``"nested_df.nested_col"`` form; only the part before the first
        ``"."`` is used to decide which layer a label belongs to.

    Returns
    -------
    target : str
        ``"base"`` or the label of the targeted nested column.
    subset : list-like or None
        The ``subset`` input, wrapped in a list when a single label was
        given, otherwise returned as passed in.

    Raises
    ------
    ValueError
        If a subset label matches neither a nested nor a base column, if
        the subset labels span more than one layer, if ``on_nested`` is not
        a nested column, or if ``on_nested`` and ``subset`` disagree.
    """

    nested_cols = self.nested_columns
    columns = self.columns

    # first check the subset kwarg input
    subset_target = ""
    # Compare against None instead of relying on truthiness: array-like
    # subsets (e.g. a pandas Index) have no unambiguous truth value.
    if subset is not None:
        # Wrap a lone label (string or any other scalar) in a list
        if not pd.api.types.is_list_like(subset):
            subset = [subset]

        layers = []
        for col in subset:
            # Only string labels can carry the "nested_df.nested_col" form;
            # other hashable labels (e.g. ints) are looked up as they are.
            head = col.split(".")[0] if isinstance(col, str) else col
            if head in nested_cols:
                layers.append(str(head))
            elif head in columns:
                layers.append("base")
            else:
                raise ValueError(f"Column name {head} not found in any base or nested columns")

        # Check for 1 target
        layers = np.unique(layers)
        if len(layers) > 1:  # prohibit multi-target operations
            raise ValueError(
                f"Targeted multiple nested structures ({layers}), write one command per target dataframe"  # noqa
            )
        # An empty subset names no layer at all, so leave subset_target empty
        if len(layers) == 1:
            subset_target = str(layers[0])

    # Next check the on_nested kwarg input
    if on_nested and on_nested not in nested_cols:
        raise ValueError("Provided nested layer not found in nested dataframes")

    # Resolve target layer: both inputs must agree when both are given
    if on_nested and subset_target and on_nested != subset_target:
        raise ValueError(
            f"Provided on_nested={on_nested}, but subset columns are from {subset_target}. Make sure these are aligned or just use subset."  # noqa
        )
    if subset_target:
        target = subset_target
    elif on_nested:
        target = str(on_nested)
    else:
        target = "base"
    return target, subset

def dropna(
    self,
    *,
    axis: Axis = 0,
    how: AnyAll | lib.NoDefault = no_default,
    thresh: int | lib.NoDefault = no_default,
    on_nested: str | bool = False,
    subset: IndexLabel | None = None,
    inplace: bool = False,
    ignore_index: bool = False,
) -> NestedFrame | None:
    """
    Remove missing values for one layer of the NestedFrame.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.

        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.

        Only a single axis is allowed.

    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA or all NA.

        * 'any' : If any NA values are present, drop that row or column.
        * 'all' : If all values are NA, drop that row or column.
    thresh : int, optional
        Require that many non-NA values. Cannot be combined with how.
    on_nested : str or bool, optional
        If not False, applies the call to the nested dataframe in the
        column with label equal to the provided string. If specified,
        the nested dataframe should align with any columns given in
        `subset`.
    subset : column label or sequence of labels, optional
        Labels along other axis to consider, e.g. if you are dropping rows
        these would be a list of columns to include.

        Access nested columns using `nested_df.nested_col` (where
        `nested_df` refers to a particular nested dataframe and
        `nested_col` is a column of that nested dataframe).
    inplace : bool, default False
        Whether to modify the DataFrame rather than creating a new one.
    ignore_index : bool, default ``False``
        If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
        Only supported when the base layer is targeted.

        .. versionadded:: 2.0.0

    Returns
    -------
    DataFrame or None
        DataFrame with NA entries dropped from it. When the base layer is
        targeted the return value is whatever the parent class ``dropna``
        returns. When a nested layer is targeted with ``inplace=True``
        the modified frame itself is returned.

    Raises
    ------
    ValueError
        If `on_nested` and `subset` do not resolve to a single layer, or
        if ``ignore_index=True`` is combined with a nested target.

    Notes
    -----
    Operations that target a particular nested structure return a dataframe
    with rows of that particular nested structure affected.

    Values for `on_nested` and `subset` should be consistent in pointing
    to a single layer, multi-layer operations are not supported at this
    time.
    """

    # determine target dataframe
    target, subset = self._resolve_dropna_target(on_nested, subset)

    if target == "base":
        return super().dropna(
            axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
        )

    # The flat frame's index is what ties nested rows back to base rows when
    # repacking; relabelling it 0..n-1 would scramble that association.
    if ignore_index:
        raise ValueError("ignore_index is not supported when targeting a nested layer")

    if subset is not None:
        # Strip only the leading "nested_df." qualifier so that nested
        # column names containing dots survive; leave non-string labels be.
        subset = [col.split(".", 1)[-1] if isinstance(col, str) else col for col in subset]

    # Drop on the flattened nested frame once, then repack it into either
    # this frame (inplace) or a copy of it.
    target_flat = (
        self[target]
        .nest.to_flat()
        .dropna(
            axis=axis,
            how=how,
            thresh=thresh,
            subset=subset,
        )
    )
    result = self if inplace else self.copy()
    result[target] = packer.pack_flat(target_flat)
    return result
105 changes: 105 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
import pytest
from nested_pandas import NestedFrame
Expand Down Expand Up @@ -101,3 +102,107 @@ def test_query():

nest_queried = base.query("(nested.c > 1) and (nested.d>2)")
assert len(nest_queried.nested.nest.to_flat()) == 4


def test_dropna():
    """Test that dropna works on all layers"""

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])

    nested = pd.DataFrame(
        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    base = base.add_nested(nested, "nested")

    # Test basic functionality: dropping base row 1 also removes its 3 nested rows
    dn_base = base.dropna(subset=["b"])
    assert len(dn_base) == 2
    # The comparison must sit outside len(); len(x == 6) only measures the
    # boolean frame and is truthy for any non-empty result.
    assert len(dn_base["nested"].nest.to_flat()) == 6

    # Test on_nested kwarg: one nested row holds a NaN, base rows are untouched
    dn_on_nested = base.dropna(on_nested="nested")
    assert len(dn_on_nested) == 3
    assert len(dn_on_nested["nested"].nest.to_flat()) == 8

    # Test hierarchical column subset
    dn_hierarchical = base.dropna(subset="nested.c")
    assert len(dn_hierarchical) == 3
    assert len(dn_hierarchical["nested"].nest.to_flat()) == 8

    # Test hierarchical column subset and on_nested
    dn_hierarchical = base.dropna(on_nested="nested", subset="nested.c")
    assert len(dn_hierarchical) == 3
    assert len(dn_hierarchical["nested"].nest.to_flat()) == 8


def test_dropna_inplace_base():
    """Test in-place behavior of dropna on the base layer"""

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    base = NestedFrame(data={"a": [1, 2, 3], "b": [np.nan, 4, 6]}, index=[0, 1, 2])

    nested = pd.DataFrame(
        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    base = base.add_nested(nested, "nested")

    # Test inplace=False with base layer: the original must be left untouched
    dn_base = base.dropna(subset=["b"], inplace=False)
    assert not dn_base.equals(base)

    # Test inplace=True with base layer: the original now matches the copy
    base.dropna(subset=["b"], inplace=True)
    assert dn_base.equals(base)


def test_dropna_inplace_nested():
    """Test in-place behavior of dropna on a nested layer"""

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    base = NestedFrame(data={"a": [1, 2, 3], "b": [np.nan, 4, 6]}, index=[0, 1, 2])

    nested = pd.DataFrame(
        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    base = base.add_nested(nested, "nested")

    # Test inplace=False with nested layer: the original nested rows are untouched
    dn_base = base.dropna(on_nested="nested", inplace=False)
    assert not dn_base.nested.nest.to_flat().equals(base.nested.nest.to_flat())

    # Test inplace=True with nested layer: the original now matches the copy
    base.dropna(on_nested="nested", inplace=True)
    assert dn_base.equals(base)


def test_dropna_errors():
    """Test that the various dropna exceptions trigger"""

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])

    nested = pd.DataFrame(
        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    base = base.add_nested(nested, "nested")

    # Test multi-target: subset mixes a base column with a nested column
    with pytest.raises(ValueError):
        base.dropna(subset=["b", "nested.c"])

    # Test no-target: subset names a layer that does not exist
    with pytest.raises(ValueError):
        base.dropna(subset=["not_nested.c"])

    # Test bad on-nested value
    with pytest.raises(ValueError):
        base.dropna(on_nested="not_nested")

    # Test on-nested + subset disagreement
    with pytest.raises(ValueError):
        base.dropna(on_nested="nested", subset=["b"])