Skip to content

Commit

Permalink
Merge pull request #169 from lincc-frameworks/add_nested_tests
Browse files Browse the repository at this point in the history
Tests for using the 'on' keyword with add_nested and packing
  • Loading branch information
wilsonbb authored Nov 6, 2024
2 parents a6b83d8 + efcadff commit fe45496
Show file tree
Hide file tree
Showing 5 changed files with 249 additions and 24 deletions.
21 changes: 10 additions & 11 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from pandas._libs import lib
from pandas._typing import Any, AnyAll, Axis, IndexLabel
from pandas.api.extensions import no_default
from pandas.api.types import is_bool_dtype
from pandas.core.computation.expr import PARSERS, PandasExprVisitor

from nested_pandas.nestedframe.utils import extract_nest_names
Expand Down Expand Up @@ -271,8 +270,8 @@ def add_nested(
index, and sort it lexicographically.
- inner: form intersection of calling frame's index with other
frame's index, preserving the order of the calling index.
on : str, list of str, default: None
Columns to join on.
on : str, default: None
A single column name to join on (a list of columns is not currently supported).
dtype : dtype or None
NestedDtype to use for the nested column; pd.ArrowDtype or
pa.DataType can also be used to specify the nested dtype. If None,
Expand All @@ -283,10 +282,13 @@ def add_nested(
NestedFrame
A new NestedFrame with the added nested column.
"""
if on is not None and not isinstance(on, str):
raise ValueError("Currently we only support a single column for 'on'")
# Add sources to objects
packed = pack(obj, name=name, on=on, dtype=dtype)
new_df = self.copy()
return new_df.join(packed, how=how)
res = new_df.join(packed, how=how, on=on)
return res

@classmethod
def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
Expand Down Expand Up @@ -519,14 +521,11 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
# to the nest and repack. Otherwise, apply it to this instance as usual,
# since it operated on the base attributes.
if isinstance(result, _SeriesFromNest):
if not is_bool_dtype(result.dtype):
raise ValueError("Query condition must evaluate to a boolean Series")

nest_name, flat_nest = result.nest_name, result.flat_nest

# Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
flat_nest = flat_nest.set_index(self[nest_name].array.list_index)
query_result = result.set_axis(self[nest_name].array.list_index)
list_index = self[nest_name].array.get_list_index()
flat_nest = flat_nest.set_index(list_index)
query_result = result.set_axis(list_index)
# Selecting flat values matching the query result
new_flat_nest = flat_nest[query_result]
new_df = self._set_filtered_flat_df(nest_name, new_flat_nest)
Expand Down Expand Up @@ -679,7 +678,7 @@ def dropna(
if subset is not None:
subset = [col.split(".")[-1] for col in subset]
target_flat = self[target].nest.to_flat()
target_flat = target_flat.set_index(self[target].array.list_index)
target_flat = target_flat.set_index(self[target].array.get_list_index())
if inplace:
target_flat.dropna(
axis=axis,
Expand Down
6 changes: 4 additions & 2 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,9 +648,11 @@ def num_chunks(self) -> int:
"""Number of chunks in underlying pyarrow.ChunkedArray"""
return self._chunked_array.num_chunks

@property
def list_index(self) -> np.ndarray:
def get_list_index(self) -> np.ndarray:
"""Keys mapping values to lists"""
if len(self) == 0:
# Since we have no list offsets, return an empty array
return np.array([], dtype=int)
list_index = np.arange(len(self))
return np.repeat(list_index, np.diff(self.list_offsets))

Expand Down
121 changes: 115 additions & 6 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
import pandas as pd
import pyarrow as pa
import pytest
from pandas.testing import assert_frame_equal

from nested_pandas import NestedFrame
from nested_pandas.datasets import generate_data
from nested_pandas.nestedframe.core import _SeriesFromNest
from pandas.testing import assert_frame_equal


def test_nestedframe_construction():
Expand Down Expand Up @@ -187,10 +188,16 @@ def test_add_nested_with_flat_df():
def test_add_nested_with_flat_df_and_mismatched_index():
"""Test add_nested when index values of base are missing matches in nested"""

base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
base = NestedFrame(
data={"a": [1, 2, 3], "b": [2, 4, 6], "new_index": [0, 1, 3] }, index=[0, 1, 2])

nested = pd.DataFrame(
data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
data={
"c": [0, 2, 4, 1, 4, 3, 1, 4, 1],
"d": [5, 4, 7, 5, 3, 1, 9, 3, 4],
# A column we can have as an alternative joining index with 'on'
"new_index": [1, 1, 1, 1, 2, 2, 5, 5, 5],
},
# no data for base index value of "2" and introduces new index value "4"
index=[0, 0, 0, 1, 1, 1, 1, 4, 4],
)
Expand All @@ -212,6 +219,35 @@ def test_add_nested_with_flat_df_and_mismatched_index():
default_res = base.add_nested(nested, "nested")
assert_frame_equal(left_res, default_res)

# Test still adding the nested frame in a "left" fashion but on the "new_index" column

# We currently don't support a list of columns for the 'on' argument
with pytest.raises(ValueError):
left_res_on = base.add_nested(nested, "nested", how="left", on=["new_index"])
# Instead we should pass a single column name, "new_index" which exists in both frames.
left_res_on = base.add_nested(nested, "nested", how="left", on="new_index")
assert "nested" in left_res_on.columns
# Check that the index of the base layer is still being used
assert (left_res_on.index == base.index).all()
# Assert that the new_index column we joined on was dropped from the nested layer
# but is present in the base layer
assert "new_index" in left_res_on.columns
assert "new_index" not in left_res_on["nested"].nest.to_flat().columns

# For each index in the columns we joined on, check that values are aligned correctly
for i in range(len(left_res_on.new_index)):
# The actual "index" value we "joined" on.
join_idx = left_res_on.new_index.iloc[i]
# Check that the nested column is aligned correctly to the base layer
if join_idx in nested["new_index"].values:
assert left_res_on.iloc[i]["nested"] is not None
# Check that it is present in the new index we constructed for the nested layer
assert join_idx in left_res_on["nested"].nest.to_flat().index
else:
# Use an iloc
assert left_res_on.iloc[i]["nested"] is None
assert join_idx not in left_res_on["nested"].nest.to_flat().index

# Test adding the nested frame in a "right" fashion, where the index of the "right"
# frame (our nested layer) is preserved
right_res = base.add_nested(nested, "nested", how="right")
Expand All @@ -235,6 +271,35 @@ def test_add_nested_with_flat_df_and_mismatched_index():
else:
assert not pd.isna(right_res.loc[idx][col])

# Test still adding the nested frame in a "right" fashion but on the "new_index" column
right_res_on = base.add_nested(nested, "nested", how="right", on="new_index")
assert "nested" in right_res_on.columns
# Check that rows were dropped if the base layer's "new_index" value is not present
# in the "right" nested layer
assert (right_res_on.new_index.values == np.unique(nested.new_index.values)).all()

# Check that the new_index column we joined on was dropped from the nested layer
assert "new_index" not in right_res_on["nested"].nest.to_flat().columns
# Check that the flattened nested layer has the same index as the original column we joined on
all(right_res_on.nested.nest.to_flat().index.values == nested.new_index.values)

# For each index check that the base layer is aligned correctly to the nested layer
for i in range(len(right_res_on)):
# The actual "index" value we "joined" on. Since it was a right join, guaranteed to
# be in the "new_index" column of the original frame we wanted to nest
join_idx = right_res_on.new_index.iloc[i]
assert join_idx in nested["new_index"].values

# Check the values for each column in our "base" layer
for col in base.columns:
if col != "new_index":
assert col in right_res_on.columns
if join_idx not in base.new_index.values:
# We expect a NaN value in the base layer due to the "right" join
assert pd.isna(right_res_on.iloc[i][col])
else:
assert not pd.isna(right_res_on.iloc[i][col])

# Test the "outer" behavior
outer_res = base.add_nested(nested, "nested", how="outer")
assert "nested" in outer_res.columns
Expand All @@ -255,6 +320,38 @@ def test_add_nested_with_flat_df_and_mismatched_index():
else:
assert not pd.isna(outer_res.loc[idx][col])

# Test still adding the nested frame in an "outer" fashion but on the "new_index" column
outer_res_on = base.add_nested(nested, "nested", how="outer", on="new_index")
assert "nested" in outer_res_on.columns
# We expect the result's new_index column to be the set union of the values of that column
# in the base and nested frames
assert set(outer_res_on.new_index) == set(base.new_index).union(set(nested.new_index))

# Check that the new_index column we joined on was dropped from the nested layer
assert "new_index" not in outer_res_on["nested"].nest.to_flat().columns
# Check that the flattened nested layer has the same index as the original column we joined on
# Note that it does not have index values only present in the base layer since those empty rows
# are dropped when we flatten the nested frame.
all(outer_res_on.nested.nest.to_flat().index.values == nested.new_index.values)

for i in range(len(outer_res_on)):
# The actual "index" value we "joined" on.
join_idx = outer_res_on.new_index.iloc[i]
# Check that the nested column is aligned correctly to the base layer
if join_idx not in nested["new_index"].values:
assert outer_res_on.iloc[i]["nested"] is None
else:
assert outer_res_on.iloc[i]["nested"] is not None
# Check the values for each column in our "base" layer
for col in base.columns:
if col != "new_index":
assert col in outer_res_on.columns
if join_idx in base.new_index.values:
# The join value exists in the base layer, so its base columns should not be NaN
assert not pd.isna(outer_res_on.iloc[i][col])
else:
assert pd.isna(outer_res_on.iloc[i][col])

# Test the "inner" behavior
inner_res = base.add_nested(nested, "nested", how="inner")
assert "nested" in inner_res.columns
Expand All @@ -268,6 +365,18 @@ def test_add_nested_with_flat_df_and_mismatched_index():
assert col in inner_res.columns
assert not pd.isna(inner_res.loc[idx][col])

# Test still adding the nested frame in an "inner" fashion but on the "new_index" column
inner_res_on = base.add_nested(nested, "nested", how="inner", on="new_index")
assert "nested" in inner_res_on.columns
# We expect the new index to be the set intersection of the base and nested column we used
# for the 'on' argument
assert set(inner_res_on.new_index) == set(base.new_index).intersection(set(nested.new_index))
# Check that the new_index column we joined on was dropped from the nested layer
assert "new_index" not in right_res_on["nested"].nest.to_flat().columns

# Since we have confirmed that the "new_index" column was the intersection that we expected
# we know that none of the joined values should be none
assert not inner_res_on.isnull().values.any()

def test_add_nested_with_series():
"""Test that add_nested correctly adds a nested column to the base df"""
Expand Down Expand Up @@ -433,7 +542,7 @@ def test_from_lists():
def test_query():
"""Test that NestedFrame.query handles nested queries correctly"""

base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
base = NestedFrame(data={"a": [1, 2, 2, 3], "b": [2, 3, 4, 6]}, index=[0, 1, 1, 2])

nested = pd.DataFrame(
data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
Expand All @@ -455,10 +564,10 @@ def test_query():

# Test nested queries
nest_queried = base.query("nested.c > 1")
assert len(nest_queried.nested.nest.to_flat()) == 5
assert len(nest_queried.nested.nest.to_flat()) == 7

nest_queried = base.query("(nested.c > 1) and (nested.d>2)")
assert len(nest_queried.nested.nest.to_flat()) == 4
assert len(nest_queried.nested.nest.to_flat()) == 5

# Check edge conditions
with pytest.raises(ValueError):
Expand Down
31 changes: 28 additions & 3 deletions tests/nested_pandas/series/test_accessor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import nested_pandas as npd
import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
from numpy.testing import assert_array_equal
from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal

import nested_pandas as npd
from nested_pandas import NestedDtype
from nested_pandas.series.ext_array import NestedExtensionArray
from nested_pandas.series.packer import pack_flat, pack_seq
from numpy.testing import assert_array_equal
from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal


def test_registered():
Expand Down Expand Up @@ -981,3 +982,27 @@ def test_values():
series = pack_seq([{"a": [1, 2, 3], "b": [3, 2, 1]}, {"a": [4, None], "b": [7, 8]}])
for value in series.nest.values():
assert_series_equal(value, series.nest[value.name])

def test_get_list_index():
    """Check that get_list_index() yields one ordinal per flat value."""
    # Edge case: a nested series with no rows must yield an empty list index.
    no_fields = pa.StructArray.from_arrays(arrays=[], names=[])
    empty = pd.Series(no_fields, dtype=NestedDtype(no_fields.type), index=[])
    assert len(empty) == 0
    assert len(empty.array.get_list_index()) == 0

    # Two rows, each holding four-element lists for the fields "a", "b" and "c".
    field_arrays = [
        pa.array([np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7])]),
        pa.array([np.array([7, 6, 4, 2]), np.array([0, 1, 2, 3])]),
        pa.array([np.array([8, 9, 1, 9]), np.array([0, 0, 2, 3])]),
    ]
    nested_struct = pa.StructArray.from_arrays(arrays=field_arrays, names=["a", "b", "c"])
    nested_series = pd.Series(nested_struct, dtype=NestedDtype(nested_struct.type), index=[5, 7])

    # The expected ordinals are 0 and 1 (row positions), not the index
    # labels 5 and 7, with one entry per flat element.
    ordinals = nested_series.array.get_list_index()
    assert len(ordinals) == nested_series.nest.flat_length
    assert np.equal(ordinals, [0, 0, 0, 0, 1, 1, 1, 1]).all()
Loading

0 comments on commit fe45496

Please sign in to comment.