Merge pull request #169 from lincc-frameworks/add_nested_tests

Tests for using the 'on' keyword with add_nested and packing
lincc-frameworks · Nov 6, 2024 · fe45496 · fe45496
2 parents a6b83d8 + efcadff
commit fe45496
Show file tree

Hide file tree

Showing 5 changed files with 249 additions and 24 deletions.
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -10,7 +10,6 @@
 from pandas._libs import lib
 from pandas._typing import Any, AnyAll, Axis, IndexLabel
 from pandas.api.extensions import no_default
-from pandas.api.types import is_bool_dtype
 from pandas.core.computation.expr import PARSERS, PandasExprVisitor
 
 from nested_pandas.nestedframe.utils import extract_nest_names
@@ -271,8 +270,8 @@ def add_nested(
               index, and sort it lexicographically.
             - inner: form intersection of calling frame's index with other
               frame's index, preserving the order of the calling index.
-        on : str, list of str, default: None
-            Columns to join on.
+        on : str, default: None
+            Name of a single column to join on; lists of columns are not supported.
         dtype : dtype or None
             NestedDtype to use for the nested column; pd.ArrowDtype or
             pa.DataType can also be used to specify the nested dtype. If None,
@@ -283,10 +282,13 @@ def add_nested(
         NestedFrame
             A new NestedFrame with the added nested column.
         """
+        if on is not None and not isinstance(on, str):
+            raise ValueError("Currently we only support a single column for 'on'")
         # Add sources to objects
         packed = pack(obj, name=name, on=on, dtype=dtype)
         new_df = self.copy()
-        return new_df.join(packed, how=how)
+        res = new_df.join(packed, how=how, on=on)
+        return res
 
     @classmethod
     def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
@@ -519,14 +521,11 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
         # to the nest and repack.  Otherwise, apply it to this instance as usual,
         # since it operated on the base attributes.
         if isinstance(result, _SeriesFromNest):
-            if not is_bool_dtype(result.dtype):
-                raise ValueError("Query condition must evaluate to a boolean Series")
-
             nest_name, flat_nest = result.nest_name, result.flat_nest
-
             # Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
-            flat_nest = flat_nest.set_index(self[nest_name].array.list_index)
-            query_result = result.set_axis(self[nest_name].array.list_index)
+            list_index = self[nest_name].array.get_list_index()
+            flat_nest = flat_nest.set_index(list_index)
+            query_result = result.set_axis(list_index)
             # Selecting flat values matching the query result
             new_flat_nest = flat_nest[query_result]
             new_df = self._set_filtered_flat_df(nest_name, new_flat_nest)
@@ -679,7 +678,7 @@ def dropna(
         if subset is not None:
             subset = [col.split(".")[-1] for col in subset]
         target_flat = self[target].nest.to_flat()
-        target_flat = target_flat.set_index(self[target].array.list_index)
+        target_flat = target_flat.set_index(self[target].array.get_list_index())
         if inplace:
             target_flat.dropna(
                 axis=axis,

diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py
@@ -648,9 +648,11 @@ def num_chunks(self) -> int:
         """Number of chunks in underlying pyarrow.ChunkedArray"""
         return self._chunked_array.num_chunks
 
-    @property
-    def list_index(self) -> np.ndarray:
+    def get_list_index(self) -> np.ndarray:
         """Keys mapping values to lists"""
+        if len(self) == 0:
+            # Since we have no list offsets, return an empty array
+            return np.array([], dtype=int)
         list_index = np.arange(len(self))
         return np.repeat(list_index, np.diff(self.list_offsets))
 

diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -2,10 +2,11 @@
 import pandas as pd
 import pyarrow as pa
 import pytest
+from pandas.testing import assert_frame_equal
+
 from nested_pandas import NestedFrame
 from nested_pandas.datasets import generate_data
 from nested_pandas.nestedframe.core import _SeriesFromNest
-from pandas.testing import assert_frame_equal
 
 
 def test_nestedframe_construction():
@@ -187,10 +188,16 @@ def test_add_nested_with_flat_df():
 def test_add_nested_with_flat_df_and_mismatched_index():
     """Test add_nested when index values of base are missing matches in nested"""
 
-    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+    base = NestedFrame(
+        data={"a": [1, 2, 3], "b": [2, 4, 6], "new_index": [0, 1, 3] }, index=[0, 1, 2])
 
     nested = pd.DataFrame(
-        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        data={
+            "c": [0, 2, 4, 1, 4, 3, 1, 4, 1],
+            "d": [5, 4, 7, 5, 3, 1, 9, 3, 4],
+            # A column we can have as an alternative joining index with 'on'
+            "new_index": [1, 1, 1, 1, 2, 2, 5, 5, 5],
+        },
         # no data for base index value of "2" and introduces new index value "4"
         index=[0, 0, 0, 1, 1, 1, 1, 4, 4],
     )
@@ -212,6 +219,35 @@ def test_add_nested_with_flat_df_and_mismatched_index():
     default_res = base.add_nested(nested, "nested")
     assert_frame_equal(left_res, default_res)
 
+    # Test still adding the nested frame in a "left" fashion but on the "new_index" column
+
+    # We currently don't support a list of columns for the 'on' argument
+    with pytest.raises(ValueError):
+        left_res_on = base.add_nested(nested, "nested", how="left", on=["new_index"])
+    # Instead we should pass a single column name, "new_index" which exists in both frames.
+    left_res_on = base.add_nested(nested, "nested", how="left", on="new_index")
+    assert "nested" in left_res_on.columns
+    # Check that the index of the base layer is still being used
+    assert (left_res_on.index == base.index).all()
+    # Assert that the new_index column we joined on was dropped from the nested layer
+    # but is present in the base layer
+    assert "new_index" in left_res_on.columns
+    assert "new_index" not in left_res_on["nested"].nest.to_flat().columns
+
+    # For each index in the columns we joined on, check that values are aligned correctly
+    for i in range(len(left_res_on.new_index)):
+        # The actual "index" value we "joined" on.
+        join_idx = left_res_on.new_index.iloc[i]
+        # Check that the nested column is aligned correctly to the base layer
+        if join_idx in nested["new_index"].values:
+            assert left_res_on.iloc[i]["nested"] is not None
+            # Check that it is present in the new index we constructed for the nested layer
+            assert join_idx in left_res_on["nested"].nest.to_flat().index
+        else:
+            # Use an iloc
+            assert left_res_on.iloc[i]["nested"] is None
+            assert join_idx not in left_res_on["nested"].nest.to_flat().index
+
     # Test adding the nested frame in a "right" fashion, where the index of the "right"
     # frame (our nested layer) is preserved
     right_res = base.add_nested(nested, "nested", how="right")
@@ -235,6 +271,35 @@ def test_add_nested_with_flat_df_and_mismatched_index():
             else:
                 assert not pd.isna(right_res.loc[idx][col])
 
+    # Test still adding the nested frame in a "right" fashion but on the "new_index" column
+    right_res_on = base.add_nested(nested, "nested", how="right", on="new_index")
+    assert "nested" in right_res_on.columns
+    # Check that rows were dropped if the base layer's "new_index" value is not present
+    # in the "right" nested layer
+    assert (right_res_on.new_index.values == np.unique(nested.new_index.values)).all()
+
+    # Check that the new_index column we joined on was dropped from the nested layer
+    assert "new_index" not in right_res_on["nested"].nest.to_flat().columns
+    # Check that the flattened nested layer has the same index as the original column we joined on
+    all(right_res_on.nested.nest.to_flat().index.values == nested.new_index.values)
+
+    # For each index check that the base layer is aligned correctly to the nested layer
+    for i in range(len(right_res_on)):
+        # The actual "index" value we "joined" on. Since it was a right join, guaranteed to
+        # be in the "new_index" column of the original frame we wanted to nest
+        join_idx = right_res_on.new_index.iloc[i]
+        assert join_idx in nested["new_index"].values
+
+        # Check the values for each column in our "base" layer
+        for col in base.columns:
+            if col != "new_index":
+                assert col in right_res_on.columns
+                if join_idx not in base.new_index.values:
+                    # We expect a NaN value in the base layer due to the "right" join
+                    assert pd.isna(right_res_on.iloc[i][col])
+                else:
+                    assert not pd.isna(right_res_on.iloc[i][col])
+
     # Test the "outer" behavior
     outer_res = base.add_nested(nested, "nested", how="outer")
     assert "nested" in outer_res.columns
@@ -255,6 +320,38 @@ def test_add_nested_with_flat_df_and_mismatched_index():
             else:
                 assert not pd.isna(outer_res.loc[idx][col])
 
+    # Test still adding the nested frame in an "outer" fashion but on the "new_index" column
+    outer_res_on = base.add_nested(nested, "nested", how="outer", on="new_index")
+    assert "nested" in outer_res_on.columns
+    # We expect the result's new_index column to be the set union of the values of that column
+    # in the base and nested frames
+    assert set(outer_res_on.new_index) == set(base.new_index).union(set(nested.new_index))
+
+    # Check that the new_index column we joined on was dropped from the nested layer
+    assert "new_index" not in outer_res_on["nested"].nest.to_flat().columns
+    # Check that the flattened nested layer has the same index as the original column we joined on
+    # Note that it does not have index values only present in the base layer since those empty rows
+    # are dropped when we flatten the nested frame.
+    all(outer_res_on.nested.nest.to_flat().index.values == nested.new_index.values)
+
+    for i in range(len(outer_res_on)):
+        # The actual "index" value we "joined" on.
+        join_idx = outer_res_on.new_index.iloc[i]
+        # Check that the nested column is aligned correctly to the base layer
+        if join_idx not in nested["new_index"].values:
+            assert outer_res_on.iloc[i]["nested"] is None
+        else:
+            assert outer_res_on.iloc[i]["nested"] is not None
+        # Check the values for each column in our "base" layer
+        for col in base.columns:
+            if col != "new_index":
+                assert col in outer_res_on.columns
+                if join_idx in base.new_index.values:
+                    # The join value exists in the base layer, so its columns must be populated
+                    assert not pd.isna(outer_res_on.iloc[i][col])
+                else:
+                    assert pd.isna(outer_res_on.iloc[i][col])
+
     # Test the "inner" behavior
     inner_res = base.add_nested(nested, "nested", how="inner")
     assert "nested" in inner_res.columns
@@ -268,6 +365,18 @@ def test_add_nested_with_flat_df_and_mismatched_index():
             assert col in inner_res.columns
             assert not pd.isna(inner_res.loc[idx][col])
 
+    # Test still adding the nested frame in an "inner" fashion but on the "new_index" column
+    inner_res_on = base.add_nested(nested, "nested", how="inner", on="new_index")
+    assert "nested" in inner_res_on.columns
+    # We expect the new index to be the set intersection of the base and nested column we used
+    # for the 'on' argument
+    assert set(inner_res_on.new_index) == set(base.new_index).intersection(set(nested.new_index))
+    # Check that the new_index column we joined on was dropped from the nested layer
+    assert "new_index" not in right_res_on["nested"].nest.to_flat().columns
+
+    # Since we have confirmed that the "new_index" column was the intersection that we expected
+    # we know that none of the joined values should be null
+    assert not inner_res_on.isnull().values.any()
 
 def test_add_nested_with_series():
     """Test that add_nested correctly adds a nested column to the base df"""
@@ -433,7 +542,7 @@ def test_from_lists():
 def test_query():
     """Test that NestedFrame.query handles nested queries correctly"""
 
-    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+    base = NestedFrame(data={"a": [1, 2, 2, 3], "b": [2, 3, 4, 6]}, index=[0, 1, 1, 2])
 
     nested = pd.DataFrame(
         data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
@@ -455,10 +564,10 @@ def test_query():
 
     # Test nested queries
     nest_queried = base.query("nested.c > 1")
-    assert len(nest_queried.nested.nest.to_flat()) == 5
+    assert len(nest_queried.nested.nest.to_flat()) == 7
 
     nest_queried = base.query("(nested.c > 1) and (nested.d>2)")
-    assert len(nest_queried.nested.nest.to_flat()) == 4
+    assert len(nest_queried.nested.nest.to_flat()) == 5
 
     # Check edge conditions
     with pytest.raises(ValueError):

diff --git a/tests/nested_pandas/series/test_accessor.py b/tests/nested_pandas/series/test_accessor.py
@@ -1,13 +1,14 @@
-import nested_pandas as npd
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
+from numpy.testing import assert_array_equal
+from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal
+
+import nested_pandas as npd
 from nested_pandas import NestedDtype
 from nested_pandas.series.ext_array import NestedExtensionArray
 from nested_pandas.series.packer import pack_flat, pack_seq
-from numpy.testing import assert_array_equal
-from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal
 
 
 def test_registered():
@@ -981,3 +982,27 @@ def test_values():
     series = pack_seq([{"a": [1, 2, 3], "b": [3, 2, 1]}, {"a": [4, None], "b": [7, 8]}])
     for value in series.nest.values():
         assert_series_equal(value, series.nest[value.name])
+
+def test_get_list_index():
+    """Test that the get_list_index() method works."""
+    # First check that an empty NestedSeries returns an empty list index.
+    empty_struct_array = pa.StructArray.from_arrays(arrays=[],names=[])
+    empty_series = pd.Series(empty_struct_array, dtype=NestedDtype(empty_struct_array.type), index=[])
+    assert len(empty_series) == 0
+    assert len(empty_series.array.get_list_index()) == 0
+
+    # Create a NestedType series
+    struct_array = pa.StructArray.from_arrays(
+        arrays=[
+            pa.array([np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7])]),
+            pa.array([np.array([7, 6, 4, 2]), np.array([0, 1, 2, 3])]),
+            pa.array([np.array([8, 9, 1, 9]), np.array([0, 0, 2, 3])]),
+        ],
+        names=["a", "b", "c"],
+    )
+    series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7])
+
+    # Validate the generation of a flat length ordinal array
+    list_index = series.array.get_list_index()
+    assert len(list_index) == series.nest.flat_length
+    assert np.equal(list_index, [0, 0, 0, 0, 1, 1, 1, 1]).all()