diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 8736c19..a0674b8 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -12,11 +12,9 @@ from pandas.api.extensions import no_default from pandas.core.computation.expr import PARSERS, PandasExprVisitor -from nested_pandas.series import packer +from nested_pandas.nestedframe.utils import extract_nest_names from nested_pandas.series.dtype import NestedDtype - -from ..series.packer import pack_sorted_df_into_struct -from .utils import extract_nest_names +from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct class NestedPandasExprVisitor(PandasExprVisitor): @@ -219,10 +217,8 @@ def __setitem__(self, key, value): "." in key and key.split(".")[0] in self.nested_columns ): nested, col = key.split(".") - new_flat = self[nested].nest.to_flat() - new_flat[col] = value - packed = packer.pack(new_flat) - return super().__setitem__(nested, packed) + new_nested_series = self[nested].nest.with_flat_field(col, value) + return super().__setitem__(nested, new_nested_series) # Adding a new nested structure from a column # Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5 @@ -231,8 +227,9 @@ def __setitem__(self, key, value): if isinstance(value, pd.Series): value.name = col value = value.to_frame() - packed = packer.pack(value) - return super().__setitem__(new_nested, packed) + new_df = self.add_nested(value, name=new_nested) + self._update_inplace(new_df) + return None return super().__setitem__(key, value) @@ -242,6 +239,7 @@ def add_nested( name: str, *, how: str = "left", + on: None | str | list[str] = None, dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None, ) -> Self: # type: ignore[name-defined] # noqa: F821 """Packs input object to a nested column and adds it to the NestedFrame @@ -272,6 +270,8 @@ def add_nested( index, and sort it lexicographically. 
- inner: form intersection of calling frame's index with other frame's index, preserving the order of the calling index. + on : str, default: None + Name of a single column to join on, present in the calling frame and (when `obj` is a flat dataframe) in `obj`; if None, the index is used. dtype : dtype or None NestedDtype to use for the nested column; pd.ArrowDtype or pa.DataType can also be used to specify the nested dtype. If None, @@ -282,13 +282,16 @@ def add_nested( NestedFrame A new NestedFrame with the added nested column. """ + if on is not None and not isinstance(on, str): + raise ValueError("Currently we only support a single column for 'on'") # Add sources to objects - packed = packer.pack(obj, name=name, dtype=dtype) + packed = pack(obj, name=name, on=on, dtype=dtype) new_df = self.copy() - return new_df.join(packed, how=how) + res = new_df.join(packed, how=how, on=on) + return res @classmethod - def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"): + def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"): """Creates a NestedFrame with base and nested columns from a flat dataframe. @@ -304,7 +307,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest in the list will attempt to be packed into a single nested column with the name provided in `nested_name`. If None, is defined as all columns not in `base_columns`. - index: str, or None + on: str or None The name of a column to use as the new index. Typically, the index should have a unique value per row for base columns, and should repeat for nested columns. 
For example, a dataframe with two @@ -330,11 +333,11 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest """ # Resolve new index - if index is not None: + if on is not None: # if a base column is chosen remove it - if index in base_columns: - base_columns = [col for col in base_columns if col != index] - df = df.set_index(index) + if on in base_columns: + base_columns = [col for col in base_columns if col != on] + df = df.set_index(on) # drop duplicates on index out_df = df[base_columns][~df.index.duplicated(keep="first")] @@ -401,7 +404,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"): raise ValueError("No columns were assigned as list columns.") # Pack list columns into a nested column - packed_df = packer.pack_lists(df[list_columns]) + packed_df = pack_lists(df[list_columns]) packed_df.name = name # join the nested column to the base_column df @@ -519,17 +522,33 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | # since it operated on the base attributes. 
if isinstance(result, _SeriesFromNest): nest_name, flat_nest = result.nest_name, result.flat_nest - new_flat_nest = flat_nest.loc[result] - result = self.copy() - result[nest_name] = pack_sorted_df_into_struct(new_flat_nest) + # Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2] + list_index = self[nest_name].array.get_list_index() + flat_nest = flat_nest.set_index(list_index) + query_result = result.set_axis(list_index) + # Selecting flat values matching the query result + new_flat_nest = flat_nest[query_result] + new_df = self._set_filtered_flat_df(nest_name, new_flat_nest) else: - result = self.loc[result] + new_df = self.loc[result] if inplace: - self._update_inplace(result) + self._update_inplace(new_df) return None else: - return result + return new_df + + def _set_filtered_flat_df(self, nest_name, flat_df): + """Set a filtered flat dataframe for a nested column + + Here we assume that flat_df has filtered "ordinal" index, + e.g. flat_df.index == [0, 2, 2, 2], while self.index + is arbitrary (e.g. ["a", "b", "a"]), + and self[nest_name].array.list_index is [0, 0, 1, 1, 1, 2, 2, 2, 2]. 
+ """ + new_df = self.reset_index(drop=True) + new_df[nest_name] = pack_sorted_df_into_struct(flat_df, name=nest_name) + return new_df.set_index(self.index) def _resolve_dropna_target(self, on_nested, subset): """resolves the target layer for a given set of dropna kwargs""" @@ -654,34 +673,32 @@ def dropna( return super().dropna( axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index ) + if ignore_index: + raise ValueError("ignore_index is not supported for nested columns") if subset is not None: subset = [col.split(".")[-1] for col in subset] + target_flat = self[target].nest.to_flat() + target_flat = target_flat.set_index(self[target].array.get_list_index()) if inplace: - target_flat = self[target].nest.to_flat() target_flat.dropna( axis=axis, how=how, thresh=thresh, subset=subset, - inplace=inplace, - ignore_index=ignore_index, + inplace=True, ) - self[target] = packer.pack_flat(target_flat) - return self - # Or if not inplace - new_df = self.copy() - new_df[target] = packer.pack_flat( - new_df[target] - .nest.to_flat() - .dropna( + else: + target_flat = target_flat.dropna( axis=axis, how=how, thresh=thresh, subset=subset, - inplace=inplace, - ignore_index=ignore_index, + inplace=False, ) - ) + new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat) + if inplace: + self._update_inplace(new_df) + return None return new_df def reduce(self, func, *args, **kwargs) -> NestedFrame: # type: ignore[override] diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py index 6f98816..f63ca8b 100644 --- a/src/nested_pandas/series/ext_array.py +++ b/src/nested_pandas/series/ext_array.py @@ -648,6 +648,14 @@ def num_chunks(self) -> int: """Number of chunks in underlying pyarrow.ChunkedArray""" return self._chunked_array.num_chunks + def get_list_index(self) -> np.ndarray: + """Keys mapping values to lists""" + if len(self) == 0: + # Since we have no list offsets, return an empty array + 
return np.array([], dtype=int) + list_index = np.arange(len(self)) + return np.repeat(list_index, np.diff(self.list_offsets)) + def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]: """Iterate over single field nested lists, as numpy arrays diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py index ff22930..1ea8abb 100644 --- a/src/nested_pandas/series/packer.py +++ b/src/nested_pandas/series/packer.py @@ -27,6 +27,7 @@ def pack( name: str | None = None, *, index=None, + on: None | str | list[str] = None, dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None, ) -> pd.Series: """Pack a "flat" dataframe or a sequence of dataframes into a "nested" series. @@ -40,6 +41,8 @@ def pack( index : convertable to pd.Index, optional Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index, and this value is used to override the index after the nesting. + on: str or list of str, optional + Column name(s) to join on. If None, the index is used. dtype : dtype or None NestedDtype of the output series, or other type to derive from. If None, the dtype is inferred from the first non-missing dataframe. @@ -50,14 +53,14 @@ def pack( Output series. """ if isinstance(obj, pd.DataFrame): - nested = pack_flat(obj, name=name) + nested = pack_flat(obj, name=name, on=on) if index is not None: nested.index = index return nested return pack_seq(obj, name=name, index=index, dtype=dtype) -def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series: +def pack_flat(df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] = None) -> pd.Series: """Make a structure of lists representation of a "flat" dataframe. For the input dataframe with repeated indexes, make a pandas.Series, @@ -73,6 +76,8 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series: Input dataframe, with repeated indexes. name : str, optional Name of the pd.Series. 
+ on : str or list of str, optional + Column name(s) to join on. If None, the df's index is used. Returns ------- @@ -86,9 +91,11 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series: nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays. """ + if on is not None: + df = df.set_index(on) # pandas knows when index is pre-sorted, so it would do nothing if it is already sorted - flat = df.sort_index(kind="stable") - return pack_sorted_df_into_struct(flat, name=name) + sorted_flat = df.sort_index(kind="stable") + return pack_sorted_df_into_struct(sorted_flat, name=name) def pack_seq( diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 151b5f0..9175b3d 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -187,10 +187,15 @@ def test_add_nested_with_flat_df(): def test_add_nested_with_flat_df_and_mismatched_index(): """Test add_nested when index values of base are missing matches in nested""" - base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) + base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6], "new_index": [0, 1, 3]}, index=[0, 1, 2]) nested = pd.DataFrame( - data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, + data={ + "c": [0, 2, 4, 1, 4, 3, 1, 4, 1], + "d": [5, 4, 7, 5, 3, 1, 9, 3, 4], + # A column we can have as an alternative joining index with 'on' + "new_index": [1, 1, 1, 1, 2, 2, 5, 5, 5], + }, # no data for base index value of "2" and introduces new index value "4" index=[0, 0, 0, 1, 1, 1, 1, 4, 4], ) @@ -212,6 +217,35 @@ def test_add_nested_with_flat_df_and_mismatched_index(): default_res = base.add_nested(nested, "nested") assert_frame_equal(left_res, default_res) + # Test still adding the nested frame in a "left" fashion but on the "new_index" column + + # We currently don't support a list of columns for the 'on' argument 
+ with pytest.raises(ValueError): + left_res_on = base.add_nested(nested, "nested", how="left", on=["new_index"]) + # Instead we should pass a single column name, "new_index" which exists in both frames. + left_res_on = base.add_nested(nested, "nested", how="left", on="new_index") + assert "nested" in left_res_on.columns + # Check that the index of the base layer is still being used + assert (left_res_on.index == base.index).all() + # Assert that the new_index column we joined on was dropped from the nested layer + # but is present in the base layer + assert "new_index" in left_res_on.columns + assert "new_index" not in left_res_on["nested"].nest.to_flat().columns + + # For each index in the columns we joined on, check that values are aligned correctly + for i in range(len(left_res_on.new_index)): + # The actual "index" value we "joined" on. + join_idx = left_res_on.new_index.iloc[i] + # Check that the nested column is aligned correctly to the base layer + if join_idx in nested["new_index"].values: + assert left_res_on.iloc[i]["nested"] is not None + # Check that it is present in the new index we constructed for the nested layer + assert join_idx in left_res_on["nested"].nest.to_flat().index + else: + # Use an iloc + assert left_res_on.iloc[i]["nested"] is None + assert join_idx not in left_res_on["nested"].nest.to_flat().index + # Test adding the nested frame in a "right" fashion, where the index of the "right" # frame (our nested layer) is preserved right_res = base.add_nested(nested, "nested", how="right") @@ -235,6 +269,35 @@ def test_add_nested_with_flat_df_and_mismatched_index(): else: assert not pd.isna(right_res.loc[idx][col]) + # Test still adding the nested frame in a "right" fashion but on the "new_index" column + right_res_on = base.add_nested(nested, "nested", how="right", on="new_index") + assert "nested" in right_res_on.columns + # Check that rows were dropped if the base layer's "new_index" value is not present + # in the "right" nested layer + 
assert (right_res_on.new_index.values == np.unique(nested.new_index.values)).all() + + # Check that the new_index column we joined on was dropped from the nested layer + assert "new_index" not in right_res_on["nested"].nest.to_flat().columns + # Check that the flattened nested layer has the same index as the original column we joined on (NOTE(review): the all(...) result below is never asserted) + all(right_res_on.nested.nest.to_flat().index.values == nested.new_index.values) + + # For each index check that the base layer is aligned correctly to the nested layer + for i in range(len(right_res_on)): + # The actual "index" value we "joined" on. Since it was a right join, guaranteed to + # be in the "new_index" column of the original frame we wanted to nest + join_idx = right_res_on.new_index.iloc[i] + assert join_idx in nested["new_index"].values + + # Check the values for each column in our "base" layer + for col in base.columns: + if col != "new_index": + assert col in right_res_on.columns + if join_idx not in base.new_index.values: + # We expect a NaN value in the base layer due to the "right" join + assert pd.isna(right_res_on.iloc[i][col]) + else: + assert not pd.isna(right_res_on.iloc[i][col]) + # Test the "outer" behavior outer_res = base.add_nested(nested, "nested", how="outer") assert "nested" in outer_res.columns @@ -255,6 +318,38 @@ def test_add_nested_with_flat_df_and_mismatched_index(): else: assert not pd.isna(outer_res.loc[idx][col]) + # Test still adding the nested frame in an "outer" fashion but on the "new_index" column + outer_res_on = base.add_nested(nested, "nested", how="outer", on="new_index") + assert "nested" in outer_res_on.columns + # We expect the result's new_index column to be the set union of the values of that column + # in the base and nested frames + assert set(outer_res_on.new_index) == set(base.new_index).union(set(nested.new_index)) + + # Check that the new_index column we joined on was dropped from the nested layer + assert "new_index" not in 
outer_res_on["nested"].nest.to_flat().columns + # Check that the flattened nested layer has the same index as the original column we joined on + # Note that it does not have index values only present in the base layer since those empty rows + # are dropped when we flatten the nested frame. NOTE(review): the all(...) result below is never asserted. + all(outer_res_on.nested.nest.to_flat().index.values == nested.new_index.values) + + for i in range(len(outer_res_on)): + # The actual "index" value we "joined" on. + join_idx = outer_res_on.new_index.iloc[i] + # Check that the nested column is aligned correctly to the base layer + if join_idx not in nested["new_index"].values: + assert outer_res_on.iloc[i]["nested"] is None + else: + assert outer_res_on.iloc[i]["nested"] is not None + # Check the values for each column in our "base" layer + for col in base.columns: + if col != "new_index": + assert col in outer_res_on.columns + if join_idx in base.new_index.values: + # The join value exists in the base layer, so its columns must not be NaN; otherwise the "outer" join leaves them NaN + assert not pd.isna(outer_res_on.iloc[i][col]) + else: + assert pd.isna(outer_res_on.iloc[i][col]) + # Test the "inner" behavior inner_res = base.add_nested(nested, "nested", how="inner") assert "nested" in inner_res.columns @@ -268,6 +363,19 @@ def test_add_nested_with_flat_df_and_mismatched_index(): assert col in inner_res.columns assert not pd.isna(inner_res.loc[idx][col]) + # Test still adding the nested frame in an "inner" fashion but on the "new_index" column + inner_res_on = base.add_nested(nested, "nested", how="inner", on="new_index") + assert "nested" in inner_res_on.columns + # We expect the new index to be the set intersection of the base and nested column we used + # for the 'on' argument + assert set(inner_res_on.new_index) == set(base.new_index).intersection(set(nested.new_index)) + # Check that the new_index column we joined on was dropped from the nested layer (NOTE(review): this checks right_res_on; presumably inner_res_on was intended) + assert "new_index" not in right_res_on["nested"].nest.to_flat().columns + + # Since we have confirmed that the "new_index" 
column was the intersection that we expected + # we know that none of the joined values should be none + assert not inner_res_on.isnull().values.any() + def test_add_nested_with_series(): """Test that add_nested correctly adds a nested column to the base df""" @@ -317,8 +425,8 @@ def test_add_nested_for_empty_df(): @pytest.mark.parametrize("pandas", [False, True]) -@pytest.mark.parametrize("index", [None, "a", "c"]) -def test_from_flat(index, pandas): +@pytest.mark.parametrize("on", [None, "a", "c"]) +def test_from_flat(on, pandas): """Test the NestedFrame.from_flat functionality""" if pandas: @@ -332,17 +440,17 @@ def test_from_flat(index, pandas): index=[0, 0, 0, 1, 1], ) - out_nf = NestedFrame.from_flat(nf, base_columns=["a", "b"], index=index, name="new_nested") + out_nf = NestedFrame.from_flat(nf, base_columns=["a", "b"], on=on, name="new_nested") - if index is None: + if on is None: assert list(out_nf.columns) == ["a", "b", "new_nested"] assert list(out_nf.new_nested.nest.fields) == ["c", "d"] assert len(out_nf) == 2 - elif index == "a": + elif on == "a": assert list(out_nf.columns) == ["b", "new_nested"] assert list(out_nf.new_nested.nest.fields) == ["c", "d"] assert len(out_nf) == 2 - elif index == "c": # not what a user likely wants, but should still work + elif on == "c": # not what a user likely wants, but should still work assert list(out_nf.columns) == ["a", "b", "new_nested"] assert list(out_nf.new_nested.nest.fields) == ["d"] assert len(out_nf) == 5 @@ -433,7 +541,7 @@ def test_from_lists(): def test_query(): """Test that NestedFrame.query handles nested queries correctly""" - base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) + base = NestedFrame(data={"a": [1, 2, 2, 3], "b": [2, 3, 4, 6]}, index=[0, 1, 1, 2]) nested = pd.DataFrame( data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, @@ -455,10 +563,10 @@ def test_query(): # Test nested queries nest_queried = base.query("nested.c > 1") - assert 
len(nest_queried.nested.nest.to_flat()) == 5 + assert len(nest_queried.nested.nest.to_flat()) == 7 nest_queried = base.query("(nested.c > 1) and (nested.d>2)") - assert len(nest_queried.nested.nest.to_flat()) == 4 + assert len(nest_queried.nested.nest.to_flat()) == 5 # Check edge conditions with pytest.raises(ValueError): diff --git a/tests/nested_pandas/series/test_accessor.py b/tests/nested_pandas/series/test_accessor.py index 9510ec1..63da799 100644 --- a/tests/nested_pandas/series/test_accessor.py +++ b/tests/nested_pandas/series/test_accessor.py @@ -981,3 +981,28 @@ def test_values(): series = pack_seq([{"a": [1, 2, 3], "b": [3, 2, 1]}, {"a": [4, None], "b": [7, 8]}]) for value in series.nest.values(): assert_series_equal(value, series.nest[value.name]) + + +def test_get_list_index(): + """Test that the get_list_index() method works.""" + # First check that an empty NestedSeries returns an empty list index. + empty_struct_array = pa.StructArray.from_arrays(arrays=[], names=[]) + empty_series = pd.Series(empty_struct_array, dtype=NestedDtype(empty_struct_array.type), index=[]) + assert len(empty_series) == 0 + assert len(empty_series.array.get_list_index()) == 0 + + # Create a NestedType series + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([0, 1, 2, 3]), np.array([4, 5, 6, 7])]), + pa.array([np.array([7, 6, 4, 2]), np.array([0, 1, 2, 3])]), + pa.array([np.array([8, 9, 1, 9]), np.array([0, 0, 2, 3])]), + ], + names=["a", "b", "c"], + ) + series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7]) + + # Validate the generation of a flat length ordinal array + list_index = series.array.get_list_index() + assert len(list_index) == series.nest.flat_length + assert np.equal(list_index, [0, 0, 0, 0, 1, 1, 1, 1]).all() diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py index 55801c0..d06f8d2 100644 --- a/tests/nested_pandas/series/test_packer.py +++ 
b/tests/nested_pandas/series/test_packer.py @@ -67,6 +67,66 @@ def test_pack_with_flat_df_and_index(): assert_series_equal(series, desired) +def test_pack_with_flat_df_and_on(): + """Test packing a dataframe on a column""" + df = pd.DataFrame( + data={ + "a": [1, 2, 3, 4], + "b": [0, 1, 0, 1], + "c": [1, 0, 1, 0], + }, + index=[1, 2, 1, 2], + ) + series = packer.pack(df, name="series", on="c") + + desired = pd.Series( + data=[ + # All of the values where the column c is 0 + (np.array([2, 4]), np.array([1, 1])), + # All of the values where the column c is 1 + (np.array([1, 3]), np.array([0, 0])), + ], + # Since we packed on 'c', we expect to see the unique sorted + # values of 'c' as the index + index=[0, 1], + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + name="series", + ) + # The index name should be the same as the column we packed on + desired.index.name = "c" + offsets_reused(series) + assert_series_equal(series, desired) + + +def test_pack_with_flat_df_and_on_and_index(): + """Test packing a dataframe on a column while also specifying an index""" + df = pd.DataFrame( + data={ + "a": [1, 2, 3, 4], + "b": [0, 1, 0, 1], + "c": [1, 0, 1, 0], + }, + index=[1, 2, 1, 2], + ) + new_index = [101, 102] + series = packer.pack(df, name="series", index=new_index, on="c") + + desired = pd.Series( + data=[ + # All of the values where the column c is 0 + (np.array([2, 4]), np.array([1, 1])), + # All of the values where the column c is 1 + (np.array([1, 3]), np.array([0, 0])), + ], + # We still expect to see the overridden index despite packing on 'c' + index=new_index, + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + name="series", + ) + offsets_reused(series) + assert_series_equal(series, desired) + + def test_pack_with_series_of_dfs(): """Test pack(pd.Series([pd.DataFrame(), ...])).""" input_series = pd.Series( @@ -127,6 +187,40 @@ def test_pack_flat(): assert_series_equal(actual, desired) +def test_pack_flat_with_on(): + """Test 
pack_flat() where you pack on a given column.""" + df = pd.DataFrame( + data={ + "a": [7, 8, 9, 1, 2, 3, 4, 5, 6], + "b": [0, 1, 0, 0, 1, 0, 1, 0, 1], + "c": [1, 0, 1, 0, 1, 0, 1, 0, 1], + }, + index=[4, 4, 4, 1, 1, 2, 2, 3, 3], + ) + # Pack on the 'c' column + actual = packer.pack_flat(df, on="c") + + desired = pd.Series( + data=[ + # Index 0: # All of the values where column 'c' is 0 + ( + np.array([8, 1, 3, 5]), # values from column 'a' + np.array([1, 0, 0, 0]), # values from column 'b' + ), + # Index 1: # All of the values where column 'c' is 1 + ( + np.array([7, 9, 2, 4, 6]), # values from column 'a' + np.array([0, 0, 1, 1, 1]), # values from column 'b' + ), + ], + index=[0, 1], + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + ) + desired.index.name = "c" + offsets_reused(actual) + assert_series_equal(actual, desired) + + def test_pack_sorted_df_into_struct(): """Test pack_sorted_df_into_struct().""" df = pd.DataFrame(