Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reimplementation of NestedFrame.reduce() #96

Merged
merged 2 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 6 additions & 26 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,35 +413,15 @@ def my_sum(col1, col2):
if len(requested_columns) < len(args):
extra_args = args[len(requested_columns) :]

# find targeted layers
layers = np.unique([col[0] for col in requested_columns])

# build a flat dataframe with array columns to apply to the function
apply_df = NestedFrame()
for layer in layers:
iterators = []
for layer, col in requested_columns:
if layer == "base":
columns = [col[1] for col in requested_columns if col[0] == layer]
apply_df = apply_df.join(self[columns], how="outer")
iterators.append(self[col])
else:
# TODO: It should be faster to pass these columns to to_lists, but it's 20x slower
# columns = [col[1] for col in requested_columns if col[0] == layer]
apply_df = apply_df.join(self[layer].nest.to_lists(), how="outer")
iterators.append(self[layer].array.iter_field_lists(col))

# Translates the requested columns into the scalars or arrays we pass to func.
def translate_cols(frame, layer, col):
if layer == "base":
# We pass the "base" column as a scalar
return frame[col]
return np.asarray(frame[col])

# send arrays along to the apply call
result = apply_df.apply(
lambda x: func(
*[translate_cols(x, layer, col) for layer, col in requested_columns], *extra_args, **kwargs
),
axis=1, # to apply func on each row of our nested frame)
)
return result
results = [func(*cols, *extra_args, **kwargs) for cols in zip(*iterators)]
return NestedFrame(results, index=self.index)

def to_parquet(self, path, by_layer=False, **kwargs) -> None:
"""Creates parquet file(s) with the data of a NestedFrame, either
Expand Down
25 changes: 22 additions & 3 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

from collections.abc import Iterable, Iterator, Sequence
from collections.abc import Generator, Iterable, Iterator, Sequence
from typing import Any, Callable, cast

import numpy as np
Expand Down Expand Up @@ -648,8 +648,27 @@ def num_chunks(self) -> int:
"""Number of chunks in underlying pyarrow.ChunkedArray"""
return self._chunked_array.num_chunks

def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
    """Lazily yield the nested lists of one field, each as a numpy array

    The underlying chunked array is walked chunk by chunk; within each
    chunk the requested field is extracted and every list scalar it
    contains is converted with ``np.asarray``.

    Parameters
    ----------
    field : str
        The name of the field to iterate over.

    Yields
    ------
    np.ndarray
        The numpy array built from the values of a single list scalar.
    """
    for arrow_chunk in self._chunked_array.iterchunks():
        # The casts only inform the type checker; nothing is converted here.
        chunk_as_struct = cast(pa.StructArray, arrow_chunk)
        field_lists = cast(pa.ListArray, chunk_as_struct.field(field))
        yield from (np.asarray(scalar.values) for scalar in field_lists)

def view_fields(self, fields: str | list[str]) -> Self: # type: ignore[name-defined] # noqa: F821
"""Get a view of the series with only the specified fields
"""Get a view of the extension array with only the specified fields

Parameters
----------
Expand All @@ -659,7 +678,7 @@ def view_fields(self, fields: str | list[str]) -> Self: # type: ignore[name-def
Returns
-------
NestedExtensionArray
The view of the series with only the specified fields.
The view of the array with only the specified fields.
"""
if isinstance(fields, str):
fields = [fields]
Expand Down
16 changes: 16 additions & 0 deletions tests/nested_pandas/series/test_ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1255,6 +1255,22 @@ def test_num_chunks():
assert ext_array.num_chunks == 7


def test_iter_field_lists():
    """Test .iter_field_lists() yields the correct field lists

    Each field's iterator is materialized before comparison so the number
    of yielded arrays can be checked: a bare ``zip`` stops at the shorter
    input and would let an iterator that yields too few (or zero) arrays
    pass unnoticed.
    """
    a = [[1, 2, 3], [1, 2, 3, 4]]
    b = [np.array(["a", "b", "c"]), np.array(["x", "y", "z", "w"])]
    struct_array = pa.StructArray.from_arrays(
        arrays=[a, b],
        names=["a", "b"],
    )
    ext_array = NestedExtensionArray(struct_array)

    for field_name, desired_lists in (("a", a), ("b", b)):
        actual_lists = list(ext_array.iter_field_lists(field_name))
        # Guard against silent truncation by zip() below.
        assert len(actual_lists) == len(desired_lists)
        for actual, desired in zip(actual_lists, desired_lists):
            assert_array_equal(actual, desired)


def test_view_fields_with_single_field():
"""Tests ext_array.view("field")"""
arrays = [
Expand Down