Skip to content

Commit

Permalink
Merge pull request #96 from lincc-frameworks/reduce-reimpl
Browse files Browse the repository at this point in the history
Reimplementation of NestedFrame.reduce()
  • Loading branch information
hombit authored May 30, 2024
2 parents aadb12f + fc0b320 commit 82fe541
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 29 deletions.
32 changes: 6 additions & 26 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,35 +413,15 @@ def my_sum(col1, col2):
if len(requested_columns) < len(args):
extra_args = args[len(requested_columns) :]

# find targeted layers
layers = np.unique([col[0] for col in requested_columns])

# build a flat dataframe with array columns to apply to the function
apply_df = NestedFrame()
for layer in layers:
iterators = []
for layer, col in requested_columns:
if layer == "base":
columns = [col[1] for col in requested_columns if col[0] == layer]
apply_df = apply_df.join(self[columns], how="outer")
iterators.append(self[col])
else:
# TODO: It should be faster to pass these columns to to_lists, but it's 20x slower
# columns = [col[1] for col in requested_columns if col[0] == layer]
apply_df = apply_df.join(self[layer].nest.to_lists(), how="outer")
iterators.append(self[layer].array.iter_field_lists(col))

# Translates the requested columns into the scalars or arrays we pass to func.
def translate_cols(frame, layer, col):
if layer == "base":
# We pass the "base" column as a scalar
return frame[col]
return np.asarray(frame[col])

# send arrays along to the apply call
result = apply_df.apply(
lambda x: func(
*[translate_cols(x, layer, col) for layer, col in requested_columns], *extra_args, **kwargs
),
axis=1, # to apply func on each row of our nested frame)
)
return result
results = [func(*cols, *extra_args, **kwargs) for cols in zip(*iterators)]
return NestedFrame(results, index=self.index)

def to_parquet(self, path, by_layer=False, **kwargs) -> None:
"""Creates parquet file(s) with the data of a NestedFrame, either
Expand Down
25 changes: 22 additions & 3 deletions src/nested_pandas/series/ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

from collections.abc import Iterable, Iterator, Sequence
from collections.abc import Generator, Iterable, Iterator, Sequence
from typing import Any, Callable, cast

import numpy as np
Expand Down Expand Up @@ -648,8 +648,27 @@ def num_chunks(self) -> int:
"""Number of chunks in underlying pyarrow.ChunkedArray"""
return self._chunked_array.num_chunks

def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
    """Yield the nested lists of a single field as numpy arrays.

    Chunks of the underlying chunked array are visited in order, and within
    each chunk one array is produced per list element of the field.

    Parameters
    ----------
    field : str
        The name of the field to iterate over.

    Yields
    ------
    np.ndarray
        ``np.asarray`` applied to the values of one list scalar.
    """
    for chunk in self._chunked_array.iterchunks():
        # The casts have no runtime effect; they only tell the type checker
        # that each chunk is a struct array whose field is a list array.
        lists_of_field = cast(pa.ListArray, cast(pa.StructArray, chunk).field(field))
        yield from (np.asarray(item.values) for item in lists_of_field)

def view_fields(self, fields: str | list[str]) -> Self: # type: ignore[name-defined] # noqa: F821
"""Get a view of the series with only the specified fields
"""Get a view of the extension array with only the specified fields
Parameters
----------
Expand All @@ -659,7 +678,7 @@ def view_fields(self, fields: str | list[str]) -> Self: # type: ignore[name-def
Returns
-------
NestedExtensionArray
The view of the series with only the specified fields.
The view of the array with only the specified fields.
"""
if isinstance(fields, str):
fields = [fields]
Expand Down
16 changes: 16 additions & 0 deletions tests/nested_pandas/series/test_ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1255,6 +1255,22 @@ def test_num_chunks():
assert ext_array.num_chunks == 7


def test_iter_field_lists():
    """Test .iter_field_lists() yields every field list with the correct values"""
    a = [[1, 2, 3], [1, 2, 3, 4]]
    b = [np.array(["a", "b", "c"]), np.array(["x", "y", "z", "w"])]
    struct_array = pa.StructArray.from_arrays(
        arrays=[a, b],
        names=["a", "b"],
    )
    ext_array = NestedExtensionArray(struct_array)

    for field, desired_lists in (("a", a), ("b", b)):
        actual_lists = list(ext_array.iter_field_lists(field))
        # zip() silently stops at the shorter input, so an iterator yielding
        # too few (or zero) lists would otherwise pass; check the count first.
        assert len(actual_lists) == len(desired_lists)
        for actual, desired in zip(actual_lists, desired_lists):
            assert_array_equal(actual, desired)


def test_view_fields_with_single_field():
"""Tests ext_array.view("field")"""
arrays = [
Expand Down

0 comments on commit 82fe541

Please sign in to comment.