Skip to content
This repository has been archived by the owner on Apr 1, 2024. It is now read-only.

Commit

Permalink
Make item to be a pd.DataFrame
Browse files Browse the repository at this point in the history
Closes #11
  • Loading branch information
hombit committed Feb 2, 2024
1 parent 2c01f15 commit 7c612c4
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 109 deletions.
61 changes: 34 additions & 27 deletions docs/notebooks/intro_notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
"id": "165a7a0918b5a866",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:37:48.355994Z",
"start_time": "2024-02-02T14:37:48.339061Z"
"end_time": "2024-02-02T21:06:06.593949Z",
"start_time": "2024-02-02T21:06:06.560451Z"
}
},
"outputs": [],
Expand Down Expand Up @@ -75,8 +75,8 @@
"id": "951dbb53d50f21c3",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:37:55.893432Z",
"start_time": "2024-02-02T14:37:55.879979Z"
"end_time": "2024-02-02T21:06:06.594146Z",
"start_time": "2024-02-02T21:06:06.573280Z"
}
},
"outputs": [],
Expand All @@ -87,73 +87,82 @@
},
{
"cell_type": "markdown",
"id": "2a7fef0ef94ff597",
"id": "b08d49b762877dcb",
"metadata": {
"collapsed": false
},
"source": [
"### Get packed sources series and play with `.ts` accessors\n",
"This series is a collection of structures, each structure consists of multiple fields, and each field is a \"list\" of values. "
"### Single item of the packed series is returned as a new DataFrame"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "620ad241f94d3e98",
"id": "edd0b2714196c9d0",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:24.765169Z",
"start_time": "2024-02-02T14:38:24.756505Z"
"end_time": "2024-02-02T21:06:06.615393Z",
"start_time": "2024-02-02T21:06:06.595599Z"
}
},
"outputs": [],
"source": [
"packed.ts.to_flat()"
"packed.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63e47b51a269305f",
"id": "3144e1a6c5964ed9",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:25.567293Z",
"start_time": "2024-02-02T14:38:25.555048Z"
"end_time": "2024-02-02T21:39:59.851440Z",
"start_time": "2024-02-02T21:39:59.845160Z"
}
},
"outputs": [],
"source": [
"packed.ts.to_nested()"
"# Get the linearly interpolated flux for time=10\n",
"packed.apply(lambda df: np.interp(10.0, df[\"time\"], df[\"flux\"]))"
]
},
{
"cell_type": "markdown",
"id": "2a7fef0ef94ff597",
"metadata": {
"collapsed": false
},
"source": [
"### Get packed sources series and play with `.ts` accessors\n",
"This series is a collection of structures, each structure consists of multiple fields, and each field is a \"list\" of values. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "371a7693866b9b70",
"id": "620ad241f94d3e98",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:26.339595Z",
"start_time": "2024-02-02T14:38:26.334556Z"
"start_time": "2024-02-02T21:06:06.602564Z"
}
},
"outputs": [],
"source": [
"packed.ts.iget(1)"
"packed.ts.to_flat()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95631744bf97db91",
"id": "63e47b51a269305f",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:26.701825Z",
"start_time": "2024-02-02T14:38:26.699175Z"
"start_time": "2024-02-02T21:06:06.604284Z"
}
},
"outputs": [],
"source": [
"packed.ts.get(8001)"
"packed.ts.to_nested()"
]
},
{
Expand All @@ -162,8 +171,7 @@
"id": "ac15e872786696ef",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:27.235676Z",
"start_time": "2024-02-02T14:38:27.218584Z"
"start_time": "2024-02-02T21:06:06.605164Z"
}
},
"outputs": [],
Expand All @@ -177,8 +185,7 @@
"id": "dc7dbd52f1a8407a",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:40.512026Z",
"start_time": "2024-02-02T14:38:40.505424Z"
"start_time": "2024-02-02T21:06:06.605800Z"
}
},
"outputs": [],
Expand Down
43 changes: 0 additions & 43 deletions src/pandas_ts/ts_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,46 +68,3 @@ def fields(self) -> pd.Index:

def __getitem__(self, key: str) -> pd.Series:
    """Return the flat series of the struct field named ``key``."""
    field_series = self._series.struct.field(key)
    return field_series

def get(self, index: Any) -> pd.DataFrame:
    """Get a single ts item by label (index value) as a dataframe

    Parameters
    ----------
    index : Any
        The label of the item to get; it must be present in the
        index of the series.

    Returns
    -------
    pd.DataFrame
        A dataframe with the nested arrays of the item.

    See Also
    --------
    pandas_ts.TsAccessor.iget : Get a single ts item by position.
    """
    # Label-based lookup yields a dict of field name -> array of values,
    # which maps directly onto DataFrame columns.
    row = self._series.loc[index]
    return pd.DataFrame.from_dict(row)

def iget(self, index: int) -> pd.DataFrame:
    """Get a single ts item by position as a dataframe

    Parameters
    ----------
    index : int
        The position of the item to get, must be a valid position
        in the series, i.e. between 0 and len(series) - 1; negative
        positions count from the end as usual for ``iloc``.

    Returns
    -------
    pd.DataFrame
        A dataframe with the nested arrays of the item.

    See Also
    --------
    pandas_ts.TsAccessor.get : Get a single ts item by label (index value).
    """
    item = self._series.iloc[index]
    # NOTE: a leftover debug `print(item)` was removed here — it wrote
    # the raw struct dict to stdout on every call.
    return pd.DataFrame.from_dict(item)
3 changes: 3 additions & 0 deletions src/pandas_ts/ts_dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from typing import Mapping, cast

import pandas as pd
import pyarrow as pa
from pandas import ArrowDtype
from pandas.api.extensions import register_extension_dtype
Expand Down Expand Up @@ -99,3 +100,5 @@ def construct_array_type(cls) -> type[ArrowExtensionArray]:
def name(self) -> str:
    """Human-readable dtype name, e.g. ``ts<a: [double], b: [double]>``."""
    field_descriptions = (
        f"{field.name}: [{field.type.value_type!s}]" for field in self.pyarrow_dtype
    )
    return f"ts<{', '.join(field_descriptions)}>"

type = pd.DataFrame
35 changes: 34 additions & 1 deletion src/pandas_ts/ts_ext_array.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from __future__ import annotations

from typing import cast
from typing import Any, Iterator, cast

import numpy as np
import pandas as pd
import pyarrow as pa
from numpy.typing import DTypeLike

# Needed by ArrowExtensionArray.to_numpy(na_value=no_default)
from pandas._libs.lib import no_default

# It is considered to be an experimental, so we need to be careful with it.
from pandas.core.arrays import ArrowExtensionArray
from pyarrow import ExtensionArray

from pandas_ts.ts_dtype import TsDtype
from pandas_ts.utils import is_pa_type_a_list
Expand Down Expand Up @@ -64,6 +71,32 @@ def _validate(array: pa.ChunkedArray) -> None:
if not first_list_array.offsets.equals(list_array.offsets):
raise ValueError("Offsets of all ListArrays must be the same")

def __getitem__(self, item):
    """Index into the array; single struct items come back as DataFrames."""
    selected = super().__getitem__(item)
    # pyarrow materialises a struct "scalar" as a plain dict of
    # field name -> list of values; wrap it so callers get a DataFrame.
    if isinstance(selected, dict):
        return pd.DataFrame(selected, copy=True)
    return selected

def __iter__(self) -> Iterator[Any]:
    """Iterate over items, wrapping struct "scalar" dicts into DataFrames."""
    for element in super().__iter__():
        # Same dict -> DataFrame conversion as __getitem__, applied lazily.
        if isinstance(element, dict):
            yield pd.DataFrame(element, copy=True)
        else:
            yield element

def to_numpy(
    self, dtype: DTypeLike | None = None, copy: bool = False, na_value: Any = no_default
) -> np.ndarray:
    """Convert to a 1-d object ndarray whose elements are DataFrames."""
    raw = super().to_numpy(dtype=dtype, copy=copy, na_value=na_value)

    # np.empty + slice assignment is the only way to force numpy to build
    # a 1-d object array of DataFrames (np.array would try to iterate them).
    wrapped = np.empty(shape=raw.shape, dtype=object)
    # copy=False here: the caller's 'copy' request was already honoured by
    # ArrowExtensionArray.to_numpy above.
    wrapped[:] = [pd.DataFrame(item, copy=False) for item in raw]
    return wrapped

@property
def list_offsets(self) -> pa.ChunkedArray:
"""The list offsets of the field arrays.
Expand Down
37 changes: 0 additions & 37 deletions tests/pandas_ts/test_ts_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,40 +181,3 @@ def test_ts_accessor___getitem__():
name="b",
),
)


def test_ts_accessor_get():
    """`.ts.get` with an index label returns that row as a DataFrame."""
    import pandas_ts as _  # registers the .ts accessor

    a_values = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_values = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    struct_array = pa.StructArray.from_arrays(arrays=[a_values, b_values], names=["a", "b"])
    series = pd.Series(struct_array, dtype=pd.ArrowDtype(struct_array.type), index=[100, 101])

    expected = pd.DataFrame({"a": np.array([1.0, 2.0, 1.0]), "b": -np.array([3.0, 4.0, 5.0])})
    assert_frame_equal(series.ts.get(101), expected)


def test_ts_accessor_iget():
    """`.ts.iget` accepts a (possibly negative) position and returns a DataFrame."""
    import pandas_ts as _  # registers the .ts accessor

    a_values = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_values = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    struct_array = pa.StructArray.from_arrays(arrays=[a_values, b_values], names=["a", "b"])

    series = pd.Series(struct_array, dtype=pd.ArrowDtype(struct_array.type), index=[100, 101])

    expected = pd.DataFrame({"a": np.array([1.0, 2.0, 3.0]), "b": -np.array([4.0, 5.0, 6.0])})
    assert_frame_equal(series.ts.iget(-2), expected)
34 changes: 33 additions & 1 deletion tests/pandas_ts/test_ts_ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pyarrow as pa
import pytest
from numpy.testing import assert_array_equal
from pandas.testing import assert_series_equal
from pandas.testing import assert_frame_equal, assert_series_equal

from pandas_ts import TsDtype
from pandas_ts.ts_ext_array import TsExtensionArray
Expand Down Expand Up @@ -106,3 +106,35 @@ def test_list_offsets():

desired = pa.chunked_array([pa.array([0, 3, 6])])
assert_array_equal(ext_array.list_offsets, desired)


def test___getitem__():
    """Label access on a TsDtype series yields that element as a DataFrame."""
    a_values = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_values = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    struct_array = pa.StructArray.from_arrays(arrays=[a_values, b_values], names=["a", "b"])
    series = pd.Series(struct_array, dtype=TsDtype(struct_array.type), index=[100, 101])

    expected = pd.DataFrame({"a": np.array([1.0, 2.0, 1.0]), "b": -np.array([3.0, 4.0, 5.0])})
    assert_frame_equal(series[101], expected)


def test_series_apply_udf_argument():
    """`Series.apply` hands each element to the UDF as a DataFrame."""
    a_values = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_values = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    struct_array = pa.StructArray.from_arrays(arrays=[a_values, b_values], names=["a", "b"])
    series = pd.Series(struct_array, dtype=TsDtype(struct_array.type), index=[100, 101])

    # Identity UDF: the result series should hold DataFrames.
    series_of_dfs = series.apply(lambda frame: frame)
    expected_first = pd.DataFrame({"a": np.array([1.0, 2.0, 3.0]), "b": -np.array([4.0, 5.0, 6.0])})
    assert_frame_equal(series_of_dfs.iloc[0], expected_first)

0 comments on commit 7c612c4

Please sign in to comment.