Skip to content
This repository has been archived by the owner on Apr 1, 2024. It is now read-only.

Assign pd.DataFrame to TsExtensionArray elements #32

Merged
merged 4 commits into from
Feb 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 60 additions & 33 deletions docs/notebooks/intro_notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@
"id": "165a7a0918b5a866",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.184284Z",
"start_time": "2024-02-06T17:39:12.170219Z"
"end_time": "2024-02-10T02:23:42.191161Z",
"start_time": "2024-02-10T02:23:42.178595Z"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from pandas_ts.packer import pack_flat\n",
"from pandas_ts.packer import pack_flat, pack_dfs\n",
"\n",
"\n",
"# Adopted from\n",
Expand Down Expand Up @@ -74,8 +74,8 @@
"id": "951dbb53d50f21c3",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.195676Z",
"start_time": "2024-02-06T17:39:12.182905Z"
"end_time": "2024-02-10T02:23:42.217564Z",
"start_time": "2024-02-10T02:23:42.193011Z"
}
},
"outputs": [],
Expand All @@ -100,8 +100,8 @@
"id": "edd0b2714196c9d0",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.205248Z",
"start_time": "2024-02-06T17:39:12.196528Z"
"end_time": "2024-02-10T02:23:42.229553Z",
"start_time": "2024-02-10T02:23:42.204915Z"
}
},
"outputs": [],
Expand All @@ -115,8 +115,8 @@
"id": "3144e1a6c5964ed9",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.209602Z",
"start_time": "2024-02-06T17:39:12.200870Z"
"end_time": "2024-02-10T02:23:42.229932Z",
"start_time": "2024-02-10T02:23:42.211090Z"
}
},
"outputs": [],
Expand All @@ -142,8 +142,8 @@
"id": "620ad241f94d3e98",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.222079Z",
"start_time": "2024-02-06T17:39:12.204054Z"
"end_time": "2024-02-10T02:23:42.231416Z",
"start_time": "2024-02-10T02:23:42.216005Z"
}
},
"outputs": [],
Expand All @@ -157,8 +157,8 @@
"id": "63e47b51a269305f",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.252890Z",
"start_time": "2024-02-06T17:39:12.210171Z"
"end_time": "2024-02-10T02:23:42.284198Z",
"start_time": "2024-02-10T02:23:42.221854Z"
}
},
"outputs": [],
Expand All @@ -172,8 +172,8 @@
"id": "ac15e872786696ef",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.253906Z",
"start_time": "2024-02-06T17:39:12.221785Z"
"end_time": "2024-02-10T02:23:42.284672Z",
"start_time": "2024-02-10T02:23:42.236972Z"
}
},
"outputs": [],
Expand All @@ -187,8 +187,8 @@
"id": "dc7dbd52f1a8407a",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.254279Z",
"start_time": "2024-02-06T17:39:12.226204Z"
"end_time": "2024-02-10T02:23:42.284955Z",
"start_time": "2024-02-10T02:23:42.240108Z"
}
},
"outputs": [],
Expand All @@ -212,8 +212,8 @@
"id": "996f07b4d16e17e5",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.265890Z",
"start_time": "2024-02-06T17:39:12.228393Z"
"end_time": "2024-02-10T02:23:42.285205Z",
"start_time": "2024-02-10T02:23:42.242144Z"
}
},
"outputs": [],
Expand All @@ -229,19 +229,18 @@
"id": "21d5c009ef0990a4",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.271944Z",
"start_time": "2024-02-06T17:39:12.233991Z"
"end_time": "2024-02-10T02:23:42.324854Z",
"start_time": "2024-02-10T02:23:42.287797Z"
}
},
"outputs": [],
"source": [
"# Change errors for object 8003 by replacing the entire nested list series\n",
"# We need to convert it to Python lists, so we can change them in-place\n",
"err = pd.Series(packed.ts[\"err\"], dtype=object)\n",
"err[8003] = [e + 25 for e in err[8003]]\n",
"assert len(err) == len(packed.ts._series)\n",
"packed.ts[\"err\"] = err\n",
"packed.ts[\"err\"]"
"# Change errors for object 8003\n",
"light_curve = packed.loc[8003]\n",
"light_curve[\"err\"] += 25\n",
"# packed.lpc[8003] = ... does not work\n",
"packed.iloc[3:4] = [light_curve]\n",
"packed.iloc[0]"
]
},
{
Expand All @@ -250,8 +249,8 @@
"id": "3a713c94897456e1",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.284939Z",
"start_time": "2024-02-06T17:39:12.239517Z"
"end_time": "2024-02-10T02:23:42.358157Z",
"start_time": "2024-02-10T02:23:42.316636Z"
}
},
"outputs": [],
Expand All @@ -264,14 +263,42 @@
"packed"
]
},
{
"cell_type": "markdown",
"id": "7fc7ac2d28acf1de",
"metadata": {
"collapsed": false
},
"source": [
"### Change all items and pack to a new Series"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab27747eba156888",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-10T02:23:42.385333Z",
"start_time": "2024-02-10T02:23:42.360474Z"
}
},
"outputs": [],
"source": [
"# Subsample light curves\n",
"dfs = packed.apply(lambda df: df.iloc[::50])\n",
"subsampled = pack_dfs(dfs, name=\"subsampled\")\n",
"packed.loc[8000], subsampled.loc[8000]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92e6d69978ae9479",
"id": "26c558e3551b5092",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-06T17:39:12.309173Z",
"start_time": "2024-02-06T17:39:12.246921Z"
"end_time": "2024-02-10T02:23:42.387707Z",
"start_time": "2024-02-10T02:23:42.370602Z"
}
},
"outputs": [],
Expand Down
40 changes: 39 additions & 1 deletion src/pandas_ts/packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,19 @@
# "|" for python 3.9
from __future__ import annotations

from collections.abc import Sequence

import numpy as np
import pandas as pd
import pyarrow as pa

from pandas_ts.ts_dtype import TsDtype
from pandas_ts.ts_ext_array import TsExtensionArray

__all__ = ["pack_flat", "pack_lists"]
__all__ = ["pack_flat", "pack_lists", "pack_dfs"]


# Number of leading rows of each column of the first dataframe that pack_dfs()
# hands to pyarrow when inferring the field types of the packed series.
N_ROWS_INFER_DTYPE = 1000


def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame:
Expand Down Expand Up @@ -80,6 +86,38 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
return pack_sorted_df_into_struct(flat, name=name)


def pack_dfs(dfs: Sequence[pd.DataFrame], index: object = None, name: str | None = None) -> pd.Series:
    """Build a "nested" series whose elements are the given "flat" dataframes.

    The field types of the output are inferred with pyarrow from the first
    dataframe only, looking at no more than ``N_ROWS_INFER_DTYPE`` leading
    rows of each of its columns.

    Parameters
    ----------
    dfs : Sequence[pd.DataFrame]
        Dataframes to pack, one per output element. The first one is read
        positionally (``.iloc[0]`` when available, ``[0]`` otherwise).
    index : pd.Index, optional
        Index of the output series. When omitted and `dfs` is a pd.Series,
        the index of `dfs` is used.
    name : str, optional
        Name of the output series.

    Returns
    -------
    pd.Series
        Series with a TsDtype built from the inferred field types.
    """
    # Inherit the index from the input when it is a Series and none was given.
    if index is None and isinstance(dfs, pd.Series):
        index = dfs.index

    # The first dataframe serves as the template for the column names and types.
    if hasattr(dfs, "iloc"):
        template = dfs.iloc[0]
    else:
        template = dfs[0]
    columns = template.columns

    field_types = {}
    for column in columns:
        head = template[column].iloc[:N_ROWS_INFER_DTYPE]
        field_types[column] = pa.array(head).type
    dtype = TsDtype.from_fields(field_types)

    # Allocate a series of empty placeholders with the target dtype and then
    # overwrite every element with the actual dataframes.
    placeholder: dict[str, list] = {column: [] for column in columns}
    series = pd.Series([placeholder] * len(dfs), dtype=dtype, index=index, name=name)
    series[:] = dfs
    return series


def pack_sorted_df_into_struct(df: pd.DataFrame, name: str | None = None) -> pd.Series:
"""Make a structure of lists representation of a "flat" dataframe.

Expand Down
37 changes: 36 additions & 1 deletion src/pandas_ts/ts_ext_array.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

from collections.abc import Iterator
from collections.abc import Collection, Iterable, Iterator, Sequence
from typing import Any, cast

import numpy as np
Expand Down Expand Up @@ -50,6 +50,37 @@ def __init__(self, values: pa.Array | pa.ChunkedArray, *, validate: bool = True)
if validate:
self._validate(self._pa_array)

@staticmethod
def _convert_df_to_pa_scalar(df: pd.DataFrame, *, type: pa.DataType | None) -> pa.Scalar:
    """Turn a single dataframe into one pyarrow scalar.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to convert; each column becomes one entry of the scalar.
    type : pa.DataType or None
        Forwarded to ``pa.scalar`` as the ``type`` argument.

    Returns
    -------
    pa.Scalar
        Scalar built from a ``{column name: column values}`` dictionary.
    """
    columns = df.to_dict("series")
    arrays = {}
    for column_name, column in columns.items():
        arrays[column_name] = column.values
    return pa.scalar(arrays, type=type)

@staticmethod
def _convert_df_value_to_pa(value: object, *, type: pa.DataType | None) -> object:
    """Convert dataframe-valued input to pyarrow scalars, pass through the rest.

    Parameters
    ----------
    value : object
        A single pd.DataFrame, an indexable iterable whose first element is
        a pd.DataFrame, or any other object.
    type : pa.DataType or None
        Forwarded to `_convert_df_to_pa_scalar` for every converted dataframe.

    Returns
    -------
    object
        A pyarrow scalar for a single dataframe, a list of pyarrow scalars
        for a collection of dataframes, and `value` itself otherwise. Empty
        collections and mappings are returned unchanged.
    """
    # A "scalar" pd.DataFrame becomes a single pyarrow scalar.
    if isinstance(value, pd.DataFrame):
        return TsExtensionArray._convert_df_to_pa_scalar(value, type=type)

    # A dict is never a collection of dataframes; skipping it up front also
    # avoids inserting key 0 into dict subclasses with a default factory.
    if isinstance(value, dict):
        return value

    # Only indexable iterables can be a collection of dataframes.
    if not (hasattr(value, "__getitem__") and isinstance(value, Iterable)):
        return value

    # Peek at the first element to decide whether conversion is needed.
    # Objects exposing .iloc are accessed positionally, everything else with [0].
    try:
        first = value.iloc[0] if hasattr(value, "iloc") else value[0]  # type: ignore[index]
    except (IndexError, KeyError):
        # IndexError: the collection is empty (this includes an empty
        # pd.Series accessed through .iloc). KeyError: a non-dict mapping
        # without key 0. Either way there is nothing to convert.
        return value

    if isinstance(first, pd.DataFrame):
        return [TsExtensionArray._convert_df_to_pa_scalar(v, type=type) for v in value]
    return value

@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self:  # type: ignore[name-defined] # noqa: F821
    """Construct the array from `scalars`, accepting dataframes as elements.

    Dataframe inputs are first converted by `_convert_df_value_to_pa` (with
    no explicit pyarrow type); the result is then handed over to the parent
    class implementation together with `dtype` and `copy`.
    """
    converted = cls._convert_df_value_to_pa(scalars, type=None)
    # The converted value may be a collection that is not a Sequence, while
    # the parent's _from_sequence needs a Sequence: materialize it as a list.
    if isinstance(converted, Collection) and not isinstance(converted, Sequence):
        converted = list(converted)
    return super()._from_sequence(converted, dtype=dtype, copy=copy)

@staticmethod
def _validate(array: pa.ChunkedArray) -> None:
for chunk in array.iterchunks():
Expand Down Expand Up @@ -121,6 +152,10 @@ def to_numpy(self, dtype: None = None, copy: bool = False, na_value: Any = no_de
result[:] = [pd.DataFrame(value, copy=False) for value in array]
return result

def __setitem__(self, key, value) -> None:
    """Assign `value` at `key`, accepting dataframes as the assigned value.

    `value` goes through `_convert_df_value_to_pa` with this array's pyarrow
    type before the parent class performs the actual assignment.
    """
    pa_type = self._dtype.pyarrow_dtype
    super().__setitem__(key, self._convert_df_value_to_pa(value, type=pa_type))

@property
def list_offsets(self) -> pa.ChunkedArray:
"""The list offsets of the field arrays.
Expand Down
46 changes: 46 additions & 0 deletions tests/pandas_ts/test_packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,52 @@ def test_pack_lists():
assert_series_equal(series.struct.field(field_name), packed_df[field_name])


def test_dfs():
    """pack_dfs() packs a list of flat dataframes into a single nested series."""
    index = [100, 101, 102, 103]
    # Per-dataframe column values; the last dataframe is longer than the others.
    a_values = [[1, 2], [3, 4], [5, 6], [7, 8, 9]]
    b_values = [[0, 1], [0, 1], [0, 1], [0, 1, 0]]

    # Each dataframe repeats its own identifier as the index of every row.
    dfs = [
        pd.DataFrame(data={"a": a, "b": b}, index=[idx] * len(a))
        for idx, a, b in zip(index, a_values, b_values)
    ]
    series = packer.pack_dfs(dfs, index=index)

    desired = pd.Series(
        data=[(np.array(a), np.array(b)) for a, b in zip(a_values, b_values)],
        index=index,
        dtype=TsDtype.from_fields({"a": pa.int64(), "b": pa.int64()}),
    )
    assert_series_equal(series, desired)


def test_view_sorted_df_as_list_arrays():
flat_df = pd.DataFrame(
data={
Expand Down
Loading
Loading