Skip to content
This repository has been archived by the owner on Apr 1, 2024. It is now read-only.

Commit

Permalink
Merge pull request #2 from lincc-frameworks/ts-accessor
Browse files Browse the repository at this point in the history
Init impl of .ts accessor
  • Loading branch information
hombit authored Feb 1, 2024
2 parents 7fb5861 + 5363757 commit aa45391
Show file tree
Hide file tree
Showing 6 changed files with 453 additions and 14 deletions.
104 changes: 96 additions & 8 deletions docs/notebooks/intro_notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
"id": "165a7a0918b5a866",
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-29T20:39:19.731782Z",
"start_time": "2024-01-29T20:39:19.718734Z"
"end_time": "2024-02-01T15:26:25.116844Z",
"start_time": "2024-02-01T15:26:25.107993Z"
}
},
"outputs": [],
Expand Down Expand Up @@ -75,28 +75,116 @@
"id": "951dbb53d50f21c3",
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-29T20:39:19.779538Z",
"start_time": "2024-01-29T20:39:19.730689Z"
"end_time": "2024-02-01T15:26:25.170266Z",
"start_time": "2024-02-01T15:26:25.117566Z"
}
},
"outputs": [],
"source": [
"packed = pack_df(sources, name=\"light_curve\")\n",
"packed = pack_df(sources, name=\"sources\")\n",
"packed"
]
},
{
"cell_type": "markdown",
"id": "2a7fef0ef94ff597",
"metadata": {
"collapsed": false
},
"source": [
"### Get packed sources series and play with `.ts` accessors\n",
"This series is a collection of structures, each structure consists of multiple fields, and each field is a \"list\" of values. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b9860df979d217a",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-01T15:26:25.170937Z",
"start_time": "2024-02-01T15:26:25.139968Z"
}
},
"outputs": [],
"source": [
"struct_sources = packed[\"sources\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "620ad241f94d3e98",
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-29T20:39:19.779728Z",
"start_time": "2024-01-29T20:39:19.757341Z"
"end_time": "2024-02-01T15:26:25.171546Z",
"start_time": "2024-02-01T15:26:25.143260Z"
}
},
"outputs": [],
"source": [
"struct_sources.ts.to_flat()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63e47b51a269305f",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-01T15:26:25.171965Z",
"start_time": "2024-02-01T15:26:25.148412Z"
}
},
"outputs": [],
"source": []
"source": [
"struct_sources.ts.to_nested()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "371a7693866b9b70",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-01T15:26:25.182801Z",
"start_time": "2024-02-01T15:26:25.163958Z"
}
},
"outputs": [],
"source": [
"struct_sources.ts.iget(1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95631744bf97db91",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-01T15:35:00.548930Z",
"start_time": "2024-02-01T15:35:00.543882Z"
}
},
"outputs": [],
"source": [
"struct_sources.ts.get(8001)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac15e872786696ef",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-01T15:26:25.183096Z",
"start_time": "2024-02-01T15:26:25.166595Z"
}
},
"outputs": [],
"source": [
"struct_sources.ts[\"flux\"]"
]
}
],
"metadata": {
Expand Down
1 change: 1 addition & 0 deletions src/pandas_ts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .ts_accessor import TsAccessor
117 changes: 117 additions & 0 deletions src/pandas_ts/ts_accessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from typing import Any, cast

import numpy as np
import pandas as pd
import pyarrow as pa
from pandas.api.extensions import register_series_accessor

__all__ = ["TsAccessor"]


def pa_type_is_any_list(pa_type):
    """Return True if ``pa_type`` is a pyarrow list, large-list or fixed-size-list type."""
    # Predicates are tried in this order and evaluation stops at the first match.
    list_predicates = (
        pa.types.is_list,
        pa.types.is_large_list,
        pa.types.is_fixed_size_list,
    )
    return any(predicate(pa_type) for predicate in list_predicates)


@register_series_accessor("ts")
class TsAccessor:
    """Pandas Series accessor, registered as ``.ts``, for struct-of-lists series.

    The wrapped series must have a dtype exposing ``pyarrow_dtype`` that is a
    pyarrow struct type whose fields are all list types (list, large list or
    fixed-size list). ``AttributeError`` is raised when the accessor is
    constructed for any other dtype.
    """

    def __init__(self, series):
        self._check_series(series)

        self._series = series

    @staticmethod
    def _check_series(series):
        """Validate that the dtype of ``series`` is supported by the accessor."""
        dtype = series.dtype
        TsAccessor._check_dtype(dtype)

    @staticmethod
    def _check_dtype(dtype):
        """Validate that ``dtype`` is a pyarrow struct dtype of list fields.

        Raises
        ------
        AttributeError
            If ``dtype`` has no ``pyarrow_dtype`` attribute, if that pyarrow
            type is not a struct, or if any struct field is not a list type.
        """
        # TODO: check if dtype is TsDtype when it is implemented
        if not hasattr(dtype, "pyarrow_dtype"):
            raise AttributeError("Can only use .ts accessor with a Series with dtype pyarrow struct dtype")
        pyarrow_dtype = dtype.pyarrow_dtype
        if not pa.types.is_struct(pyarrow_dtype):
            raise AttributeError("Can only use .ts accessor with a Series with dtype pyarrow struct dtype")

        for field in pyarrow_dtype:
            if not pa_type_is_any_list(field.type):
                raise AttributeError(
                    f"Can only use .ts accessor with a Series with dtype pyarrow struct dtype, all fields must be list types. Given struct has unsupported field {field}"
                )

    def to_nested(self):
        """Convert ts into dataframe of nested arrays

        Returns
        -------
        pd.DataFrame
            The result of ``Series.struct.explode()`` on the wrapped series.
        """
        return self._series.struct.explode()

    def to_flat(self):
        """Convert ts into dataframe of flat arrays

        Every struct field is flattened into a single column. The index of
        the result repeats each original index value once per list element.

        Returns
        -------
        pd.DataFrame
            A dataframe with one column per struct field.

        Raises
        ------
        ValueError
            If the struct has no fields.
        """
        fields = self._series.struct.dtypes.index
        # NOTE: `not fields` cannot be used here; the truthiness of a
        # pd.Index is ambiguous and raises.
        if len(fields) == 0:
            raise ValueError("Cannot flatten a struct with no fields")

        flat_series = {}
        index = None
        for field in fields:
            list_array = cast(pa.ListArray, pa.array(self._series.struct.field(field)))
            if index is None:
                # The repeated index is built once, from the list lengths of
                # the first field, and reused for every other field.
                # NOTE(review): assumes all fields have equal list lengths
                # per row - confirm against the packer.
                index = np.repeat(self._series.index.values, np.diff(list_array.offsets))
            flat_series[field] = pd.Series(
                list_array.flatten(),
                index=index,
                name=field,
                copy=False,
            )
        return pd.DataFrame(flat_series)

    @property
    def fields(self) -> pd.Index:
        """Names of the nested columns"""
        return self._series.struct.dtypes.index

    def __getitem__(self, key: str) -> pd.Series:
        """Return the struct field named ``key`` as a series."""
        return self._series.struct.field(key)

    def get(self, index: Any) -> pd.DataFrame:
        """Get a single ts item by label (index value) as a dataframe

        Parameters
        ----------
        index : Any
            The label of the item to get, must be in the index of
            the series.

        Returns
        -------
        pd.DataFrame
            A dataframe with the nested arrays of the item.

        See Also
        --------
        pandas_ts.TsAccessor.iget : Get a single ts item by position.
        """
        item = self._series.loc[index]
        return pd.DataFrame.from_dict(item)

    def iget(self, index: int) -> pd.DataFrame:
        """Get a single ts item by position as a dataframe

        Parameters
        ----------
        index : int
            The position of the item to get, must be a valid position
            in the series, i.e. between 0 and len(series) - 1.

        Returns
        -------
        pd.DataFrame
            A dataframe with the nested arrays of the item.

        See Also
        --------
        pandas_ts.TsAccessor.get : Get a single ts item by label (index value).
        """
        item = self._series.iloc[index]
        return pd.DataFrame.from_dict(item)
15 changes: 15 additions & 0 deletions tests/pandas_ts/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from numpy.testing import assert_array_equal


def assert_nested_array_series_equal(a, b):
    """Assert that two series of nested arrays have equal indexes and items.

    The indexes are compared first; then the items of both series are
    compared pairwise, in iteration order, with ``assert_array_equal``.
    """
    assert_array_equal(a.index, b.index)
    item_pairs = zip(a, b)
    for left_item, right_item in item_pairs:
        assert_array_equal(
            left_item,
            right_item,
            err_msg=f"Series '{a.name}' is not equal series '{b.name}'",
        )


def assert_df_equal(a, b):
    """Assert that two dataframes agree on index, columns, dtypes and values.

    The structural attributes are compared first, in the order index,
    columns, dtypes; then every column of ``a`` is compared with the
    same-named column of ``b``.
    """
    for attribute in ("index", "columns", "dtypes"):
        assert_array_equal(getattr(a, attribute), getattr(b, attribute))
    for name in a.columns:
        assert_array_equal(
            a[name],
            b[name],
            err_msg=f"Column '{name}' is not equal column '{name}'",
        )
7 changes: 1 addition & 6 deletions tests/pandas_ts/test_packer.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
import numpy as np
import pandas as pd
import pyarrow as pa
from conftest import assert_nested_array_series_equal
from numpy.testing import assert_array_equal

from pandas_ts import packer


def assert_nested_array_series_equal(a, b):
    """Assert that two series of nested arrays have equal indexes and items.

    The indexes are compared first; then the items of both series are
    compared pairwise, in iteration order, with ``assert_array_equal``.
    """
    assert_array_equal(a.index, b.index)
    item_pairs = zip(a, b)
    for left_item, right_item in item_pairs:
        assert_array_equal(
            left_item,
            right_item,
            err_msg=f"Series '{a.name}' is not equal series '{b.name}'",
        )


def test_pack_df():
df = pd.DataFrame(
data={
Expand Down
Loading

0 comments on commit aa45391

Please sign in to comment.