Merge pull request #2 from lincc-frameworks/ts-accessor

Init impl of .ts accessor
lincc-frameworks · Feb 1, 2024 · aa45391 · aa45391
2 parents 7fb5861 + 5363757
commit aa45391
Show file tree

Hide file tree

Showing 6 changed files with 453 additions and 14 deletions.
diff --git a/docs/notebooks/intro_notebook.ipynb b/docs/notebooks/intro_notebook.ipynb
@@ -32,8 +32,8 @@
    "id": "165a7a0918b5a866",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-01-29T20:39:19.731782Z",
-     "start_time": "2024-01-29T20:39:19.718734Z"
+     "end_time": "2024-02-01T15:26:25.116844Z",
+     "start_time": "2024-02-01T15:26:25.107993Z"
     }
    },
    "outputs": [],
@@ -75,28 +75,116 @@
    "id": "951dbb53d50f21c3",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-01-29T20:39:19.779538Z",
-     "start_time": "2024-01-29T20:39:19.730689Z"
+     "end_time": "2024-02-01T15:26:25.170266Z",
+     "start_time": "2024-02-01T15:26:25.117566Z"
     }
    },
    "outputs": [],
    "source": [
-    "packed = pack_df(sources, name=\"light_curve\")\n",
+    "packed = pack_df(sources, name=\"sources\")\n",
     "packed"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "2a7fef0ef94ff597",
+   "metadata": {
+    "collapsed": false
+   },
+   "source": [
+    "### Get packed sources series and play with `.ts` accessors\n",
+    "This series is a collection of structures; each structure consists of multiple fields, and each field is a \"list\" of values. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b9860df979d217a",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-02-01T15:26:25.170937Z",
+     "start_time": "2024-02-01T15:26:25.139968Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "struct_sources = packed[\"sources\"]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "620ad241f94d3e98",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-01-29T20:39:19.779728Z",
-     "start_time": "2024-01-29T20:39:19.757341Z"
+     "end_time": "2024-02-01T15:26:25.171546Z",
+     "start_time": "2024-02-01T15:26:25.143260Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "struct_sources.ts.to_flat()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63e47b51a269305f",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-02-01T15:26:25.171965Z",
+     "start_time": "2024-02-01T15:26:25.148412Z"
     }
    },
    "outputs": [],
-   "source": []
+   "source": [
+    "struct_sources.ts.to_nested()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "371a7693866b9b70",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-02-01T15:26:25.182801Z",
+     "start_time": "2024-02-01T15:26:25.163958Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "struct_sources.ts.iget(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95631744bf97db91",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-02-01T15:35:00.548930Z",
+     "start_time": "2024-02-01T15:35:00.543882Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "struct_sources.ts.get(8001)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac15e872786696ef",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-02-01T15:26:25.183096Z",
+     "start_time": "2024-02-01T15:26:25.166595Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "struct_sources.ts[\"flux\"]"
+   ]
   }
  ],
  "metadata": {

diff --git a/src/pandas_ts/__init__.py b/src/pandas_ts/__init__.py
@@ -0,0 +1 @@
+from .ts_accessor import TsAccessor
diff --git a/src/pandas_ts/ts_accessor.py b/src/pandas_ts/ts_accessor.py
@@ -0,0 +1,117 @@
+from typing import Any, cast
+
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+from pandas.api.extensions import register_series_accessor
+
+__all__ = ["TsAccessor"]
+
+
+def pa_type_is_any_list(pa_type):
+    """Tell whether ``pa_type`` is one of pyarrow's list types.
+
+    Returns True when ``pa_type`` is a list, a large list or a fixed-size
+    list type, and False otherwise.
+    """
+    if pa.types.is_list(pa_type):
+        return True
+    if pa.types.is_large_list(pa_type):
+        return True
+    return pa.types.is_fixed_size_list(pa_type)
+
+
+@register_series_accessor("ts")
+class TsAccessor:
+    """Pandas Series accessor, registered as ``.ts``, for struct-of-lists series.
+
+    The accessor can only be created for a series whose dtype exposes a
+    ``pyarrow_dtype`` that is a struct type in which every field is a list
+    type (list, large list or fixed-size list).
+    """
+
+    def __init__(self, series):
+        # Validate first so an unsupported series never gets an accessor.
+        self._check_series(series)
+
+        self._series = series
+
+    @staticmethod
+    def _check_series(series):
+        """Validate that the dtype of ``series`` is supported by the accessor."""
+        dtype = series.dtype
+        TsAccessor._check_dtype(dtype)
+
+    @staticmethod
+    def _check_dtype(dtype):
+        """Validate that ``dtype`` is a pyarrow struct dtype with list fields.
+
+        Raises
+        ------
+        AttributeError
+            If ``dtype`` has no ``pyarrow_dtype`` attribute, if that pyarrow
+            type is not a struct, or if any of the struct fields is not a
+            list type.
+        """
+        # NOTE(review): AttributeError is presumably chosen so that
+        # ``hasattr(series, "ts")`` is False for unsupported series - confirm.
+        # TODO: check if dtype is TsDtype when it is implemented
+        if not hasattr(dtype, "pyarrow_dtype"):
+            raise AttributeError("Can only use .ts accessor with a Series with dtype pyarrow struct dtype")
+        pyarrow_dtype = dtype.pyarrow_dtype
+        if not pa.types.is_struct(pyarrow_dtype):
+            raise AttributeError("Can only use .ts accessor with a Series with dtype pyarrow struct dtype")
+
+        for field in pyarrow_dtype:
+            if not pa_type_is_any_list(field.type):
+                raise AttributeError(
+                    f"Can only use .ts accessor with a Series with dtype pyarrow struct dtype, all fields must be list types. Given struct has unsupported field {field}"
+                )
+
+    def to_nested(self):
+        """Convert ts into dataframe of nested arrays
+
+        Returns
+        -------
+        pd.DataFrame
+            The result of ``Series.struct.explode()`` on the underlying
+            series.
+        """
+        return self._series.struct.explode()
+
+    def to_flat(self):
+        """Convert ts into dataframe of flat arrays
+
+        Every struct field becomes one column holding the flattened values
+        of that field. The index of the result repeats each index value of
+        the series once per element of the corresponding list.
+
+        Returns
+        -------
+        pd.DataFrame
+            A dataframe with one column per struct field.
+
+        Raises
+        ------
+        ValueError
+            If the struct has no fields.
+        """
+        fields = self._series.struct.dtypes.index
+        if len(fields) == 0:
+            raise ValueError("Cannot flatten a struct with no fields")
+
+        flat_series = {}
+        index = None
+        for field in fields:
+            # ``cast`` only informs static type checkers; it does nothing at runtime.
+            list_array = cast(pa.ListArray, pa.array(self._series.struct.field(field)))
+            if index is None:
+                # The repeated index is built once, from the list lengths of
+                # the first field, and reused for every other field.
+                # NOTE(review): assumes all fields have the same list lengths
+                # per row and that the array exposes ``offsets`` - confirm for
+                # fixed-size list fields.
+                index = np.repeat(self._series.index.values, np.diff(list_array.offsets))
+            flat_series[field] = pd.Series(
+                list_array.flatten(),
+                index=index,
+                name=field,
+                copy=False,
+            )
+        return pd.DataFrame(flat_series)
+
+    @property
+    def fields(self) -> pd.Index:
+        """Names of the nested columns"""
+        return self._series.struct.dtypes.index
+
+    def __getitem__(self, key: str) -> pd.Series:
+        """Get a single struct field of the series by its name."""
+        return self._series.struct.field(key)
+
+    def get(self, index: Any) -> pd.DataFrame:
+        """Get a single ts item by label (index value) as a dataframe
+
+        Parameters
+        ----------
+        index : Any
+            The label of the item to get, must be in the index of
+            the series.
+
+        Returns
+        -------
+        pd.DataFrame
+            A dataframe with the nested arrays of the item.
+
+        See Also
+        --------
+        pandas_ts.TsAccessor.iget : Get a single ts item by position.
+        """
+        item = self._series.loc[index]
+        return pd.DataFrame.from_dict(item)
+
+    def iget(self, index: int) -> pd.DataFrame:
+        """Get a single ts item by position as a dataframe
+
+        Parameters
+        ----------
+        index : int
+            The position of the item to get, must be a valid position
+            in the series, i.e. between 0 and len(series) - 1.
+
+        Returns
+        -------
+        pd.DataFrame
+            A dataframe with the nested arrays of the item.
+
+        See Also
+        --------
+        pandas_ts.TsAccessor.get : Get a single ts item by label (index value).
+        """
+        # No output to stdout here: the item is only converted and returned.
+        item = self._series.iloc[index]
+        return pd.DataFrame.from_dict(item)
diff --git a/tests/pandas_ts/conftest.py b/tests/pandas_ts/conftest.py
@@ -0,0 +1,15 @@
+from numpy.testing import assert_array_equal
+
+
+def assert_nested_array_series_equal(a, b):
+    """Assert that two series of nested arrays are equal.
+
+    The indexes are compared first, then the items of both series are
+    compared pairwise with ``assert_array_equal``.
+    """
+    assert_array_equal(a.index, b.index)
+    message = f"Series '{a.name}' is not equal series '{b.name}'"
+    for left, right in zip(a, b):
+        assert_array_equal(left, right, err_msg=message)
+
+
+def assert_df_equal(a, b):
+    """Assert that two dataframes are equal.
+
+    Compares, in this order, the indexes, the column labels and the dtypes
+    of both dataframes, and then the values of every column of ``a``
+    against the column of ``b`` with the same label.
+    """
+    for attribute in ("index", "columns", "dtypes"):
+        assert_array_equal(getattr(a, attribute), getattr(b, attribute))
+    for column in a.columns:
+        message = f"Column '{column}' is not equal column '{column}'"
+        assert_array_equal(a[column], b[column], err_msg=message)
diff --git a/tests/pandas_ts/test_packer.py b/tests/pandas_ts/test_packer.py
@@ -1,17 +1,12 @@
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+from conftest import assert_nested_array_series_equal
 from numpy.testing import assert_array_equal
 
 from pandas_ts import packer
 
 
-def assert_nested_array_series_equal(a, b):
-    assert_array_equal(a.index, b.index)
-    for inner_a, inner_b in zip(a, b):
-        assert_array_equal(inner_a, inner_b, err_msg=f"Series '{a.name}' is not equal series '{b.name}'")
-
-
 def test_pack_df():
     df = pd.DataFrame(
         data={