Skip to content
This repository has been archived by the owner on Apr 1, 2024. It is now read-only.

Commit

Permalink
Make item to be a pd.DataFrame
Browse files Browse the repository at this point in the history
Closes #11
  • Loading branch information
hombit committed Feb 2, 2024
1 parent 2c01f15 commit 7c612c4
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 109 deletions.
61 changes: 34 additions & 27 deletions docs/notebooks/intro_notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
"id": "165a7a0918b5a866",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:37:48.355994Z",
"start_time": "2024-02-02T14:37:48.339061Z"
"end_time": "2024-02-02T21:06:06.593949Z",
"start_time": "2024-02-02T21:06:06.560451Z"
}
},
"outputs": [],
Expand Down Expand Up @@ -75,8 +75,8 @@
"id": "951dbb53d50f21c3",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:37:55.893432Z",
"start_time": "2024-02-02T14:37:55.879979Z"
"end_time": "2024-02-02T21:06:06.594146Z",
"start_time": "2024-02-02T21:06:06.573280Z"
}
},
"outputs": [],
Expand All @@ -87,73 +87,82 @@
},
{
"cell_type": "markdown",
"id": "2a7fef0ef94ff597",
"id": "b08d49b762877dcb",
"metadata": {
"collapsed": false
},
"source": [
"### Get packed sources series and play with `.ts` accessors\n",
"This series is a collection of structures, each structure consists of multiple fields, and each field is a \"list\" of values. "
"### Single item of the packed series is returned as a new DataFrame"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "620ad241f94d3e98",
"id": "edd0b2714196c9d0",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:24.765169Z",
"start_time": "2024-02-02T14:38:24.756505Z"
"end_time": "2024-02-02T21:06:06.615393Z",
"start_time": "2024-02-02T21:06:06.595599Z"
}
},
"outputs": [],
"source": [
"packed.ts.to_flat()"
"packed.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63e47b51a269305f",
"id": "3144e1a6c5964ed9",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:25.567293Z",
"start_time": "2024-02-02T14:38:25.555048Z"
"end_time": "2024-02-02T21:39:59.851440Z",
"start_time": "2024-02-02T21:39:59.845160Z"
}
},
"outputs": [],
"source": [
"packed.ts.to_nested()"
"# Get the linearly interpolated flux for time=10\n",
"packed.apply(lambda df: np.interp(10.0, df[\"time\"], df[\"flux\"]))"
]
},
{
"cell_type": "markdown",
"id": "2a7fef0ef94ff597",
"metadata": {
"collapsed": false
},
"source": [
"### Get packed sources series and play with `.ts` accessors\n",
"This series is a collection of structures, each structure consists of multiple fields, and each field is a \"list\" of values. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "371a7693866b9b70",
"id": "620ad241f94d3e98",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:26.339595Z",
"start_time": "2024-02-02T14:38:26.334556Z"
"start_time": "2024-02-02T21:06:06.602564Z"
}
},
"outputs": [],
"source": [
"packed.ts.iget(1)"
"packed.ts.to_flat()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95631744bf97db91",
"id": "63e47b51a269305f",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:26.701825Z",
"start_time": "2024-02-02T14:38:26.699175Z"
"start_time": "2024-02-02T21:06:06.604284Z"
}
},
"outputs": [],
"source": [
"packed.ts.get(8001)"
"packed.ts.to_nested()"
]
},
{
Expand All @@ -162,8 +171,7 @@
"id": "ac15e872786696ef",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:27.235676Z",
"start_time": "2024-02-02T14:38:27.218584Z"
"start_time": "2024-02-02T21:06:06.605164Z"
}
},
"outputs": [],
Expand All @@ -177,8 +185,7 @@
"id": "dc7dbd52f1a8407a",
"metadata": {
"ExecuteTime": {
"end_time": "2024-02-02T14:38:40.512026Z",
"start_time": "2024-02-02T14:38:40.505424Z"
"start_time": "2024-02-02T21:06:06.605800Z"
}
},
"outputs": [],
Expand Down
43 changes: 0 additions & 43 deletions src/pandas_ts/ts_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,46 +68,3 @@ def fields(self) -> pd.Index:

def __getitem__(self, key: str) -> pd.Series:
    """Return the flat series of the struct field named ``key``."""
    field_series = self._series.struct.field(key)
    return field_series

def get(self, index: Any) -> pd.DataFrame:
    """Get a single ts item by label (index value) as a dataframe

    Parameters
    ----------
    index : Any
        The label of the item to get; it must be present in the
        index of the series.

    Returns
    -------
    pd.DataFrame
        A dataframe with the nested arrays of the item.

    See Also
    --------
    pandas_ts.TsAccessor.iget : Get a single ts item by position.
    """
    # Label-based lookup yields a dict of field name -> array of values,
    # which maps directly onto DataFrame columns.
    row = self._series.loc[index]
    return pd.DataFrame.from_dict(row)

def iget(self, index: int) -> pd.DataFrame:
    """Get a single ts item by position as a dataframe

    Parameters
    ----------
    index : int
        The position of the item to get, must be a valid position
        in the series, i.e. between 0 and len(series) - 1; negative
        positions count from the end as usual for ``iloc``.

    Returns
    -------
    pd.DataFrame
        A dataframe with the nested arrays of the item.

    See Also
    --------
    pandas_ts.TsAccessor.get : Get a single ts item by label (index value).
    """
    item = self._series.iloc[index]
    # NOTE: a leftover debug `print(item)` was removed here — it wrote
    # the raw struct dict to stdout on every call.
    return pd.DataFrame.from_dict(item)
3 changes: 3 additions & 0 deletions src/pandas_ts/ts_dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from typing import Mapping, cast

import pandas as pd
import pyarrow as pa
from pandas import ArrowDtype
from pandas.api.extensions import register_extension_dtype
Expand Down Expand Up @@ -99,3 +100,5 @@ def construct_array_type(cls) -> type[ArrowExtensionArray]:
def name(self) -> str:
    """Human-readable dtype name, e.g. ``ts<a: [double], b: [double]>``."""
    field_descriptions = (
        f"{field.name}: [{field.type.value_type!s}]" for field in self.pyarrow_dtype
    )
    return f"ts<{', '.join(field_descriptions)}>"

type = pd.DataFrame
35 changes: 34 additions & 1 deletion src/pandas_ts/ts_ext_array.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from __future__ import annotations

from typing import cast
from typing import Any, Iterator, cast

import numpy as np
import pandas as pd
import pyarrow as pa
from numpy.typing import DTypeLike

# Needed by ArrowExtensionArray.to_numpy(na_value=no_default)
from pandas._libs.lib import no_default

# It is considered to be an experimental, so we need to be careful with it.
from pandas.core.arrays import ArrowExtensionArray
from pyarrow import ExtensionArray

from pandas_ts.ts_dtype import TsDtype
from pandas_ts.utils import is_pa_type_a_list
Expand Down Expand Up @@ -64,6 +71,32 @@ def _validate(array: pa.ChunkedArray) -> None:
if not first_list_array.offsets.equals(list_array.offsets):
raise ValueError("Offsets of all ListArrays must be the same")

def __getitem__(self, item):
    """Index into the array; single struct items come back as DataFrames."""
    selected = super().__getitem__(item)
    # pyarrow materialises a struct "scalar" as a plain dict of
    # field name -> list of values; wrap it so callers get a DataFrame.
    if isinstance(selected, dict):
        return pd.DataFrame(selected, copy=True)
    return selected

def __iter__(self) -> Iterator[Any]:
    """Iterate over items, wrapping struct "scalar" dicts into DataFrames."""
    for element in super().__iter__():
        # Same dict -> DataFrame conversion as __getitem__, applied lazily.
        if isinstance(element, dict):
            yield pd.DataFrame(element, copy=True)
        else:
            yield element

def to_numpy(
    self, dtype: DTypeLike | None = None, copy: bool = False, na_value: Any = no_default
) -> np.ndarray:
    """Convert to a 1-d object ndarray whose elements are DataFrames."""
    raw = super().to_numpy(dtype=dtype, copy=copy, na_value=na_value)

    # np.empty + slice assignment is the only way to force numpy to build
    # a 1-d object array of DataFrames (np.array would try to iterate them).
    wrapped = np.empty(shape=raw.shape, dtype=object)
    # copy=False here: the caller's 'copy' request was already honoured by
    # ArrowExtensionArray.to_numpy above.
    wrapped[:] = [pd.DataFrame(item, copy=False) for item in raw]
    return wrapped

@property
def list_offsets(self) -> pa.ChunkedArray:
"""The list offsets of the field arrays.
Expand Down
37 changes: 0 additions & 37 deletions tests/pandas_ts/test_ts_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,40 +181,3 @@ def test_ts_accessor___getitem__():
name="b",
),
)


def test_ts_accessor_get():
    """`.ts.get` with an index label returns that row as a DataFrame."""
    import pandas_ts as _  # registers the .ts accessor

    a_values = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_values = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    struct_array = pa.StructArray.from_arrays(arrays=[a_values, b_values], names=["a", "b"])
    series = pd.Series(struct_array, dtype=pd.ArrowDtype(struct_array.type), index=[100, 101])

    expected = pd.DataFrame({"a": np.array([1.0, 2.0, 1.0]), "b": -np.array([3.0, 4.0, 5.0])})
    assert_frame_equal(series.ts.get(101), expected)


def test_ts_accessor_iget():
    """`.ts.iget` accepts a (possibly negative) position and returns a DataFrame."""
    import pandas_ts as _  # registers the .ts accessor

    a_values = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_values = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    struct_array = pa.StructArray.from_arrays(arrays=[a_values, b_values], names=["a", "b"])

    series = pd.Series(struct_array, dtype=pd.ArrowDtype(struct_array.type), index=[100, 101])

    expected = pd.DataFrame({"a": np.array([1.0, 2.0, 3.0]), "b": -np.array([4.0, 5.0, 6.0])})
    assert_frame_equal(series.ts.iget(-2), expected)
34 changes: 33 additions & 1 deletion tests/pandas_ts/test_ts_ext_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pyarrow as pa
import pytest
from numpy.testing import assert_array_equal
from pandas.testing import assert_series_equal
from pandas.testing import assert_frame_equal, assert_series_equal

from pandas_ts import TsDtype
from pandas_ts.ts_ext_array import TsExtensionArray
Expand Down Expand Up @@ -106,3 +106,35 @@ def test_list_offsets():

desired = pa.chunked_array([pa.array([0, 3, 6])])
assert_array_equal(ext_array.list_offsets, desired)


def test___getitem__():
    """Label access on a TsDtype series yields that element as a DataFrame."""
    a_values = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_values = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    struct_array = pa.StructArray.from_arrays(arrays=[a_values, b_values], names=["a", "b"])
    series = pd.Series(struct_array, dtype=TsDtype(struct_array.type), index=[100, 101])

    expected = pd.DataFrame({"a": np.array([1.0, 2.0, 1.0]), "b": -np.array([3.0, 4.0, 5.0])})
    assert_frame_equal(series[101], expected)


def test_series_apply_udf_argument():
    """`Series.apply` hands each element to the UDF as a DataFrame."""
    a_values = pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])])
    b_values = pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])])
    struct_array = pa.StructArray.from_arrays(arrays=[a_values, b_values], names=["a", "b"])
    series = pd.Series(struct_array, dtype=TsDtype(struct_array.type), index=[100, 101])

    # Identity UDF: the result series should hold DataFrames.
    series_of_dfs = series.apply(lambda frame: frame)
    expected_first = pd.DataFrame({"a": np.array([1.0, 2.0, 3.0]), "b": -np.array([4.0, 5.0, 6.0])})
    assert_frame_equal(series_of_dfs.iloc[0], expected_first)

0 comments on commit 7c612c4

Please sign in to comment.