diff --git a/docs/notebooks/intro_notebook.ipynb b/docs/notebooks/intro_notebook.ipynb index 841013d..6f0730d 100644 --- a/docs/notebooks/intro_notebook.ipynb +++ b/docs/notebooks/intro_notebook.ipynb @@ -32,15 +32,15 @@ "id": "165a7a0918b5a866", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.184284Z", - "start_time": "2024-02-06T17:39:12.170219Z" + "end_time": "2024-02-10T02:23:42.191161Z", + "start_time": "2024-02-10T02:23:42.178595Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", - "from pandas_ts.packer import pack_flat\n", + "from pandas_ts.packer import pack_flat, pack_dfs\n", "\n", "\n", "# Adopted from\n", @@ -74,8 +74,8 @@ "id": "951dbb53d50f21c3", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.195676Z", - "start_time": "2024-02-06T17:39:12.182905Z" + "end_time": "2024-02-10T02:23:42.217564Z", + "start_time": "2024-02-10T02:23:42.193011Z" } }, "outputs": [], @@ -100,8 +100,8 @@ "id": "edd0b2714196c9d0", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.205248Z", - "start_time": "2024-02-06T17:39:12.196528Z" + "end_time": "2024-02-10T02:23:42.229553Z", + "start_time": "2024-02-10T02:23:42.204915Z" } }, "outputs": [], @@ -115,8 +115,8 @@ "id": "3144e1a6c5964ed9", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.209602Z", - "start_time": "2024-02-06T17:39:12.200870Z" + "end_time": "2024-02-10T02:23:42.229932Z", + "start_time": "2024-02-10T02:23:42.211090Z" } }, "outputs": [], @@ -142,8 +142,8 @@ "id": "620ad241f94d3e98", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.222079Z", - "start_time": "2024-02-06T17:39:12.204054Z" + "end_time": "2024-02-10T02:23:42.231416Z", + "start_time": "2024-02-10T02:23:42.216005Z" } }, "outputs": [], @@ -157,8 +157,8 @@ "id": "63e47b51a269305f", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.252890Z", - "start_time": "2024-02-06T17:39:12.210171Z" + "end_time": 
"2024-02-10T02:23:42.284198Z", + "start_time": "2024-02-10T02:23:42.221854Z" } }, "outputs": [], @@ -172,8 +172,8 @@ "id": "ac15e872786696ef", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.253906Z", - "start_time": "2024-02-06T17:39:12.221785Z" + "end_time": "2024-02-10T02:23:42.284672Z", + "start_time": "2024-02-10T02:23:42.236972Z" } }, "outputs": [], @@ -187,8 +187,8 @@ "id": "dc7dbd52f1a8407a", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.254279Z", - "start_time": "2024-02-06T17:39:12.226204Z" + "end_time": "2024-02-10T02:23:42.284955Z", + "start_time": "2024-02-10T02:23:42.240108Z" } }, "outputs": [], @@ -212,8 +212,8 @@ "id": "996f07b4d16e17e5", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.265890Z", - "start_time": "2024-02-06T17:39:12.228393Z" + "end_time": "2024-02-10T02:23:42.285205Z", + "start_time": "2024-02-10T02:23:42.242144Z" } }, "outputs": [], @@ -229,19 +229,18 @@ "id": "21d5c009ef0990a4", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.271944Z", - "start_time": "2024-02-06T17:39:12.233991Z" + "end_time": "2024-02-10T02:23:42.324854Z", + "start_time": "2024-02-10T02:23:42.287797Z" } }, "outputs": [], "source": [ - "# Change errors for object 8003 by replacing the entire nested list series\n", - "# We need to convert it to Python lists, so we can change them in-place\n", - "err = pd.Series(packed.ts[\"err\"], dtype=object)\n", - "err[8003] = [e + 25 for e in err[8003]]\n", - "assert len(err) == len(packed.ts._series)\n", - "packed.ts[\"err\"] = err\n", - "packed.ts[\"err\"]" + "# Change errors for object 8003\n", + "light_curve = packed.loc[8003]\n", + "light_curve[\"err\"] += 25\n", + "# packed.lpc[8003] = ... 
does not work\n", + "packed.iloc[3:4] = [light_curve]\n", + "packed.iloc[0]" ] }, { @@ -250,8 +249,8 @@ "id": "3a713c94897456e1", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.284939Z", - "start_time": "2024-02-06T17:39:12.239517Z" + "end_time": "2024-02-10T02:23:42.358157Z", + "start_time": "2024-02-10T02:23:42.316636Z" } }, "outputs": [], @@ -264,14 +263,42 @@ "packed" ] }, + { + "cell_type": "markdown", + "id": "7fc7ac2d28acf1de", + "metadata": { + "collapsed": false + }, + "source": [ + "### Change all items and pack to a new Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab27747eba156888", + "metadata": { + "ExecuteTime": { + "end_time": "2024-02-10T02:23:42.385333Z", + "start_time": "2024-02-10T02:23:42.360474Z" + } + }, + "outputs": [], + "source": [ + "# Subsample light curves\n", + "dfs = packed.apply(lambda df: df.iloc[::50])\n", + "subsampled = pack_dfs(dfs, name=\"subsampled\")\n", + "packed.loc[8000], subsampled.loc[8000]" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "92e6d69978ae9479", + "id": "26c558e3551b5092", "metadata": { "ExecuteTime": { - "end_time": "2024-02-06T17:39:12.309173Z", - "start_time": "2024-02-06T17:39:12.246921Z" + "end_time": "2024-02-10T02:23:42.387707Z", + "start_time": "2024-02-10T02:23:42.370602Z" } }, "outputs": [], diff --git a/src/pandas_ts/packer.py b/src/pandas_ts/packer.py index 2a7a58a..34573b3 100644 --- a/src/pandas_ts/packer.py +++ b/src/pandas_ts/packer.py @@ -7,13 +7,19 @@ # "|" for python 3.9 from __future__ import annotations +from collections.abc import Sequence + import numpy as np import pandas as pd import pyarrow as pa +from pandas_ts.ts_dtype import TsDtype from pandas_ts.ts_ext_array import TsExtensionArray -__all__ = ["pack_flat", "pack_lists"] +__all__ = ["pack_flat", "pack_lists", "pack_dfs"] + + +N_ROWS_INFER_DTYPE = 1000 def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame: @@ -80,6 +86,38 @@ def pack_flat(df: 
def pack_dfs(dfs: Sequence[pd.DataFrame], index: object = None, name: str | None = None) -> pd.Series:
    """Build a "nested" series out of a collection of "flat" dataframes.

    Every input dataframe becomes one element of the output series. The
    field types of the nested dtype are inferred from the first dataframe
    only, looking at no more than ``N_ROWS_INFER_DTYPE`` leading rows of
    each of its columns.

    Parameters
    ----------
    dfs : Sequence[pd.DataFrame]
        Dataframes to pack, one per output element. May also be a
        ``pd.Series`` of dataframes.
    index : pd.Index, optional
        Index to assign to the output series. When omitted and `dfs` is a
        ``pd.Series``, the index of `dfs` is reused.
    name : str, optional
        Name to assign to the output series.

    Returns
    -------
    pd.Series
        Series with a ``TsDtype`` dtype holding the packed dataframes.
    """
    # A series of dataframes carries a natural index for the result.
    if index is None and isinstance(dfs, pd.Series):
        index = dfs.index

    # Positional access: a pandas container may not have the label 0.
    if hasattr(dfs, "iloc"):
        template = dfs.iloc[0]
    else:
        template = dfs[0]

    inferred_types = {}
    placeholder = {}
    for column in template.columns:
        head = template[column].iloc[:N_ROWS_INFER_DTYPE]
        inferred_types[column] = pa.array(head).type
        placeholder[column] = []

    # Allocate a series of the right dtype and length filled with empty
    # elements, then overwrite all of them with the actual dataframes.
    packed = pd.Series(
        [placeholder] * len(dfs),
        dtype=TsDtype.from_fields(inferred_types),
        index=index,
        name=name,
    )
    packed[:] = dfs
    return packed
# --- src/pandas_ts/ts_ext_array.py: methods of TsExtensionArray ---

@staticmethod
def _convert_df_to_pa_scalar(df: pd.DataFrame, *, type: pa.DataType | None) -> pa.Scalar:
    """Convert a single dataframe to a pyarrow scalar.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to convert; each column becomes one field of the scalar.
    type : pa.DataType or None
        Type passed through to ``pa.scalar``.

    Returns
    -------
    pa.Scalar
        Scalar built from a mapping of column name to column values.
    """
    columns = {column: series.values for column, series in df.items()}
    return pa.scalar(columns, type=type)


@staticmethod
def _convert_df_value_to_pa(value: object, *, type: pa.DataType | None) -> object:
    """Convert a dataframe, or a collection of dataframes, to pyarrow scalars.

    Values that are neither a dataframe nor an indexable iterable whose
    first element is a dataframe are returned unchanged.

    Parameters
    ----------
    value : object
        A ``pd.DataFrame``, an indexable iterable of them (e.g. a list or a
        ``pd.Series``), or any other value.
    type : pa.DataType or None
        Type passed through to the dataframe-to-scalar conversion.

    Returns
    -------
    object
        A pyarrow scalar for a single dataframe, a list of pyarrow scalars
        for a collection of dataframes, otherwise `value` itself.
    """
    # A "scalar" dataframe maps to a single pyarrow scalar.
    if isinstance(value, pd.DataFrame):
        return TsExtensionArray._convert_df_to_pa_scalar(value, type=type)

    # Only indexable iterables can be collections of dataframes.
    if not (hasattr(value, "__getitem__") and isinstance(value, Iterable)):
        return value

    # Peek at the first element to decide whether this is a collection of
    # dataframes. The probe must never fail on its own: empty containers
    # (including an empty pd.Series) raise IndexError, mappings such as a
    # dict raise KeyError for the key 0, and some containers reject integer
    # keys with TypeError. In all these cases the value is not a dataframe
    # collection and is passed through untouched.
    try:
        if hasattr(value, "iloc"):
            first = value.iloc[0]
        else:
            first = value[0]  # type: ignore[index]
    except (IndexError, KeyError, TypeError):
        return value

    if isinstance(first, pd.DataFrame):
        return [TsExtensionArray._convert_df_to_pa_scalar(v, type=type) for v in value]
    return value


@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False) -> Self:  # type: ignore[name-defined] # noqa: F821
    """Construct an array from `scalars`, which may contain dataframes.

    Dataframes are converted to pyarrow scalars first; the result is then
    handed to the parent class implementation together with `dtype` and
    `copy`.
    """
    scalars = cls._convert_df_value_to_pa(scalars, type=None)
    # The conversion returns either a list or the input unchanged. An
    # unchanged input may be a sized, iterable container that is not a
    # Sequence, while the parent's _from_sequence needs a Sequence.
    if not isinstance(scalars, Sequence) and isinstance(scalars, Collection):
        scalars = list(scalars)
    return super()._from_sequence(scalars, dtype=dtype, copy=copy)
# --- src/pandas_ts/ts_ext_array.py: method of TsExtensionArray ---

def __setitem__(self, key, value) -> None:
    """Assign `value` at `key`, accepting dataframes as values.

    `value` may be a single ``pd.DataFrame`` or an indexable collection of
    them; such values are converted to pyarrow scalars of this array's
    pyarrow type before the assignment is delegated to the parent class.
    Any other value is forwarded unchanged.
    """
    value = self._convert_df_value_to_pa(value, type=self._dtype.pyarrow_dtype)
    super().__setitem__(key, value)


# --- tests/pandas_ts/test_packer.py ---

def test_dfs():
    """pack_dfs() packs a list of dataframes into a nested series."""
    dfs = [
        pd.DataFrame(
            data={
                "a": [1, 2],
                "b": [0, 1],
            },
            index=[100, 100],
        ),
        pd.DataFrame(
            data={
                "a": [3, 4],
                "b": [0, 1],
            },
            index=[101, 101],
        ),
        pd.DataFrame(
            data={
                "a": [5, 6],
                "b": [0, 1],
            },
            index=[102, 102],
        ),
        pd.DataFrame(
            data={
                "a": [7, 8, 9],
                "b": [0, 1, 0],
            },
            index=[103, 103, 103],
        ),
    ]
    series = packer.pack_dfs(dfs, index=[100, 101, 102, 103])

    desired = pd.Series(
        data=[
            (np.array([1, 2]), np.array([0, 1])),
            (np.array([3, 4]), np.array([0, 1])),
            (np.array([5, 6]), np.array([0, 1])),
            (np.array([7, 8, 9]), np.array([0, 1, 0])),
        ],
        index=[100, 101, 102, 103],
        dtype=TsDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
    )
    assert_series_equal(series, desired)


# --- tests/pandas_ts/test_ts_ext_array.py ---

def _ext_array_ab(a_rows, b_rows):
    """Build a TsExtensionArray with list fields "a" and "b" from per-row values."""
    struct_array = pa.StructArray.from_arrays(
        arrays=[
            pa.array([np.array(row) for row in a_rows]),
            pa.array([np.array(row) for row in b_rows]),
        ],
        names=["a", "b"],
    )
    return TsExtensionArray(struct_array)


def _initial_ext_array():
    """Return the two-row array every __setitem__ test starts from."""
    return _ext_array_ab(
        [[1, 2, 3], [1, 2, 1]],
        [[-4.0, -5.0, -6.0], [-3.0, -4.0, -5.0]],
    )


def test__convert_df_to_pa_scalar():
    """_convert_df_to_pa_scalar() turns a dataframe into a struct-of-lists scalar."""
    df = pd.DataFrame({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]})
    pa_scalar = TsExtensionArray._convert_df_to_pa_scalar(df, type=None)

    assert pa_scalar == pa.scalar(
        {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]},
        type=pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]),
    )


def test__convert_df_to_pa_from_scalar():
    """_convert_df_value_to_pa() converts a single ("scalar") dataframe."""
    df = pd.DataFrame({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]})
    # Exercise the generic entry point rather than the scalar-only helper,
    # which is already covered by test__convert_df_to_pa_scalar.
    pa_scalar = TsExtensionArray._convert_df_value_to_pa(df, type=None)

    assert pa_scalar == pa.scalar(
        {"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]},
        type=pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))]),
    )


def test__convert_df_to_pa_from_series():
    """_convert_df_value_to_pa() converts a pd.Series of dataframes element-wise."""
    series = pd.Series(
        [
            pd.DataFrame({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}),
            pd.DataFrame({"a": [1, 2, 1], "b": [-3.0, -4.0, -5.0]}),
        ]
    )
    list_of_dicts = list(TsExtensionArray._convert_df_value_to_pa(series, type=None))

    desired_type = pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))])

    assert list_of_dicts == [
        pa.scalar({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}, type=desired_type),
        pa.scalar({"a": [1, 2, 1], "b": [-3.0, -4.0, -5.0]}, type=desired_type),
    ]


def test__convert_df_to_pa_from_list():
    """_convert_df_value_to_pa() converts a list of dataframes element-wise."""
    list_of_dfs = [
        pd.DataFrame({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}),
        pd.DataFrame({"a": [1, 2, 1], "b": [-3.0, -4.0, -5.0]}),
    ]
    list_of_dicts = list(TsExtensionArray._convert_df_value_to_pa(list_of_dfs, type=None))

    desired_type = pa.struct([pa.field("a", pa.list_(pa.int64())), pa.field("b", pa.list_(pa.float64()))])

    assert list_of_dicts == [
        pa.scalar({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}, type=desired_type),
        pa.scalar({"a": [1, 2, 1], "b": [-3.0, -4.0, -5.0]}, type=desired_type),
    ]


def test__from_sequence():
    """_from_sequence() builds an extension array from a list of dataframes."""
    list_of_dfs = [
        pd.DataFrame({"a": [1, 2, 3], "b": [-4.0, -5.0, -6.0]}),
        pd.DataFrame({"a": [1, 2, 1], "b": [-3.0, -4.0, -5.0]}),
    ]
    ext_array = TsExtensionArray._from_sequence(list_of_dfs, dtype=None)

    desired = TsExtensionArray(
        pa.StructArray.from_arrays(
            [pa.array([[1, 2, 3], [1, 2, 1]]), pa.array([[-4.0, -5.0, -6.0], [-3.0, -4.0, -5.0]])],
            names=["a", "b"],
        )
    )
    assert ext_array.equals(desired)


def test___setitem___single_df():
    """Assigning a dataframe to one position replaces that element only."""
    ext_array = _initial_ext_array()

    ext_array[0] = pd.DataFrame({"a": [5, 6, 7], "b": [100.0, 200.0, 300.0]})

    desired = _ext_array_ab(
        [[5, 6, 7], [1, 2, 1]],
        [[100.0, 200.0, 300.0], [-3.0, -4.0, -5.0]],
    )
    assert ext_array.equals(desired)


def test___setitem___single_df_different_size():
    """The assigned dataframe may have a different length than the old element."""
    ext_array = _initial_ext_array()

    ext_array[0] = pd.DataFrame({"a": [5, 6], "b": [100.0, 200.0]})

    desired = _ext_array_ab(
        [[5, 6], [1, 2, 1]],
        [[100.0, 200.0], [-3.0, -4.0, -5.0]],
    )
    assert ext_array.equals(desired)


def test___setitem___single_df_to_all_rows():
    """Assigning one dataframe to a full slice repeats it for every element."""
    ext_array = _initial_ext_array()

    ext_array[:] = pd.DataFrame({"a": [5, 6], "b": [100.0, 200.0]})

    desired = _ext_array_ab(
        [[5, 6], [5, 6]],
        [[100.0, 200.0], [100.0, 200.0]],
    )
    assert ext_array.equals(desired)


def test___setitem___list_of_dfs():
    """Assigning a list of dataframes to a full slice replaces each element."""
    ext_array = _initial_ext_array()

    ext_array[:] = [
        pd.DataFrame({"a": [5, 6], "b": [100.0, 200.0]}),
        pd.DataFrame({"a": [7, 8], "b": [300.0, 400.0]}),
    ]

    desired = _ext_array_ab(
        [[5, 6], [7, 8]],
        [[100.0, 200.0], [300.0, 400.0]],
    )
    assert ext_array.equals(desired)


def test___setitem___series_of_dfs():
    """Assigning a pd.Series of dataframes to a full slice replaces each element."""
    ext_array = _initial_ext_array()

    ext_array[:] = pd.Series(
        [
            pd.DataFrame({"a": [5, 6], "b": [100.0, 200.0]}),
            pd.DataFrame({"a": [7, 8], "b": [300.0, 400.0]}),
        ]
    )

    desired = _ext_array_ab(
        [[5, 6], [7, 8]],
        [[100.0, 200.0], [300.0, 400.0]],
    )
    assert ext_array.equals(desired)