From b2832eadfd975a17034501d56d43015e4f5989d2 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Mon, 8 Apr 2024 12:59:39 -0700 Subject: [PATCH 1/4] Initial read_parquet MVP implementation --- src/nested_pandas/nestedframe/__init__.py | 1 + src/nested_pandas/nestedframe/io.py | 69 +++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 src/nested_pandas/nestedframe/io.py diff --git a/src/nested_pandas/nestedframe/__init__.py b/src/nested_pandas/nestedframe/__init__.py index 54af689..a656cf3 100644 --- a/src/nested_pandas/nestedframe/__init__.py +++ b/src/nested_pandas/nestedframe/__init__.py @@ -1 +1,2 @@ from .core import NestedFrame # noqa +from .io import read_parquet # noqa diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py new file mode 100644 index 0000000..b27b713 --- /dev/null +++ b/src/nested_pandas/nestedframe/io.py @@ -0,0 +1,69 @@ +# typing.Self and "|" union syntax don't exist in Python 3.9 +from __future__ import annotations + +import pandas as pd + +from .core import NestedFrame + + +def read_parquet( + data: str, + to_pack: dict, + engine: str = "auto", + columns: list[str] | None = None, + pack_columns: dict | None = None, +) -> NestedFrame: + """ + Load a parquet object from a file path and load a set of other + parquet objects to pack into the resulting NestedFrame. + + Docstring based on the Pandas equivalent. + + #TODO after MVP: Include full kwarg-set + + Parameters + ---------- + data : str, path object or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``read()`` function. + The string could be a URL. Valid URL schemes include http, ftp, s3, + gs, and file. For file URLs, a host is expected. A local file could be: + ``file://localhost/path/to/table.parquet``. + A file URL can also be a path to a directory that contains multiple + partitioned parquet files. Both pyarrow and fastparquet support + paths to directories as well as file URLs. A directory path could be: + ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``. + to_pack: dict, + A dictionary of parquet data paths (same criteria as `data`), where + each key reflects the desired column name to pack the data into and + each value reflects the parquet data to pack. + engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' + Parquet library to use. If 'auto', then the option + ``io.parquet.engine`` is used. The default ``io.parquet.engine`` + behavior is to try 'pyarrow', falling back to 'fastparquet' if + 'pyarrow' is unavailable. + + When using the ``'pyarrow'`` engine and no storage options are provided + and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec`` + (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first. + Use the filesystem keyword with an instantiated fsspec filesystem + if you wish to use its implementation. + columns : list, default=None + If not None, only these columns will be read from the file. + pack_columns: dict, default=None + If not None, selects a set of columns from each keyed nested parquet + object to read from the nested files. 
+ + Returns + ------- + NestedFrame + """ + + df = NestedFrame(pd.read_parquet(data, engine, columns)) + + for pack_key in to_pack: + col_subset = pack_columns[pack_key] if pack_columns is not None else None + packed = pd.read_parquet(to_pack[pack_key], engine=engine, columns=col_subset) + df = df.add_nested(packed, pack_key) + + return df From 189c59676e0b1209ec575e02d96ea0b461946196 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 9 Apr 2024 11:48:33 -0700 Subject: [PATCH 2/4] add read_parquet test, tweak read_parquet --- src/nested_pandas/__init__.py | 3 +- src/nested_pandas/nestedframe/io.py | 2 +- tests/nested_pandas/nestedframe/test_io.py | 59 ++++++++++++++++++++++ 3 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 tests/nested_pandas/nestedframe/test_io.py diff --git a/src/nested_pandas/__init__.py b/src/nested_pandas/__init__.py index 613f651..e63bd1c 100644 --- a/src/nested_pandas/__init__.py +++ b/src/nested_pandas/__init__.py @@ -1,8 +1,9 @@ from .example_module import greetings, meaning from .nestedframe import NestedFrame +from .nestedframe.io import read_parquet # Import for registering from .series.accessor import NestSeriesAccessor # noqa: F401 from .series.dtype import NestedDtype -__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame"] +__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame", "read_parquet"] diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index b27b713..fab2da1 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -62,7 +62,7 @@ def read_parquet( df = NestedFrame(pd.read_parquet(data, engine, columns)) for pack_key in to_pack: - col_subset = pack_columns[pack_key] if pack_columns is not None else None + col_subset = pack_columns.get(pack_key, None) if pack_columns is not None else None packed = pd.read_parquet(to_pack[pack_key], engine=engine, columns=col_subset) df = df.add_nested(packed, pack_key) diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py new file mode 100644 index 0000000..8037755 --- /dev/null +++ b/tests/nested_pandas/nestedframe/test_io.py @@ -0,0 +1,59 @@ +import os + +import pandas as pd +import pytest +from nested_pandas import read_parquet + + +@pytest.mark.parametrize("columns", [["a"], None]) +@pytest.mark.parametrize("pack_columns", [{"nested1": ["c"], "nested2": ["e"]}, {"nested1": ["d"]}, None]) +def test_read_parquet(tmp_path, columns, pack_columns): + """Test nested parquet loading""" + # Setup a temporary directory for files + save_path = os.path.join(tmp_path, ".") + + # Generate some test data + base = pd.DataFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) + + nested1 = pd.DataFrame( + data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, + index=[0, 0, 0, 1, 1, 1, 2, 2, 2], + ) + + nested2 = pd.DataFrame( + data={"e": [0, 2, 4, 1, 4, 3, 1, 4, 1], "f": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, + index=[0, 0, 0, 1, 1, 1, 2, 2, 2], + ) + + # Save to parquet + base.to_parquet(os.path.join(save_path, "base.parquet")) + nested1.to_parquet(os.path.join(save_path, "nested1.parquet")) + nested2.to_parquet(os.path.join(save_path, "nested2.parquet")) + + # Read from parquet + nf = read_parquet( + data=os.path.join(save_path, "base.parquet"), + to_pack={ + "nested1": os.path.join(save_path, "nested1.parquet"), + "nested2": os.path.join(save_path, "nested2.parquet"), + }, + columns=columns, + pack_columns=pack_columns, + ) + + # Check Base Columns 
+ if columns is not None: + assert nf.columns.tolist() == columns + ["nested1", "nested2"] + else: + assert nf.columns.tolist() == base.columns.tolist() + ["nested1", "nested2"] + + # Check Nested Columns + if pack_columns is not None: + for nested_col in pack_columns: + assert nf[nested_col].nest.fields == pack_columns[nested_col] + else: + for nested_col in nf.nested_columns: + if nested_col == "nested1": + assert nf[nested_col].nest.fields == nested1.columns.tolist() + elif nested_col == "nested2": + assert nf[nested_col].nest.fields == nested2.columns.tolist() From 7ec4bb52b8c01e7840fa52b880d663cc56843b6a Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 9 Apr 2024 12:30:11 -0700 Subject: [PATCH 3/4] engine and dtypes --- src/nested_pandas/nestedframe/io.py | 35 ++++++++++++++++------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index fab2da1..40e1405 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -2,16 +2,22 @@ from __future__ import annotations import pandas as pd +from pandas._libs import lib +from pandas._typing import ( + DtypeBackend, + FilePath, + ReadBuffer, +) from .core import NestedFrame def read_parquet( - data: str, + data: FilePath | ReadBuffer[bytes], to_pack: dict, - engine: str = "auto", columns: list[str] | None = None, pack_columns: dict | None = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> NestedFrame: """ Load a parquet object from a file path and load a set of other @@ -37,33 +43,32 @@ def read_parquet( A dictionary of parquet data paths (same criteria as `data`), where each key reflects the desired column name to pack the data into and each value reflects the parquet data to pack. - engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' - Parquet library to use. If 'auto', then the option - ``io.parquet.engine`` is used. The default ``io.parquet.engine`` - behavior is to try 'pyarrow', falling back to 'fastparquet' if - 'pyarrow' is unavailable. - - When using the ``'pyarrow'`` engine and no storage options are provided - and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec`` - (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first. - Use the filesystem keyword with an instantiated fsspec filesystem - if you wish to use its implementation. columns : list, default=None If not None, only these columns will be read from the file. pack_columns: dict, default=None If not None, selects a set of columns from each keyed nested parquet object to read from the nested files. + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. 
Returns ------- NestedFrame """ - df = NestedFrame(pd.read_parquet(data, engine, columns)) + df = NestedFrame(pd.read_parquet(data, engine="pyarrow", columns=columns, dtype_backend=dtype_backend)) for pack_key in to_pack: col_subset = pack_columns.get(pack_key, None) if pack_columns is not None else None - packed = pd.read_parquet(to_pack[pack_key], engine=engine, columns=col_subset) + packed = pd.read_parquet( + to_pack[pack_key], engine="pyarrow", columns=col_subset, dtype_backend=dtype_backend + ) df = df.add_nested(packed, pack_key) return df From 041e513d40d35a3f60f4ad47dfec77e6ad5608c1 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 9 Apr 2024 12:32:39 -0700 Subject: [PATCH 4/4] add backend todo --- src/nested_pandas/nestedframe/io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index 40e1405..e0f773f 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -26,6 +26,7 @@ def read_parquet( Docstring based on the Pandas equivalent. #TODO after MVP: Include full kwarg-set + #TODO: Switch dtype backend default? Parameters ----------
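
A minimal end-to-end usage sketch of the read_parquet API introduced in this series, adapted from the test added in PATCH 2/4. The temporary directory, file names, and toy tables below are illustrative only, and the sketch assumes pyarrow is installed, since PATCH 3/4 pins engine="pyarrow" for both the base and nested reads.

import os
import tempfile

import pandas as pd
from nested_pandas import read_parquet

tmp_dir = tempfile.mkdtemp()

# Base table: one row per object.
base = pd.DataFrame({"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

# Nested table: several rows per object, aligned to the base index.
nested = pd.DataFrame(
    {"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
    index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)

base.to_parquet(os.path.join(tmp_dir, "base.parquet"))
nested.to_parquet(os.path.join(tmp_dir, "nested.parquet"))

# Read the base file and pack the nested file into a "nested" column,
# keeping only field "c" from the nested parquet.
nf = read_parquet(
    data=os.path.join(tmp_dir, "base.parquet"),
    to_pack={"nested": os.path.join(tmp_dir, "nested.parquet")},
    pack_columns={"nested": ["c"]},
)

print(nf.columns.tolist())       # ['a', 'b', 'nested']
print(nf["nested"].nest.fields)  # the packed sub-columns, i.e. ['c']

Passing columns=["a"] would restrict the base read in the same way, and dtype_backend="pyarrow" can be forwarded to obtain ArrowDtype-backed columns, per the docstring added in PATCH 3/4.
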