Skip to content

Commit

Permalink
ENH (string dtype): convert string_view columns to future string dtype instead of object dtype in Parquet IO
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Nov 7, 2024
1 parent f9d2e50 commit 2f0272c
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 2 deletions.
9 changes: 7 additions & 2 deletions pandas/io/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import numpy as np

from pandas.compat import pa_version_under18p0
from pandas.compat._optional import import_optional_dependency

import pandas as pd
Expand Down Expand Up @@ -35,7 +36,11 @@ def _arrow_dtype_mapping() -> dict:
def arrow_string_types_mapper() -> Callable:
    """
    Build the lookup that maps pyarrow string types to the pandas string dtype.

    Returns
    -------
    Callable
        The bound ``dict.get`` of the mapping. It returns
        ``pd.StringDtype(na_value=np.nan)`` for ``pa.string()`` and
        ``pa.large_string()`` (and for ``pa.string_view()`` when
        ``pa_version_under18p0`` is false), and ``None`` for any other key.
    """
    pa = import_optional_dependency("pyarrow")

    mapping = {
        pa.string(): pd.StringDtype(na_value=np.nan),
        pa.large_string(): pd.StringDtype(na_value=np.nan),
    }
    # Only register the string_view type when the installed pyarrow is not
    # older than 18.0 (the version flag guards access to ``pa.string_view``).
    if not pa_version_under18p0:
        mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan)

    return mapping.get
21 changes: 21 additions & 0 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
pa_version_under13p0,
pa_version_under15p0,
pa_version_under17p0,
pa_version_under18p0,
)

import pandas as pd
Expand Down Expand Up @@ -1144,6 +1145,26 @@ def test_infer_string_large_string_type(self, tmp_path, pa):
)
tm.assert_frame_equal(result, expected)

@pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")
def test_infer_string_string_view_type(self, tmp_path, pa):
    """
    Check that a Parquet ``string_view`` column is read as the string dtype.

    Writes a one-column table typed as ``string_view`` with pyarrow, reads it
    back with ``read_parquet`` under ``future.infer_string``, and expects both
    the column and the column index to use ``pd.StringDtype(na_value=np.nan)``.
    """
    # GH#54798
    # Import under the full module name so the ``pa`` fixture argument is
    # not rebound inside the test.
    import pyarrow
    import pyarrow.parquet as pq

    path = tmp_path / "string_view.parquet"

    table = pyarrow.table(
        {"a": pyarrow.array([None, "b", "c"], pyarrow.string_view())}
    )
    pq.write_table(table, path)

    with pd.option_context("future.infer_string", True):
        result = read_parquet(path)
    expected = pd.DataFrame(
        data={"a": [None, "b", "c"]},
        dtype=pd.StringDtype(na_value=np.nan),
        columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
    )
    tm.assert_frame_equal(result, expected)

# NOTE: this test is not run by default, because it requires a lot of memory (>5GB)
# @pytest.mark.slow
# def test_string_column_above_2GB(self, tmp_path, pa):
Expand Down

0 comments on commit 2f0272c

Please sign in to comment.