Skip to content

Commit f307a0a

Browse files
ENH (string dtype): convert string_view columns to future string dtype instead of object dtype in Parquet/Feather IO (pandas-dev#60235)
* ENH (string dtype): convert string_view columns to future string dtype instead of object dtype in Parquet IO
* move test to feather
* fixup
1 parent cccf1e6 commit f307a0a

File tree

2 files changed

+27
-2
lines changed

2 files changed

+27
-2
lines changed

pandas/io/_util.py

+7 -2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import numpy as np
66

7+
from pandas.compat import pa_version_under18p0
78
from pandas.compat._optional import import_optional_dependency
89

910
import pandas as pd
@@ -35,7 +36,11 @@ def _arrow_dtype_mapping() -> dict:
3536
def arrow_string_types_mapper() -> Callable:
3637
pa = import_optional_dependency("pyarrow")
3738

38-
return {
39+
mapping = {
3940
pa.string(): pd.StringDtype(na_value=np.nan),
4041
pa.large_string(): pd.StringDtype(na_value=np.nan),
41-
}.get
42+
}
43+
if not pa_version_under18p0:
44+
mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan)
45+
46+
return mapping.get

pandas/tests/io/test_feather.py

+20
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import numpy as np
77
import pytest
88

9+
from pandas.compat.pyarrow import pa_version_under18p0
10+
911
import pandas as pd
1012
import pandas._testing as tm
1113

@@ -249,6 +251,24 @@ def test_string_inference(self, tmp_path):
249251
)
250252
tm.assert_frame_equal(result, expected)
251253

254+
@pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")
255+
def test_string_inference_string_view_type(self, tmp_path):
256+
# GH#54798
257+
import pyarrow as pa
258+
from pyarrow import feather
259+
260+
path = tmp_path / "string_view.parquet"
261+
table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())})
262+
feather.write_feather(table, path)
263+
264+
with pd.option_context("future.infer_string", True):
265+
result = read_feather(path)
266+
267+
expected = pd.DataFrame(
268+
data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan)
269+
)
270+
tm.assert_frame_equal(result, expected)
271+
252272
def test_out_of_bounds_datetime_to_feather(self):
253273
# GH#47832
254274
df = pd.DataFrame(

0 commit comments

Comments
 (0)