Skip to content

Commit 2054463

Browse files
[backport 2.3.x] ENH (string dtype): convert string_view columns to future string dtype instead of object dtype in Parquet/Feather IO (#60235) (#60291)
(cherry picked from commit f307a0a)
1 parent 0808657 commit 2054463

File tree

4 files changed

+31
-2
lines changed

4 files changed

+31
-2
lines changed

pandas/compat/__init__.py

+2
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@
3333
pa_version_under14p1,
3434
pa_version_under16p0,
3535
pa_version_under17p0,
36+
pa_version_under18p0,
3637
)
3738

3839
if TYPE_CHECKING:
@@ -191,6 +192,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]:
191192
"pa_version_under14p1",
192193
"pa_version_under16p0",
193194
"pa_version_under17p0",
195+
"pa_version_under18p0",
194196
"HAS_PYARROW",
195197
"IS64",
196198
"ISMUSL",

pandas/compat/pyarrow.py

+2
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
pa_version_under15p0 = _palv < Version("15.0.0")
1818
pa_version_under16p0 = _palv < Version("16.0.0")
1919
pa_version_under17p0 = _palv < Version("17.0.0")
20+
pa_version_under18p0 = _palv < Version("18.0.0")
2021
HAS_PYARROW = True
2122
except ImportError:
2223
pa_version_under10p1 = True
@@ -28,4 +29,5 @@
2829
pa_version_under15p0 = True
2930
pa_version_under16p0 = True
3031
pa_version_under17p0 = True
32+
pa_version_under18p0 = False
3133
HAS_PYARROW = False

pandas/io/_util.py

+7-2
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
import numpy as np
66

7+
from pandas.compat import pa_version_under18p0
78
from pandas.compat._optional import import_optional_dependency
89

910
import pandas as pd
@@ -32,7 +33,11 @@ def _arrow_dtype_mapping() -> dict:
3233
def arrow_string_types_mapper() -> Callable:
3334
pa = import_optional_dependency("pyarrow")
3435

35-
return {
36+
mapping = {
3637
pa.string(): pd.StringDtype(na_value=np.nan),
3738
pa.large_string(): pd.StringDtype(na_value=np.nan),
38-
}.get
39+
}
40+
if not pa_version_under18p0:
41+
mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan)
42+
43+
return mapping.get

pandas/tests/io/test_feather.py

+20
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22
import numpy as np
33
import pytest
44

5+
from pandas.compat.pyarrow import pa_version_under18p0
6+
57
import pandas as pd
68
import pandas._testing as tm
79

@@ -250,3 +252,21 @@ def test_string_inference(self, tmp_path):
250252
data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan)
251253
)
252254
tm.assert_frame_equal(result, expected)
255+
256+
@pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")
257+
def test_string_inference_string_view_type(self, tmp_path):
258+
# GH#54798
259+
import pyarrow as pa
260+
from pyarrow import feather
261+
262+
path = tmp_path / "string_view.parquet"
263+
table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())})
264+
feather.write_feather(table, path)
265+
266+
with pd.option_context("future.infer_string", True):
267+
result = read_feather(path)
268+
269+
expected = pd.DataFrame(
270+
data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan)
271+
)
272+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)