Skip to content

Commit c638e69

Browse files
[backport 2.3.x] Update PyArrow conversion and arrow/parquet tests for pyarrow 19.0 (pandas-dev#60716) (pandas-dev#60755)
Co-authored-by: Matthew Roeschke <[email protected]>
(cherry picked from commit 5efac82)
* fixup
* don't hardcode object dtype
* also enable CoW when enabling future.infer_string
1 parent 36d34a1 commit c638e69

File tree

9 files changed

+103
-40
lines changed

9 files changed

+103
-40
lines changed

.github/workflows/unit-tests.yml

+2
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ jobs:
105105
- name: "Pyarrow Nightly"
106106
env_file: actions-311-pyarrownightly.yaml
107107
pattern: "not slow and not network and not single_cpu"
108+
pandas_future_infer_string: "1"
109+
pandas_copy_on_write: "1"
108110
fail-fast: false
109111
name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}
110112
env:

ci/deps/actions-311-pyarrownightly.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ dependencies:
2525

2626
- pip:
2727
- "tzdata>=2022.7"
28-
- "--extra-index-url https://pypi.fury.io/arrow-nightlies/"
28+
- "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
2929
- "--prefer-binary"
3030
- "--pre"
3131
- "pyarrow"

pandas/compat/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
pa_version_under16p0,
3535
pa_version_under17p0,
3636
pa_version_under18p0,
37+
pa_version_under19p0,
3738
)
3839

3940
if TYPE_CHECKING:
@@ -193,6 +194,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]:
193194
"pa_version_under16p0",
194195
"pa_version_under17p0",
195196
"pa_version_under18p0",
197+
"pa_version_under19p0",
196198
"HAS_PYARROW",
197199
"IS64",
198200
"ISMUSL",

pandas/compat/pyarrow.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
pa_version_under16p0 = _palv < Version("16.0.0")
1919
pa_version_under17p0 = _palv < Version("17.0.0")
2020
pa_version_under18p0 = _palv < Version("18.0.0")
21+
pa_version_under19p0 = _palv < Version("19.0.0")
2122
HAS_PYARROW = True
2223
except ImportError:
2324
pa_version_under10p1 = True
@@ -29,5 +30,6 @@
2930
pa_version_under15p0 = True
3031
pa_version_under16p0 = True
3132
pa_version_under17p0 = True
32-
pa_version_under18p0 = False
33+
pa_version_under18p0 = True
34+
pa_version_under19p0 = True
3335
HAS_PYARROW = False

pandas/io/_util.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010
from pandas._config import using_string_dtype
1111

1212
from pandas._libs import lib
13-
from pandas.compat import pa_version_under18p0
13+
from pandas.compat import (
14+
pa_version_under18p0,
15+
pa_version_under19p0,
16+
)
1417
from pandas.compat._optional import import_optional_dependency
1518

1619
import pandas as pd
@@ -78,7 +81,10 @@ def arrow_table_to_pandas(
7881
elif dtype_backend == "pyarrow":
7982
types_mapper = pd.ArrowDtype
8083
elif using_string_dtype():
81-
types_mapper = _arrow_string_types_mapper()
84+
if pa_version_under19p0:
85+
types_mapper = _arrow_string_types_mapper()
86+
else:
87+
types_mapper = None
8288
elif dtype_backend is lib.no_default or dtype_backend == "numpy":
8389
types_mapper = None
8490
else:

pandas/tests/arrays/string_/test_string.py

+20-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@
99

1010
from pandas._config import using_string_dtype
1111

12-
from pandas.compat.pyarrow import pa_version_under12p0
12+
from pandas.compat.pyarrow import (
13+
pa_version_under12p0,
14+
pa_version_under19p0,
15+
)
1316

1417
from pandas.core.dtypes.common import is_dtype_equal
1518

@@ -541,7 +544,7 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
541544
assert table.field("a").type == "large_string"
542545
with pd.option_context("string_storage", string_storage):
543546
result = table.to_pandas()
544-
if dtype.na_value is np.nan and not using_string_dtype():
547+
if dtype.na_value is np.nan and not using_infer_string:
545548
assert result["a"].dtype == "object"
546549
else:
547550
assert isinstance(result["a"].dtype, pd.StringDtype)
@@ -555,6 +558,21 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
555558
assert result.loc[2, "a"] is result["a"].dtype.na_value
556559

557560

561+
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
562+
def test_arrow_from_string(using_infer_string):
563+
# not roundtrip, but starting with pyarrow table without pandas metadata
564+
pa = pytest.importorskip("pyarrow")
565+
table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())})
566+
567+
result = table.to_pandas()
568+
569+
if using_infer_string and not pa_version_under19p0:
570+
expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str")
571+
else:
572+
expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object")
573+
tm.assert_frame_equal(result, expected)
574+
575+
558576
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
559577
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
560578
# GH-41040

pandas/tests/io/test_common.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import pytest
2020

2121
from pandas.compat import is_platform_windows
22+
from pandas.compat.pyarrow import pa_version_under19p0
2223
import pandas.util._test_decorators as td
2324

2425
import pandas as pd
@@ -166,8 +167,8 @@ def test_get_handle_pyarrow_compat(self):
166167
s = StringIO(data)
167168
with icom.get_handle(s, "rb", is_text=False) as handles:
168169
df = pa_csv.read_csv(handles.handle).to_pandas()
169-
# TODO will have to update this when pyarrow' to_pandas() is fixed
170-
expected = expected.astype("object")
170+
if pa_version_under19p0:
171+
expected = expected.astype("object")
171172
tm.assert_frame_equal(df, expected)
172173
assert not s.closed
173174

pandas/tests/io/test_feather.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
import numpy as np
33
import pytest
44

5-
from pandas.compat.pyarrow import pa_version_under18p0
5+
from pandas.compat.pyarrow import (
6+
pa_version_under18p0,
7+
pa_version_under19p0,
8+
)
69

710
import pandas as pd
811
import pandas._testing as tm
@@ -133,17 +136,17 @@ def test_rw_use_threads(self):
133136
def test_path_pathlib(self):
134137
df = pd.DataFrame(
135138
1.1 * np.arange(120).reshape((30, 4)),
136-
columns=pd.Index(list("ABCD"), dtype=object),
137-
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
139+
columns=pd.Index(list("ABCD")),
140+
index=pd.Index([f"i-{i}" for i in range(30)]),
138141
).reset_index()
139142
result = tm.round_trip_pathlib(df.to_feather, read_feather)
140143
tm.assert_frame_equal(df, result)
141144

142145
def test_path_localpath(self):
143146
df = pd.DataFrame(
144147
1.1 * np.arange(120).reshape((30, 4)),
145-
columns=pd.Index(list("ABCD"), dtype=object),
146-
index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
148+
columns=pd.Index(list("ABCD")),
149+
index=pd.Index([f"i-{i}" for i in range(30)]),
147150
).reset_index()
148151
result = tm.round_trip_localpath(df.to_feather, read_feather)
149152
tm.assert_frame_equal(df, result)
@@ -241,16 +244,27 @@ def test_invalid_dtype_backend(self):
241244
with pytest.raises(ValueError, match=msg):
242245
read_feather(path, dtype_backend="numpy")
243246

244-
def test_string_inference(self, tmp_path):
247+
def test_string_inference(self, tmp_path, using_infer_string):
245248
# GH#54431
246249
path = tmp_path / "test_string_inference.p"
247250
df = pd.DataFrame(data={"a": ["x", "y"]})
248251
df.to_feather(path)
249252
with pd.option_context("future.infer_string", True):
250253
result = read_feather(path)
254+
dtype = pd.StringDtype(na_value=np.nan)
251255
expected = pd.DataFrame(
252256
data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan)
253257
)
258+
expected = pd.DataFrame(
259+
data={"a": ["x", "y"]},
260+
dtype=dtype,
261+
columns=pd.Index(
262+
["a"],
263+
dtype=object
264+
if pa_version_under19p0 and not using_infer_string
265+
else dtype,
266+
),
267+
)
254268
tm.assert_frame_equal(result, expected)
255269

256270
@pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")

pandas/tests/io/test_parquet.py

+44-26
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
pa_version_under11p0,
2020
pa_version_under13p0,
2121
pa_version_under15p0,
22+
pa_version_under19p0,
2223
)
2324

2425
import pandas as pd
@@ -110,10 +111,7 @@ def fp(request):
110111

111112
@pytest.fixture
112113
def df_compat():
113-
# TODO(infer_string) should this give str columns?
114-
return pd.DataFrame(
115-
{"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object)
116-
)
114+
return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"]))
117115

118116

119117
@pytest.fixture
@@ -261,8 +259,10 @@ def test_invalid_engine(df_compat):
261259
check_round_trip(df_compat, "foo", "bar")
262260

263261

264-
def test_options_py(df_compat, pa):
262+
def test_options_py(df_compat, pa, using_infer_string):
265263
# use the set option
264+
if using_infer_string and not pa_version_under19p0:
265+
df_compat.columns = df_compat.columns.astype("str")
266266

267267
with pd.option_context("io.parquet.engine", "pyarrow"):
268268
check_round_trip(df_compat)
@@ -798,18 +798,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type):
798798

799799
def test_categorical(self, pa):
800800
# supported in >= 0.7.0
801-
df = pd.DataFrame()
802-
df["a"] = pd.Categorical(list("abcdef"))
803-
804-
# test for null, out-of-order values, and unobserved category
805-
df["b"] = pd.Categorical(
806-
["bar", "foo", "foo", "bar", None, "bar"],
807-
dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
808-
)
809-
810-
# test for ordered flag
811-
df["c"] = pd.Categorical(
812-
["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True
801+
df = pd.DataFrame(
802+
{
803+
"a": pd.Categorical(list("abcdef")),
804+
# test for null, out-of-order values, and unobserved category
805+
"b": pd.Categorical(
806+
["bar", "foo", "foo", "bar", None, "bar"],
807+
dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
808+
),
809+
# test for ordered flag
810+
"c": pd.Categorical(
811+
["a", "b", "c", "a", "c", "b"],
812+
categories=["b", "c", "d"],
813+
ordered=True,
814+
),
815+
}
813816
)
814817

815818
check_round_trip(df, pa)
@@ -878,11 +881,13 @@ def test_s3_roundtrip_for_dir(
878881
repeat=1,
879882
)
880883

881-
def test_read_file_like_obj_support(self, df_compat):
884+
def test_read_file_like_obj_support(self, df_compat, using_infer_string):
882885
pytest.importorskip("pyarrow")
883886
buffer = BytesIO()
884887
df_compat.to_parquet(buffer)
885888
df_from_buf = read_parquet(buffer)
889+
if using_infer_string and not pa_version_under19p0:
890+
df_compat.columns = df_compat.columns.astype("str")
886891
tm.assert_frame_equal(df_compat, df_from_buf)
887892

888893
def test_expand_user(self, df_compat, monkeypatch):
@@ -949,7 +954,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string):
949954
"c": pd.Series(["a", None, "c"], dtype="string"),
950955
}
951956
)
952-
if using_infer_string:
957+
if using_infer_string and pa_version_under19p0:
953958
check_round_trip(df, pa, expected=df.astype({"c": "str"}))
954959
else:
955960
check_round_trip(df, pa)
@@ -963,7 +968,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_strin
963968
df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
964969
with pd.option_context("string_storage", string_storage):
965970
if using_infer_string:
966-
expected = df.astype("str")
971+
if pa_version_under19p0:
972+
expected = df.astype("str")
973+
else:
974+
expected = df.astype(f"string[{string_storage}]")
967975
expected.columns = expected.columns.astype("str")
968976
else:
969977
expected = df.astype(f"string[{string_storage}]")
@@ -1128,17 +1136,24 @@ def test_df_attrs_persistence(self, tmp_path, pa):
11281136
new_df = read_parquet(path, engine=pa)
11291137
assert new_df.attrs == df.attrs
11301138

1131-
def test_string_inference(self, tmp_path, pa):
1139+
def test_string_inference(self, tmp_path, pa, using_infer_string):
11321140
# GH#54431
11331141
path = tmp_path / "test_string_inference.p"
11341142
df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
1135-
df.to_parquet(path, engine="pyarrow")
1143+
df.to_parquet(path, engine=pa)
11361144
with pd.option_context("future.infer_string", True):
1137-
result = read_parquet(path, engine="pyarrow")
1145+
result = read_parquet(path, engine=pa)
1146+
dtype = pd.StringDtype(na_value=np.nan)
11381147
expected = pd.DataFrame(
11391148
data={"a": ["x", "y"]},
1140-
dtype=pd.StringDtype(na_value=np.nan),
1141-
index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
1149+
dtype=dtype,
1150+
index=pd.Index(["a", "b"], dtype=dtype),
1151+
columns=pd.Index(
1152+
["a"],
1153+
dtype=object
1154+
if pa_version_under19p0 and not using_infer_string
1155+
else dtype,
1156+
),
11421157
)
11431158
tm.assert_frame_equal(result, expected)
11441159

@@ -1151,7 +1166,10 @@ def test_roundtrip_decimal(self, tmp_path, pa):
11511166
df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]")
11521167
df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))]))
11531168
result = read_parquet(path)
1154-
expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]")
1169+
if pa_version_under19p0:
1170+
expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]")
1171+
else:
1172+
expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object")
11551173
tm.assert_frame_equal(result, expected)
11561174

11571175
def test_infer_string_large_string_type(self, tmp_path, pa):

0 commit comments

Comments
 (0)