
Commit afc8edd

Update PyArrow conversion and arrow/parquet tests for pyarrow 19.0 (pandas-dev#60716)
* Update PyArrow conversion and arrow/parquet tests for pyarrow 19.0
* Update the PyPI extra index used for Arrow nightly wheels
* Add extra filterwarnings
* More test updates
* Temporarily enable the infer_string option in the PyArrow Nightly CI job
* Adapt test_get_handle_pyarrow_compat for pyarrow 19
* Use pa_version_under19p0 in test_get_handle_pyarrow_compat
* Adjust test_string_inference for using_infer_string
* Fix test_string_inference for feather

---------

Co-authored-by: Matthew Roeschke <[email protected]>

(cherry picked from commit 5efac82)
1 parent 36d34a1 commit afc8edd

File tree: 9 files changed (+97 −48 lines)
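Before the per-file diffs, a short sketch of the behaviour this commit adapts to. This is a hedged illustration, not code from the commit; it assumes pandas 2.3+ with the future.infer_string option available and pyarrow installed, and mirrors the expectation encoded by the new test_arrow_from_string test below.

 import pandas as pd
 import pyarrow as pa

 from pandas.compat.pyarrow import pa_version_under19p0

 # A pyarrow table without pandas metadata, as in test_arrow_from_string below.
 table = pa.table({"a": pa.array(["x", "y", None], type=pa.string())})

 with pd.option_context("future.infer_string", True):
     result = table.to_pandas()

 # Expectation encoded by the updated tests: pyarrow >= 19 converts Arrow strings
 # to the pandas "str" dtype on its own, while older pyarrow still produces object
 # dtype unless pandas supplies an explicit types_mapper (see pandas/io/_util.py).
 print(result["a"].dtype)  # "str" on pyarrow >= 19 with infer_string, else object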

.github/workflows/unit-tests.yml (+1)

@@ -105,6 +105,7 @@ jobs:
           - name: "Pyarrow Nightly"
             env_file: actions-311-pyarrownightly.yaml
             pattern: "not slow and not network and not single_cpu"
+            pandas_future_infer_string: "1"
       fail-fast: false
     name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}
     env:

ci/deps/actions-311-pyarrownightly.yaml (+1 −1)

@@ -25,7 +25,7 @@ dependencies:

   - pip:
       - "tzdata>=2022.7"
-      - "--extra-index-url https://pypi.fury.io/arrow-nightlies/"
+      - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
       - "--prefer-binary"
       - "--pre"
       - "pyarrow"

pandas/compat/__init__.py (+2 −16)

@@ -14,18 +14,9 @@
 import sys
 from typing import TYPE_CHECKING

-from pandas.compat._constants import (
-    IS64,
-    ISMUSL,
-    PY310,
-    PY311,
-    PY312,
-    PYPY,
-)
 import pandas.compat.compressors
 from pandas.compat.numpy import is_numpy_dev
 from pandas.compat.pyarrow import (
-    HAS_PYARROW,
     pa_version_under10p1,
     pa_version_under11p0,
     pa_version_under13p0,
@@ -34,6 +25,7 @@
     pa_version_under16p0,
     pa_version_under17p0,
     pa_version_under18p0,
+    pa_version_under19p0,
 )

 if TYPE_CHECKING:
@@ -193,11 +185,5 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]:
     "pa_version_under16p0",
     "pa_version_under17p0",
     "pa_version_under18p0",
-    "HAS_PYARROW",
-    "IS64",
-    "ISMUSL",
-    "PY310",
-    "PY311",
-    "PY312",
-    "PYPY",
+    "pa_version_under19p0",
 ]

pandas/compat/pyarrow.py (+3 −1)

@@ -18,6 +18,7 @@
     pa_version_under16p0 = _palv < Version("16.0.0")
     pa_version_under17p0 = _palv < Version("17.0.0")
     pa_version_under18p0 = _palv < Version("18.0.0")
+    pa_version_under19p0 = _palv < Version("19.0.0")
     HAS_PYARROW = True
 except ImportError:
     pa_version_under10p1 = True
@@ -29,5 +30,6 @@
     pa_version_under15p0 = True
     pa_version_under16p0 = True
     pa_version_under17p0 = True
-    pa_version_under18p0 = False
+    pa_version_under18p0 = True
+    pa_version_under19p0 = True
     HAS_PYARROW = False
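The new flag follows the existing compat pattern. As a hedged illustration (the test below is hypothetical and not part of this commit), downstream test code can skip or branch on it the same way the existing pa_version_under18p0 skipif in pandas/tests/io/test_feather.py below does:

 import pytest

 from pandas.compat.pyarrow import pa_version_under19p0


 # Hypothetical example: guard a test on the new flag, mirroring the existing
 # pa_version_under18p0 skipif used in test_feather.py.
 @pytest.mark.skipif(pa_version_under19p0, reason="requires pyarrow >= 19.0")
 def test_needs_pyarrow_19():
     pa = pytest.importorskip("pyarrow")
     assert int(pa.__version__.split(".")[0]) >= 19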

pandas/io/_util.py (+8 −2)

@@ -10,7 +10,10 @@
 from pandas._config import using_string_dtype

 from pandas._libs import lib
-from pandas.compat import pa_version_under18p0
+from pandas.compat import (
+    pa_version_under18p0,
+    pa_version_under19p0,
+)
 from pandas.compat._optional import import_optional_dependency

 import pandas as pd
@@ -78,7 +81,10 @@ def arrow_table_to_pandas(
     elif dtype_backend == "pyarrow":
         types_mapper = pd.ArrowDtype
     elif using_string_dtype():
-        types_mapper = _arrow_string_types_mapper()
+        if pa_version_under19p0:
+            types_mapper = _arrow_string_types_mapper()
+        else:
+            types_mapper = None
     elif dtype_backend is lib.no_default or dtype_backend == "numpy":
         types_mapper = None
     else:
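A condensed sketch of the resulting mapper selection. The helper name is hypothetical; the real logic lives inline in arrow_table_to_pandas and also covers the numpy_nullable and pyarrow dtype backends shown above. It assumes a pandas version containing this change (so pa_version_under19p0 is importable from pandas.compat); _arrow_string_types_mapper is the existing helper in pandas/io/_util.py.

 from pandas._config import using_string_dtype
 from pandas.compat import pa_version_under19p0
 from pandas.io._util import _arrow_string_types_mapper  # existing pandas helper


 def pick_string_types_mapper():
     # Hypothetical condensation of the branch added above, for illustration only.
     if using_string_dtype() and pa_version_under19p0:
         # Older pyarrow: pandas must map Arrow string types to its string dtype itself.
         return _arrow_string_types_mapper()
     # pyarrow >= 19 performs the str-dtype conversion natively in to_pandas(),
     # so no explicit types_mapper is needed.
     return None

The chosen mapper (or None) is then handed to pyarrow, e.g. table.to_pandas(types_mapper=pick_string_types_mapper()).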

pandas/tests/arrays/string_/test_string.py (+20 −2)

@@ -9,7 +9,10 @@

 from pandas._config import using_string_dtype

-from pandas.compat.pyarrow import pa_version_under12p0
+from pandas.compat.pyarrow import (
+    pa_version_under12p0,
+    pa_version_under19p0,
+)

 from pandas.core.dtypes.common import is_dtype_equal

@@ -541,7 +544,7 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
     assert table.field("a").type == "large_string"
     with pd.option_context("string_storage", string_storage):
         result = table.to_pandas()
-    if dtype.na_value is np.nan and not using_string_dtype():
+    if dtype.na_value is np.nan and not using_infer_string:
         assert result["a"].dtype == "object"
     else:
         assert isinstance(result["a"].dtype, pd.StringDtype)
@@ -555,6 +558,21 @@ def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
     assert result.loc[2, "a"] is result["a"].dtype.na_value


+@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+def test_arrow_from_string(using_infer_string):
+    # not roundtrip, but starting with pyarrow table without pandas metadata
+    pa = pytest.importorskip("pyarrow")
+    table = pa.table({"a": pa.array(["a", "b", None], type=pa.string())})
+
+    result = table.to_pandas()
+
+    if using_infer_string and not pa_version_under19p0:
+        expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="str")
+    else:
+        expected = pd.DataFrame({"a": ["a", "b", None]}, dtype="object")
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
 def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
     # GH-41040

pandas/tests/io/test_common.py (+3 −2)

@@ -19,6 +19,7 @@
 import pytest

 from pandas.compat import is_platform_windows
+from pandas.compat.pyarrow import pa_version_under19p0
 import pandas.util._test_decorators as td

 import pandas as pd
@@ -166,8 +167,8 @@ def test_get_handle_pyarrow_compat(self):
         s = StringIO(data)
         with icom.get_handle(s, "rb", is_text=False) as handles:
             df = pa_csv.read_csv(handles.handle).to_pandas()
-            # TODO will have to update this when pyarrow' to_pandas() is fixed
-            expected = expected.astype("object")
+            if pa_version_under19p0:
+                expected = expected.astype("object")
             tm.assert_frame_equal(df, expected)
         assert not s.closed


pandas/tests/io/test_feather.py (+16 −2)

@@ -2,7 +2,10 @@
 import numpy as np
 import pytest

-from pandas.compat.pyarrow import pa_version_under18p0
+from pandas.compat.pyarrow import (
+    pa_version_under18p0,
+    pa_version_under19p0,
+)

 import pandas as pd
 import pandas._testing as tm
@@ -241,16 +244,27 @@ def test_invalid_dtype_backend(self):
         with pytest.raises(ValueError, match=msg):
             read_feather(path, dtype_backend="numpy")

-    def test_string_inference(self, tmp_path):
+    def test_string_inference(self, tmp_path, using_infer_string):
         # GH#54431
         path = tmp_path / "test_string_inference.p"
         df = pd.DataFrame(data={"a": ["x", "y"]})
         df.to_feather(path)
         with pd.option_context("future.infer_string", True):
             result = read_feather(path)
+        dtype = pd.StringDtype(na_value=np.nan)
         expected = pd.DataFrame(
             data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan)
         )
+        expected = pd.DataFrame(
+            data={"a": ["x", "y"]},
+            dtype=dtype,
+            columns=pd.Index(
+                ["a"],
+                dtype=object
+                if pa_version_under19p0 and not using_infer_string
+                else dtype,
+            ),
+        )
         tm.assert_frame_equal(result, expected)

     @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")

pandas/tests/io/test_parquet.py (+43 −22)

@@ -19,6 +19,7 @@
     pa_version_under11p0,
     pa_version_under13p0,
     pa_version_under15p0,
+    pa_version_under19p0,
 )

 import pandas as pd
@@ -261,8 +262,10 @@ def test_invalid_engine(df_compat):
         check_round_trip(df_compat, "foo", "bar")


-def test_options_py(df_compat, pa):
+def test_options_py(df_compat, pa, using_infer_string):
     # use the set option
+    if using_infer_string and not pa_version_under19p0:
+        df_compat.columns = df_compat.columns.astype("str")

     with pd.option_context("io.parquet.engine", "pyarrow"):
         check_round_trip(df_compat)
@@ -798,18 +801,21 @@ def test_unsupported_float16_cleanup(self, pa, path_type):

     def test_categorical(self, pa):
         # supported in >= 0.7.0
-        df = pd.DataFrame()
-        df["a"] = pd.Categorical(list("abcdef"))
-
-        # test for null, out-of-order values, and unobserved category
-        df["b"] = pd.Categorical(
-            ["bar", "foo", "foo", "bar", None, "bar"],
-            dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
-        )
-
-        # test for ordered flag
-        df["c"] = pd.Categorical(
-            ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True
+        df = pd.DataFrame(
+            {
+                "a": pd.Categorical(list("abcdef")),
+                # test for null, out-of-order values, and unobserved category
+                "b": pd.Categorical(
+                    ["bar", "foo", "foo", "bar", None, "bar"],
+                    dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
+                ),
+                # test for ordered flag
+                "c": pd.Categorical(
+                    ["a", "b", "c", "a", "c", "b"],
+                    categories=["b", "c", "d"],
+                    ordered=True,
+                ),
+            }
         )

         check_round_trip(df, pa)
@@ -878,11 +884,13 @@ def test_s3_roundtrip_for_dir(
             repeat=1,
         )

-    def test_read_file_like_obj_support(self, df_compat):
+    def test_read_file_like_obj_support(self, df_compat, using_infer_string):
         pytest.importorskip("pyarrow")
         buffer = BytesIO()
         df_compat.to_parquet(buffer)
         df_from_buf = read_parquet(buffer)
+        if using_infer_string and not pa_version_under19p0:
+            df_compat.columns = df_compat.columns.astype("str")
         tm.assert_frame_equal(df_compat, df_from_buf)

     def test_expand_user(self, df_compat, monkeypatch):
@@ -949,7 +957,7 @@ def test_additional_extension_arrays(self, pa, using_infer_string):
                 "c": pd.Series(["a", None, "c"], dtype="string"),
             }
         )
-        if using_infer_string:
+        if using_infer_string and pa_version_under19p0:
             check_round_trip(df, pa, expected=df.astype({"c": "str"}))
         else:
             check_round_trip(df, pa)
@@ -963,7 +971,10 @@ def test_pyarrow_backed_string_array(self, pa, string_storage, using_infer_string):
         df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
         with pd.option_context("string_storage", string_storage):
             if using_infer_string:
-                expected = df.astype("str")
+                if pa_version_under19p0:
+                    expected = df.astype("str")
+                else:
+                    expected = df.astype(f"string[{string_storage}]")
                 expected.columns = expected.columns.astype("str")
             else:
                 expected = df.astype(f"string[{string_storage}]")
@@ -1128,17 +1139,24 @@ def test_df_attrs_persistence(self, tmp_path, pa):
         new_df = read_parquet(path, engine=pa)
         assert new_df.attrs == df.attrs

-    def test_string_inference(self, tmp_path, pa):
+    def test_string_inference(self, tmp_path, pa, using_infer_string):
         # GH#54431
         path = tmp_path / "test_string_inference.p"
         df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
-        df.to_parquet(path, engine="pyarrow")
+        df.to_parquet(path, engine=pa)
         with pd.option_context("future.infer_string", True):
-            result = read_parquet(path, engine="pyarrow")
+            result = read_parquet(path, engine=pa)
+        dtype = pd.StringDtype(na_value=np.nan)
         expected = pd.DataFrame(
             data={"a": ["x", "y"]},
-            dtype=pd.StringDtype(na_value=np.nan),
-            index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
+            dtype=dtype,
+            index=pd.Index(["a", "b"], dtype=dtype),
+            columns=pd.Index(
+                ["a"],
+                dtype=object
+                if pa_version_under19p0 and not using_infer_string
+                else dtype,
+            ),
         )
         tm.assert_frame_equal(result, expected)

@@ -1151,7 +1169,10 @@ def test_roundtrip_decimal(self, tmp_path, pa):
         df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]")
         df.to_parquet(path, schema=pa.schema([("a", pa.decimal128(5))]))
         result = read_parquet(path)
-        expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]")
+        if pa_version_under19p0:
+            expected = pd.DataFrame({"a": ["123"]}, dtype="string[python]")
+        else:
+            expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object")
         tm.assert_frame_equal(result, expected)

     def test_infer_string_large_string_type(self, tmp_path, pa):
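As a final illustration of why test_roundtrip_decimal changed: on pyarrow >= 19 the decimal128 column is expected to read back as object-dtype Decimal values rather than a string dtype. The sketch below is hedged and not part of the diff; it mirrors the test but uses an in-memory buffer instead of the tmp_path fixture.

 from decimal import Decimal
 from io import BytesIO

 import pandas as pd
 import pyarrow as pa

 from pandas.compat.pyarrow import pa_version_under19p0

 # Write a string[pyarrow] column as decimal128, then read it back.
 df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]")
 buf = BytesIO()
 df.to_parquet(buf, schema=pa.schema([("a", pa.decimal128(5))]))
 result = pd.read_parquet(buf)

 if pa_version_under19p0:
     print(result["a"].dtype)  # string[python]: values come back as strings
 else:
     print(result["a"].dtype)  # object: values are decimal.Decimal, per the new expectation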
