Skip to content

Commit 6adba55

Browse files
String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy) (#59376)
* String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy)
* add type annotation
1 parent 0d12b44 commit 6adba55

File tree

7 files changed

+42
-50
lines changed

7 files changed

+42
-50
lines changed

pandas/conftest.py

-2
Original file line numberDiff line numberDiff line change
@@ -1296,7 +1296,6 @@ def nullable_string_dtype(request):
12961296
params=[
12971297
"python",
12981298
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
1299-
pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")),
13001299
]
13011300
)
13021301
def string_storage(request):
@@ -1305,7 +1304,6 @@ def string_storage(request):
13051304
13061305
* 'python'
13071306
* 'pyarrow'
1308-
* 'pyarrow_numpy'
13091307
"""
13101308
return request.param
13111309

pandas/core/config_init.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from collections.abc import Callable
1616
import os
17+
from typing import Any
1718

1819
import pandas._config.config as cf
1920
from pandas._config.config import (
@@ -455,12 +456,27 @@ def is_terminal() -> bool:
455456
``future.infer_string`` is set to True.
456457
"""
457458

459+
460+
def is_valid_string_storage(value: Any) -> None:
461+
legal_values = ["python", "pyarrow"]
462+
if value not in legal_values:
463+
msg = "Value must be one of python|pyarrow"
464+
if value == "pyarrow_numpy":
465+
# TODO: we can remove extra message after 3.0
466+
msg += (
467+
". 'pyarrow_numpy' was specified, but this option should be "
468+
"enabled using pandas.options.future.infer_string instead"
469+
)
470+
raise ValueError(msg)
471+
472+
458473
with cf.config_prefix("mode"):
459474
cf.register_option(
460475
"string_storage",
461476
"python",
462477
string_storage_doc,
463-
validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]),
478+
# validator=is_one_of_factory(["python", "pyarrow"]),
479+
validator=is_valid_string_storage,
464480
)
465481

466482

pandas/tests/arrays/string_/test_string.py

+8-24
Original file line numberDiff line numberDiff line change
@@ -514,50 +514,34 @@ def test_arrow_array(dtype):
514514
assert arr.equals(expected)
515515

516516

517-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
517+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
518518
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
519-
def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
519+
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
520520
# roundtrip possible from arrow 1.0.0
521521
pa = pytest.importorskip("pyarrow")
522522

523-
if using_infer_string and string_storage2 != "pyarrow_numpy":
524-
request.applymarker(
525-
pytest.mark.xfail(
526-
reason="infer_string takes precedence over string storage"
527-
)
528-
)
529-
530523
data = pd.array(["a", "b", None], dtype=dtype)
531524
df = pd.DataFrame({"a": data})
532525
table = pa.table(df)
533526
if dtype.storage == "python":
534527
assert table.field("a").type == "string"
535528
else:
536529
assert table.field("a").type == "large_string"
537-
with pd.option_context("string_storage", string_storage2):
530+
with pd.option_context("string_storage", string_storage):
538531
result = table.to_pandas()
539532
assert isinstance(result["a"].dtype, pd.StringDtype)
540-
expected = df.astype(f"string[{string_storage2}]")
533+
expected = df.astype(f"string[{string_storage}]")
541534
tm.assert_frame_equal(result, expected)
542535
# ensure the missing value is represented by NA and not np.nan or None
543536
assert result.loc[2, "a"] is result["a"].dtype.na_value
544537

545538

546-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
539+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
547540
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
548-
def test_arrow_load_from_zero_chunks(
549-
dtype, string_storage2, request, using_infer_string
550-
):
541+
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
551542
# GH-41040
552543
pa = pytest.importorskip("pyarrow")
553544

554-
if using_infer_string and string_storage2 != "pyarrow_numpy":
555-
request.applymarker(
556-
pytest.mark.xfail(
557-
reason="infer_string takes precedence over string storage"
558-
)
559-
)
560-
561545
data = pd.array([], dtype=dtype)
562546
df = pd.DataFrame({"a": data})
563547
table = pa.table(df)
@@ -567,10 +551,10 @@ def test_arrow_load_from_zero_chunks(
567551
assert table.field("a").type == "large_string"
568552
# Instantiate the same table with no chunks at all
569553
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
570-
with pd.option_context("string_storage", string_storage2):
554+
with pd.option_context("string_storage", string_storage):
571555
result = table.to_pandas()
572556
assert isinstance(result["a"].dtype, pd.StringDtype)
573-
expected = df.astype(f"string[{string_storage2}]")
557+
expected = df.astype(f"string[{string_storage}]")
574558
tm.assert_frame_equal(result, expected)
575559

576560

pandas/tests/arrays/string_/test_string_arrow.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,18 @@ def test_eq_all_na():
2727

2828

2929
def test_config(string_storage, request, using_infer_string):
30-
if using_infer_string and string_storage != "pyarrow_numpy":
31-
request.applymarker(pytest.mark.xfail(reason="infer string takes precedence"))
32-
if string_storage == "pyarrow_numpy":
30+
if using_infer_string and string_storage == "python":
31+
# python string storage with na_value=NaN is not yet implemented
3332
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
33+
3434
with pd.option_context("string_storage", string_storage):
3535
assert StringDtype().storage == string_storage
3636
result = pd.array(["a", "b"])
3737
assert result.dtype.storage == string_storage
3838

39-
dtype = StringDtype(string_storage)
39+
dtype = StringDtype(
40+
string_storage, na_value=np.nan if using_infer_string else pd.NA
41+
)
4042
expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
4143
tm.assert_equal(result, expected)
4244

pandas/tests/frame/methods/test_astype.py

+9
Original file line numberDiff line numberDiff line change
@@ -897,3 +897,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val):
897897
with option_context("mode.string_storage", string_storage):
898898
df.astype("string")
899899
tm.assert_frame_equal(df, expected)
900+
901+
902+
@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
903+
def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val):
904+
# GH#51073 - variant of the above test with explicit dtype instances
905+
df = DataFrame({"a": ["a", "b", val]})
906+
expected = df.copy()
907+
df.astype(any_string_dtype)
908+
tm.assert_frame_equal(df, expected)

pandas/tests/frame/methods/test_convert_dtypes.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111

1212
class TestConvertDtypes:
13+
# TODO convert_dtypes should not use NaN variant of string dtype, but always NA
14+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1315
@pytest.mark.parametrize(
1416
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
1517
)
@@ -18,9 +20,6 @@ def test_convert_dtypes(
1820
):
1921
# Specific types are tested in tests/series/test_dtypes.py
2022
# Just check that it works for DataFrame here
21-
if using_infer_string:
22-
string_storage = "pyarrow_numpy"
23-
2423
df = pd.DataFrame(
2524
{
2625
"a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),

pandas/tests/io/conftest.py

-16
Original file line numberDiff line numberDiff line change
@@ -224,19 +224,3 @@ def compression_format(request):
224224
@pytest.fixture(params=_compression_formats_params)
225225
def compression_ext(request):
226226
return request.param[0]
227-
228-
229-
@pytest.fixture(
230-
params=[
231-
"python",
232-
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
233-
]
234-
)
235-
def string_storage(request):
236-
"""
237-
Parametrized fixture for pd.options.mode.string_storage.
238-
239-
* 'python'
240-
* 'pyarrow'
241-
"""
242-
return request.param

0 commit comments

Comments
 (0)