Skip to content

Commit 8982137

Browse files
jorisvandenbossche authored and WillAyd committed
String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy) (pandas-dev#59376)
* String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy)
* add type annotation
1 parent 5d82d05 commit 8982137

File tree

7 files changed

+45
-51
lines changed

7 files changed

+45
-51
lines changed

pandas/conftest.py

-2
Original file line numberDiff line numberDiff line change
@@ -1248,7 +1248,6 @@ def nullable_string_dtype(request):
12481248
params=[
12491249
"python",
12501250
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
1251-
pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")),
12521251
]
12531252
)
12541253
def string_storage(request):
@@ -1257,7 +1256,6 @@ def string_storage(request):
12571256
12581257
* 'python'
12591258
* 'pyarrow'
1260-
* 'pyarrow_numpy'
12611259
"""
12621260
return request.param
12631261

pandas/core/config_init.py

+20-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@
1212
from __future__ import annotations
1313

1414
import os
15-
from typing import Callable
15+
from typing import (
16+
Any,
17+
Callable,
18+
)
1619

1720
import pandas._config.config as cf
1821
from pandas._config.config import (
@@ -506,12 +509,27 @@ def use_inf_as_na_cb(key) -> None:
506509
``future.infer_string`` is set to True.
507510
"""
508511

512+
513+
def is_valid_string_storage(value: Any) -> None:
    """
    Validate a candidate value for the ``mode.string_storage`` option.

    Parameters
    ----------
    value : Any
        The value the user is trying to assign to the option.

    Raises
    ------
    ValueError
        If ``value`` is not one of the legal storage names. When the
        no-longer-accepted ``"pyarrow_numpy"`` is passed, the message
        additionally points to ``pandas.options.future.infer_string``.
    """
    legal_values = ("python", "pyarrow")
    if value in legal_values:
        return

    # Derive the message from legal_values so the listed names can never
    # drift out of sync with what is actually accepted.
    msg = f"Value must be one of {'|'.join(legal_values)}"
    if value == "pyarrow_numpy":
        # TODO: we can remove extra message after 3.0
        msg += (
            ". 'pyarrow_numpy' was specified, but this option should be "
            "enabled using pandas.options.future.infer_string instead"
        )
    raise ValueError(msg)
524+
525+
509526
with cf.config_prefix("mode"):
510527
cf.register_option(
511528
"string_storage",
512529
"python",
513530
string_storage_doc,
514-
validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]),
531+
# validator=is_one_of_factory(["python", "pyarrow"]),
532+
validator=is_valid_string_storage,
515533
)
516534

517535

pandas/tests/arrays/string_/test_string.py

+8-24
Original file line numberDiff line numberDiff line change
@@ -516,50 +516,34 @@ def test_arrow_array(dtype):
516516
assert arr.equals(expected)
517517

518518

519-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
519+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
520520
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
521-
def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
521+
def test_arrow_roundtrip(dtype, string_storage, using_infer_string):
522522
# roundtrip possible from arrow 1.0.0
523523
pa = pytest.importorskip("pyarrow")
524524

525-
if using_infer_string and string_storage2 != "pyarrow_numpy":
526-
request.applymarker(
527-
pytest.mark.xfail(
528-
reason="infer_string takes precedence over string storage"
529-
)
530-
)
531-
532525
data = pd.array(["a", "b", None], dtype=dtype)
533526
df = pd.DataFrame({"a": data})
534527
table = pa.table(df)
535528
if dtype.storage == "python":
536529
assert table.field("a").type == "string"
537530
else:
538531
assert table.field("a").type == "large_string"
539-
with pd.option_context("string_storage", string_storage2):
532+
with pd.option_context("string_storage", string_storage):
540533
result = table.to_pandas()
541534
assert isinstance(result["a"].dtype, pd.StringDtype)
542-
expected = df.astype(f"string[{string_storage2}]")
535+
expected = df.astype(f"string[{string_storage}]")
543536
tm.assert_frame_equal(result, expected)
544537
# ensure the missing value is represented by NA and not np.nan or None
545538
assert result.loc[2, "a"] is result["a"].dtype.na_value
546539

547540

548-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
541+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
549542
@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
550-
def test_arrow_load_from_zero_chunks(
551-
dtype, string_storage2, request, using_infer_string
552-
):
543+
def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string):
553544
# GH-41040
554545
pa = pytest.importorskip("pyarrow")
555546

556-
if using_infer_string and string_storage2 != "pyarrow_numpy":
557-
request.applymarker(
558-
pytest.mark.xfail(
559-
reason="infer_string takes precedence over string storage"
560-
)
561-
)
562-
563547
data = pd.array([], dtype=dtype)
564548
df = pd.DataFrame({"a": data})
565549
table = pa.table(df)
@@ -569,10 +553,10 @@ def test_arrow_load_from_zero_chunks(
569553
assert table.field("a").type == "large_string"
570554
# Instantiate the same table with no chunks at all
571555
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
572-
with pd.option_context("string_storage", string_storage2):
556+
with pd.option_context("string_storage", string_storage):
573557
result = table.to_pandas()
574558
assert isinstance(result["a"].dtype, pd.StringDtype)
575-
expected = df.astype(f"string[{string_storage2}]")
559+
expected = df.astype(f"string[{string_storage}]")
576560
tm.assert_frame_equal(result, expected)
577561

578562

pandas/tests/arrays/string_/test_string_arrow.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,18 @@ def test_eq_all_na():
2727

2828

2929
def test_config(string_storage, request, using_infer_string):
30-
if using_infer_string and string_storage != "pyarrow_numpy":
31-
request.applymarker(pytest.mark.xfail(reason="infer string takes precedence"))
32-
if string_storage == "pyarrow_numpy":
30+
if using_infer_string and string_storage == "python":
31+
# python string storage with na_value=NaN is not yet implemented
3332
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
33+
3434
with pd.option_context("string_storage", string_storage):
3535
assert StringDtype().storage == string_storage
3636
result = pd.array(["a", "b"])
3737
assert result.dtype.storage == string_storage
3838

39-
dtype = StringDtype(string_storage)
39+
dtype = StringDtype(
40+
string_storage, na_value=np.nan if using_infer_string else pd.NA
41+
)
4042
expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype)
4143
tm.assert_equal(result, expected)
4244

pandas/tests/frame/methods/test_astype.py

+9
Original file line numberDiff line numberDiff line change
@@ -912,3 +912,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val):
912912
with option_context("mode.string_storage", string_storage):
913913
df.astype("string", copy=False)
914914
tm.assert_frame_equal(df, expected)
915+
916+
917+
@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
918+
def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val):
919+
# GH#51073 - variant of the above test with explicit dtype instances
920+
df = DataFrame({"a": ["a", "b", val]})
921+
expected = df.copy()
922+
df.astype(any_string_dtype)
923+
tm.assert_frame_equal(df, expected)

pandas/tests/frame/methods/test_convert_dtypes.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111

1212
class TestConvertDtypes:
13+
# TODO convert_dtypes should not use NaN variant of string dtype, but always NA
14+
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1315
@pytest.mark.parametrize(
1416
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
1517
)
@@ -18,9 +20,6 @@ def test_convert_dtypes(
1820
):
1921
# Specific types are tested in tests/series/test_dtypes.py
2022
# Just check that it works for DataFrame here
21-
if using_infer_string:
22-
string_storage = "pyarrow_numpy"
23-
2423
df = pd.DataFrame(
2524
{
2625
"a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),

pandas/tests/io/conftest.py

-16
Original file line numberDiff line numberDiff line change
@@ -224,19 +224,3 @@ def compression_format(request):
224224
@pytest.fixture(params=_compression_formats_params)
225225
def compression_ext(request):
226226
return request.param[0]
227-
228-
229-
@pytest.fixture(
230-
params=[
231-
"python",
232-
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
233-
]
234-
)
235-
def string_storage(request):
236-
"""
237-
Parametrized fixture for pd.options.mode.string_storage.
238-
239-
* 'python'
240-
* 'pyarrow'
241-
"""
242-
return request.param

0 commit comments

Comments
 (0)