From acd303e450b6c4f337001af82e838c8592e4ab98 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Aug 2024 10:57:50 +0200 Subject: [PATCH 1/3] String dtype: honor mode.string_storage option (and change default to None) --- pandas/core/arrays/string_.py | 12 +++++++----- pandas/core/config_init.py | 7 +++---- pandas/tests/arrays/string_/test_string_arrow.py | 6 ------ 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2ba7c9fccbfce..1ec8946dd8a49 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -140,12 +140,14 @@ def __init__( # infer defaults if storage is None: if na_value is not libmissing.NA: - if HAS_PYARROW: - storage = "pyarrow" - else: - storage = "python" - else: storage = get_option("mode.string_storage") + if storage is None: + if HAS_PYARROW: + storage = "pyarrow" + else: + storage = "python" + else: + storage = get_option("mode.string_storage") or "python" if storage == "pyarrow_numpy": # TODO raise a deprecation warning diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index e62cda0dfe8d0..04e98825326f7 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -452,13 +452,12 @@ def is_terminal() -> bool: string_storage_doc = """ : string - The default storage for StringDtype. This option is ignored if - ``future.infer_string`` is set to True. + The default storage for StringDtype. """ def is_valid_string_storage(value: Any) -> None: - legal_values = ["python", "pyarrow"] + legal_values = [None, "python", "pyarrow"] if value not in legal_values: msg = "Value must be one of python|pyarrow" if value == "pyarrow_numpy": @@ -473,7 +472,7 @@ def is_valid_string_storage(value: Any) -> None: with cf.config_prefix("mode"): cf.register_option( "string_storage", - "python", + None, string_storage_doc, # validator=is_one_of_factory(["python", "pyarrow"]), validator=is_valid_string_storage, diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 7d4aae0f7bb4e..d0922af53b6ef 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -28,11 +27,6 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage == "python" and HAS_PYARROW: - # string storage with na_value=NaN always uses pyarrow if available - # -> does not yet honor the option - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) - with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) From 1e646a37fdd1cc4da41993c379ec46e3759e6283 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Aug 2024 13:07:37 +0200 Subject: [PATCH 2/3] fix test + explicitly test default --- pandas/tests/arrays/string_/test_string_arrow.py | 6 +++++- pandas/tests/dtypes/test_common.py | 13 +++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index d0922af53b6ef..0274240030745 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -26,7 +26,11 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage, request, using_infer_string): +def test_config(string_storage, using_infer_string): + # with the default string_storage setting + # always "python" at the moment + assert StringDtype().storage == "python" + with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 4bf97b1fd8494..2c2dff7a957fe 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -802,13 +803,17 @@ def test_pandas_dtype_ea_not_instance(): def test_pandas_dtype_string_dtypes(string_storage): - # TODO(infer_string) remove skip if "python" is supported - pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype("str") + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + with pd.option_context("future.infer_string", True): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") - # TODO(infer_string) hardcoded to pyarrow until python is supported - assert result == pd.StringDtype("pyarrow", na_value=np.nan) + assert result == pd.StringDtype(string_storage, na_value=np.nan) with pd.option_context("future.infer_string", False): with pd.option_context("string_storage", string_storage): From 9e8bb8eb86d9e5898ce051d1a25765198476d4f1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 12 Aug 2024 20:27:53 +0200 Subject: [PATCH 3/3] use 'auto' instead of None --- pandas/core/arrays/string_.py | 6 ++++-- pandas/core/config_init.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1ec8946dd8a49..45dc0ef374b13 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -141,13 +141,15 @@ def __init__( if storage is None: if na_value is not libmissing.NA: storage = get_option("mode.string_storage") - if storage is None: + if storage == "auto": if HAS_PYARROW: storage = "pyarrow" else: storage = "python" else: - storage = get_option("mode.string_storage") or "python" + storage = get_option("mode.string_storage") + if storage == "auto": + storage = "python" if storage == "pyarrow_numpy": # TODO raise a deprecation warning diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 04e98825326f7..e4eefb570fd95 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -457,7 +457,7 @@ def is_terminal() -> bool: def is_valid_string_storage(value: Any) -> None: - legal_values = [None, "python", "pyarrow"] + legal_values = ["auto", "python", "pyarrow"] if value not in legal_values: msg = "Value must be one of python|pyarrow" if value == "pyarrow_numpy": @@ -472,7 +472,7 @@ def is_valid_string_storage(value: Any) -> None: with cf.config_prefix("mode"): cf.register_option( "string_storage", - None, + "auto", string_storage_doc, # validator=is_one_of_factory(["python", "pyarrow"]), validator=is_valid_string_storage,