Skip to content

Commit 7d96e7b

Browse files
phoflmeeseeksmachine
authored and committed
Backport PR pandas-dev#54794: Infer string storage based on infer_string option
1 parent 0084f77 commit 7d96e7b

File tree

4 files changed

+20
-3
lines changed

4 files changed

+20
-3
lines changed

doc/source/whatsnew/v2.1.0.rst

+5-1
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,15 @@ We are collecting feedback on this decision `here <https://github.com/pandas-dev
3939
Avoid NumPy object dtype for strings by default
4040
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4141

42-
Previously, all strings were stored in columns with NumPy object dtype.
42+
Previously, all strings were stored in columns with NumPy object dtype by default.
4343
This release introduces an option ``future.infer_string`` that infers all
4444
strings as PyArrow backed strings with dtype ``"string[pyarrow_numpy]"`` instead.
4545
This is a new string dtype implementation that follows NumPy semantics in comparison
4646
operations and will return ``np.nan`` as the missing value indicator.
47+
Setting the option will also infer the dtype ``"string"`` as a :class:`StringDtype` with
48+
storage set to ``"pyarrow_numpy"``, ignoring the value behind the option
49+
``mode.string_storage``.
50+
4751
This option only works if PyArrow is installed. PyArrow backed strings have a
4852
significantly reduced memory footprint and provide a big performance improvement
4953
compared to NumPy object (:issue:`54430`).

pandas/core/arrays/string_.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,11 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override]
112112

113113
def __init__(self, storage=None) -> None:
114114
if storage is None:
115-
storage = get_option("mode.string_storage")
115+
infer_string = get_option("future.infer_string")
116+
if infer_string:
117+
storage = "pyarrow_numpy"
118+
else:
119+
storage = get_option("mode.string_storage")
116120
if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
117121
raise ValueError(
118122
f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."

pandas/core/config_init.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -492,7 +492,8 @@ def use_inf_as_na_cb(key) -> None:
492492

493493
string_storage_doc = """
494494
: string
495-
The default storage for StringDtype.
495+
The default storage for StringDtype. This option is ignored if
496+
``future.infer_string`` is set to True.
496497
"""
497498

498499
with cf.config_prefix("mode"):

pandas/tests/series/test_constructors.py

+8
Original file line numberDiff line numberDiff line change
@@ -2115,6 +2115,14 @@ def test_series_string_inference_array_string_dtype(self):
21152115
ser = Series(np.array(["a", "b"]))
21162116
tm.assert_series_equal(ser, expected)
21172117

2118+
def test_series_string_inference_storage_definition(self):
2119+
# GH#54793
2120+
pytest.importorskip("pyarrow")
2121+
expected = Series(["a", "b"], dtype="string[pyarrow_numpy]")
2122+
with pd.option_context("future.infer_string", True):
2123+
result = Series(["a", "b"], dtype="string")
2124+
tm.assert_series_equal(result, expected)
2125+
21182126

21192127
class TestSeriesConstructorIndexCoercion:
21202128
def test_series_constructor_datetimelike_index_coercion(self):

0 commit comments

Comments
 (0)