Skip to content

Commit 5e681c1

Browse files
phofl authored and mroeschke committed
Infer string storage based on infer_string option (pandas-dev#54794)
1 parent 4b4a2a1 commit 5e681c1

File tree

4 files changed

+20
-3
lines changed

4 files changed

+20
-3
lines changed

doc/source/whatsnew/v2.1.0.rst

+5-1
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,15 @@ We are collecting feedback on this decision `here <https://github.com/pandas-dev
3939
Avoid NumPy object dtype for strings by default
4040
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4141

42-
Previously, all strings were stored in columns with NumPy object dtype.
42+
Previously, all strings were stored in columns with NumPy object dtype by default.
4343
This release introduces an option ``future.infer_string`` that infers all
4444
strings as PyArrow backed strings with dtype ``"string[pyarrow_numpy]"`` instead.
4545
This is a new string dtype implementation that follows NumPy semantics in comparison
4646
operations and will return ``np.nan`` as the missing value indicator.
47+
Setting the option will also infer the dtype ``"string"`` as a :class:`StringDtype` with
48+
storage set to ``"pyarrow_numpy"``, ignoring the value behind the option
49+
``mode.string_storage``.
50+
4751
This option only works if PyArrow is installed. PyArrow backed strings have a
4852
significantly reduced memory footprint and provide a big performance improvement
4953
compared to NumPy object (:issue:`54430`).

pandas/core/arrays/string_.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,11 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override]
115115

116116
def __init__(self, storage=None) -> None:
117117
if storage is None:
118-
storage = get_option("mode.string_storage")
118+
infer_string = get_option("future.infer_string")
119+
if infer_string:
120+
storage = "pyarrow_numpy"
121+
else:
122+
storage = get_option("mode.string_storage")
119123
if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
120124
raise ValueError(
121125
f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."

pandas/core/config_init.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -493,7 +493,8 @@ def use_inf_as_na_cb(key) -> None:
493493

494494
string_storage_doc = """
495495
: string
496-
The default storage for StringDtype.
496+
The default storage for StringDtype. This option is ignored if
497+
``future.infer_string`` is set to True.
497498
"""
498499

499500
with cf.config_prefix("mode"):

pandas/tests/series/test_constructors.py

+8
Original file line numberDiff line numberDiff line change
@@ -2115,6 +2115,14 @@ def test_series_string_inference_array_string_dtype(self):
21152115
ser = Series(np.array(["a", "b"]))
21162116
tm.assert_series_equal(ser, expected)
21172117

2118+
def test_series_string_inference_storage_definition(self):
2119+
# GH#54793
2120+
pytest.importorskip("pyarrow")
2121+
expected = Series(["a", "b"], dtype="string[pyarrow_numpy]")
2122+
with pd.option_context("future.infer_string", True):
2123+
result = Series(["a", "b"], dtype="string")
2124+
tm.assert_series_equal(result, expected)
2125+
21182126

21192127
class TestSeriesConstructorIndexCoercion:
21202128
def test_series_constructor_datetimelike_index_coercion(self):

0 commit comments

Comments
 (0)