File tree 4 files changed +20
-3
lines changed
4 files changed +20
-3
lines changed Original file line number Diff line number Diff line change @@ -39,11 +39,15 @@ We are collecting feedback on this decision `here <https://github.com/pandas-dev
39
39
Avoid NumPy object dtype for strings by default
40
40
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
41
41
42
- Previously, all strings were stored in columns with NumPy object dtype.
42
+ Previously, all strings were stored in columns with NumPy object dtype by default.
43
43
This release introduces an option ``future.infer_string`` that infers all
44
44
strings as PyArrow backed strings with dtype ``"string[pyarrow_numpy]"`` instead.
45
45
This is a new string dtype implementation that follows NumPy semantics in comparison
46
46
operations and will return ``np.nan`` as the missing value indicator.
47
+ Setting the option will also infer the dtype ``"string"`` as a :class:`StringDtype` with
48
+ storage set to ``"pyarrow_numpy"``, ignoring the value behind the option
49
+ ``mode.string_storage``.
50
+
47
51
This option only works if PyArrow is installed. PyArrow backed strings have a
48
52
significantly reduced memory footprint and provide a big performance improvement
49
53
compared to NumPy object (:issue:`54430`).
Original file line number Diff line number Diff line change @@ -115,7 +115,11 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override]
115
115
116
116
def __init__ (self , storage = None ) -> None :
117
117
if storage is None :
118
- storage = get_option ("mode.string_storage" )
118
+ infer_string = get_option ("future.infer_string" )
119
+ if infer_string :
120
+ storage = "pyarrow_numpy"
121
+ else :
122
+ storage = get_option ("mode.string_storage" )
119
123
if storage not in {"python" , "pyarrow" , "pyarrow_numpy" }:
120
124
raise ValueError (
121
125
f"Storage must be 'python' or 'pyarrow'. Got { storage } instead."
Original file line number Diff line number Diff line change @@ -493,7 +493,8 @@ def use_inf_as_na_cb(key) -> None:
493
493
494
494
string_storage_doc = """
495
495
: string
496
- The default storage for StringDtype.
496
+ The default storage for StringDtype. This option is ignored if
497
+ ``future.infer_string`` is set to True.
497
498
"""
498
499
499
500
with cf .config_prefix ("mode" ):
Original file line number Diff line number Diff line change @@ -2115,6 +2115,14 @@ def test_series_string_inference_array_string_dtype(self):
2115
2115
ser = Series (np .array (["a" , "b" ]))
2116
2116
tm .assert_series_equal (ser , expected )
2117
2117
2118
+ def test_series_string_inference_storage_definition (self ):
2119
+ # GH#54793
2120
+ pytest .importorskip ("pyarrow" )
2121
+ expected = Series (["a" , "b" ], dtype = "string[pyarrow_numpy]" )
2122
+ with pd .option_context ("future.infer_string" , True ):
2123
+ result = Series (["a" , "b" ], dtype = "string" )
2124
+ tm .assert_series_equal (result , expected )
2125
+
2118
2126
2119
2127
class TestSeriesConstructorIndexCoercion :
2120
2128
def test_series_constructor_datetimelike_index_coercion (self ):
You can’t perform that action at this time.
0 commit comments