File tree 4 files changed +20
-3
lines changed
4 files changed +20
-3
lines changed Original file line number Diff line number Diff line change @@ -39,11 +39,15 @@ We are collecting feedback on this decision `here <https://github.com/pandas-dev
39
39
Avoid NumPy object dtype for strings by default
40
40
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
41
41
42
- Previously, all strings were stored in columns with NumPy object dtype.
42
+ Previously, all strings were stored in columns with NumPy object dtype by default.
43
43
This release introduces an option ``future.infer_string`` that infers all
44
44
strings as PyArrow backed strings with dtype ``"string[pyarrow_numpy]"`` instead.
45
45
This is a new string dtype implementation that follows NumPy semantics in comparison
46
46
operations and will return ``np.nan`` as the missing value indicator.
47
+ Setting the option will also infer the dtype ``"string"`` as a :class:`StringDtype` with
48
+ storage set to ``"pyarrow_numpy"``, ignoring the value behind the option
49
+ ``mode.string_storage``.
50
+
47
51
This option only works if PyArrow is installed. PyArrow backed strings have a
48
52
significantly reduced memory footprint and provide a big performance improvement
49
53
compared to NumPy object (:issue:`54430`).
Original file line number Diff line number Diff line change @@ -112,7 +112,11 @@ def na_value(self) -> libmissing.NAType | float: # type: ignore[override]
112
112
113
113
def __init__ (self , storage = None ) -> None :
114
114
if storage is None :
115
- storage = get_option ("mode.string_storage" )
115
+ infer_string = get_option ("future.infer_string" )
116
+ if infer_string :
117
+ storage = "pyarrow_numpy"
118
+ else :
119
+ storage = get_option ("mode.string_storage" )
116
120
if storage not in {"python" , "pyarrow" , "pyarrow_numpy" }:
117
121
raise ValueError (
118
122
f"Storage must be 'python' or 'pyarrow'. Got { storage } instead."
Original file line number Diff line number Diff line change @@ -492,7 +492,8 @@ def use_inf_as_na_cb(key) -> None:
492
492
493
493
string_storage_doc = """
494
494
: string
495
- The default storage for StringDtype.
495
+ The default storage for StringDtype. This option is ignored if
496
+ ``future.infer_string`` is set to True.
496
497
"""
497
498
498
499
with cf .config_prefix ("mode" ):
Original file line number Diff line number Diff line change @@ -2115,6 +2115,14 @@ def test_series_string_inference_array_string_dtype(self):
2115
2115
ser = Series (np .array (["a" , "b" ]))
2116
2116
tm .assert_series_equal (ser , expected )
2117
2117
2118
+ def test_series_string_inference_storage_definition (self ):
2119
+ # GH#54793
2120
+ pytest .importorskip ("pyarrow" )
2121
+ expected = Series (["a" , "b" ], dtype = "string[pyarrow_numpy]" )
2122
+ with pd .option_context ("future.infer_string" , True ):
2123
+ result = Series (["a" , "b" ], dtype = "string" )
2124
+ tm .assert_series_equal (result , expected )
2125
+
2118
2126
2119
2127
class TestSeriesConstructorIndexCoercion :
2120
2128
def test_series_constructor_datetimelike_index_coercion (self ):
You can’t perform that action at this time.
0 commit comments