diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index d350351075cb6..b4ae1d27df2b5 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -267,14 +267,16 @@ i.e., from the end of the string to the beginning of the string: s3 s3.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) -.. warning:: - Some caution must be taken when dealing with regular expressions! The current behavior - is to treat single character patterns as literal strings, even when ``regex`` is set - to ``True``. This behavior is deprecated and will be removed in a future version so - that the ``regex`` keyword is always respected. +.. versionchanged:: 2.0 + +Single-character patterns with ``regex=True`` are now also treated as regular expressions: + +.. ipython:: python -.. versionchanged:: 1.2.0 + s4 = pd.Series(["a.b", ".", "b", np.nan, ""], dtype="string") + s4 + s4.str.replace(".", "a", regex=True) If you want literal replacement of a string (equivalent to :meth:`str.replace`), you can set the optional ``regex`` parameter to ``False``, rather than escaping each diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d71160cdbc369..478cf234b0908 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -311,6 +311,7 @@ Removal of prior version deprecations/changes - Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`) - Changed behavior of :class:`Index`, :class:`Series`, :class:`DataFrame` constructors with floating-dtype data and a :class:`DatetimeTZDtype`, the data are now interpreted as UTC-times instead of wall-times, consistent with how integer-dtype data are treated (:issue:`45573`) - Removed the deprecated ``base`` and ``loffset`` arguments from :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` and :class:`pandas.Grouper`. 
Use ``offset`` or ``origin`` instead (:issue:`31809`) +- Changed the default value of the ``regex`` argument of :meth:`Series.str.replace` from ``True`` to ``False``. Additionally, a single-character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal (:issue:`36695`, :issue:`24804`) - Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included, manually cast to ``bool`` dtype first (:issue:`46188`) - Changed behavior of comparison of a :class:`Timestamp` with a ``datetime.date`` object; these now compare as un-equal and raise on inequality comparisons, matching the ``datetime.datetime`` behavior (:issue:`36131`) - Enforced deprecation of silently dropping columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a list or dictionary (:issue:`43740`) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 0024cbcb01bfc..71a50c69bfee1 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1323,7 +1323,7 @@ def replace( n: int = -1, case: bool | None = None, flags: int = 0, - regex: bool | None = None, + regex: bool = False, ): r""" Replace each occurrence of pattern/regex in the Series/Index. @@ -1351,7 +1351,7 @@ def replace( flags : int, default 0 (no flags) Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled regex. - regex : bool, default True + regex : bool, default False Determines if the passed-in pattern is a regular expression: - If True, assumes the passed-in pattern is a regular expression. @@ -1359,8 +1359,6 @@ def replace( - Cannot be set to False if `pat` is a compiled regex or `repl` is a callable. - .. 
versionadded:: 0.23.0 - Returns ------- Series or Index of object @@ -1444,20 +1442,6 @@ def replace( 2 NaN dtype: object """ - if regex is None: - if isinstance(pat, str) and any(c in pat for c in ".+*|^$?[](){}\\"): - # warn only in cases where regex behavior would differ from literal - msg = ( - "The default value of regex will change from True to False " - "in a future version." - ) - if len(pat) == 1: - msg += ( - " In addition, single character regular expressions will " - "*not* be treated as literal strings when regex=True." - ) - warnings.warn(msg, FutureWarning, stacklevel=find_stack_level()) - # Check whether repl is valid (GH 13438, GH 15055) if not (isinstance(repl, str) or callable(repl)): raise TypeError("repl must be a string or callable") @@ -1476,14 +1460,6 @@ def replace( elif callable(repl): raise ValueError("Cannot use a callable replacement when regex=False") - # The current behavior is to treat single character patterns as literal strings, - # even when ``regex`` is set to ``True``. 
- if isinstance(pat, str) and len(pat) == 1: - regex = False - - if regex is None: - regex = True - if case is None: case = True diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 59b41e0ec944a..6f6acb7a996b2 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -423,7 +423,7 @@ def test_replace_callable_raises(any_string_dtype, repl): with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" ): - values.str.replace("a", repl) + values.str.replace("a", repl, regex=True) def test_replace_callable_named_groups(any_string_dtype): @@ -477,7 +477,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype): with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" ): - result = ser.str.replace(pat, ", ") + result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -490,13 +490,13 @@ def test_replace_compiled_regex_raises(any_string_dtype): msg = "case and flags cannot be set when pat is a compiled regex" with pytest.raises(ValueError, match=msg): - ser.str.replace(pat, "", flags=re.IGNORECASE) + ser.str.replace(pat, "", flags=re.IGNORECASE, regex=True) with pytest.raises(ValueError, match=msg): - ser.str.replace(pat, "", case=False) + ser.str.replace(pat, "", case=False, regex=True) with pytest.raises(ValueError, match=msg): - ser.str.replace(pat, "", case=True) + ser.str.replace(pat, "", case=True, regex=True) def test_replace_compiled_regex_callable(any_string_dtype): @@ -507,7 +507,7 @@ def test_replace_compiled_regex_callable(any_string_dtype): with tm.maybe_produces_warning( PerformanceWarning, any_string_dtype == "string[pyarrow]" ): - result = ser.str.replace(pat, repl, n=2) + result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -617,48 +617,25 @@ 
def test_replace_not_case_sensitive_not_regex(any_string_dtype): tm.assert_series_equal(result, expected) -def test_replace_regex_default_warning(any_string_dtype): +def test_replace_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/pull/24809 s = Series(["a", "b", "ac", np.nan, ""], dtype=any_string_dtype) - msg = ( - "The default value of regex will change from True to False in a " - "future version\\.$" - ) - - with tm.assert_produces_warning( - FutureWarning, - match=msg, - raise_on_extra_warnings=any_string_dtype != "string[pyarrow]", - ): - result = s.str.replace("^.$", "a") + result = s.str.replace("^.$", "a", regex=True) expected = Series(["a", "a", "ac", np.nan, ""], dtype=any_string_dtype) tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("regex", [True, False, None]) +@pytest.mark.parametrize("regex", [True, False]) def test_replace_regex_single_character(regex, any_string_dtype): - # https://github.com/pandas-dev/pandas/pull/24809 - - # The current behavior is to treat single character patterns as literal strings, - # even when ``regex`` is set to ``True``. - + # https://github.com/pandas-dev/pandas/pull/24809, enforced in 2.0 + # GH 24804 s = Series(["a.b", ".", "b", np.nan, ""], dtype=any_string_dtype) - if regex is None: - msg = re.escape( - "The default value of regex will change from True to False in a future " - "version. In addition, single character regular expressions will *not* " - "be treated as literal strings when regex=True." 
- ) - with tm.assert_produces_warning( - FutureWarning, - match=msg, - ): - result = s.str.replace(".", "a", regex=regex) + result = s.str.replace(".", "a", regex=regex) + if regex: + expected = Series(["aaa", "a", "a", np.nan, ""], dtype=any_string_dtype) else: - result = s.str.replace(".", "a", regex=regex) - - expected = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype) + expected = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype) tm.assert_series_equal(result, expected)