API: Deprecate regex=True default in Series.str.replace (pandas-dev#36695)

dsaxton · Kevin D Smith · commit 397252f165ed · 2020-11-02T08:51:46.000-06:00
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
@@ -255,7 +255,7 @@ i.e., from the end of the string to the beginning of the string:
 
    s2.str.rsplit("_", expand=True, n=1)
 
-``replace`` by default replaces `regular expressions
+``replace`` optionally uses `regular expressions
 <https://docs.python.org/3/library/re.html>`__:
 
 .. ipython:: python
@@ -265,35 +265,27 @@ i.e., from the end of the string to the beginning of the string:
        dtype="string",
    )
    s3
-   s3.str.replace("^.a|dog", "XX-XX ", case=False)
+   s3.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
 
-Some caution must be taken to keep regular expressions in mind! For example, the
-following code will cause trouble because of the regular expression meaning of
-``$``:
-
-.. ipython:: python
-
-   # Consider the following badly formatted financial data
-   dollars = pd.Series(["12", "-$10", "$10,000"], dtype="string")
-
-   # This does what you'd naively expect:
-   dollars.str.replace("$", "")
+.. warning::
 
-   # But this doesn't:
-   dollars.str.replace("-$", "-")
+    Some caution must be taken when dealing with regular expressions! The current behavior
+    is to treat single character patterns as literal strings, even when ``regex`` is set
+    to ``True``. This behavior is deprecated and will be removed in a future version so
+    that the ``regex`` keyword is always respected.
 
-   # We need to escape the special character (for >1 len patterns)
-   dollars.str.replace(r"-\$", "-")
+.. versionchanged:: 1.2.0
 
-If you do want literal replacement of a string (equivalent to
-:meth:`str.replace`), you can set the optional ``regex`` parameter to
-``False``, rather than escaping each character. In this case both ``pat``
-and ``repl`` must be strings:
+If you want literal replacement of a string (equivalent to :meth:`str.replace`), you
+can set the optional ``regex`` parameter to ``False``, rather than escaping each
+character. In this case both ``pat`` and ``repl`` must be strings:
 
 .. ipython:: python
 
+    dollars = pd.Series(["12", "-$10", "$10,000"], dtype="string")
+
     # These lines are equivalent
-    dollars.str.replace(r"-\$", "-")
+    dollars.str.replace(r"-\$", "-", regex=True)
     dollars.str.replace("-$", "-", regex=False)
 
 The ``replace`` method can also take a callable as replacement. It is called
@@ -310,7 +302,10 @@ positional argument (a regex object) and return a string.
        return m.group(0)[::-1]
 
 
-   pd.Series(["foo 123", "bar baz", np.nan], dtype="string").str.replace(pat, repl)
+   pd.Series(
+       ["foo 123", "bar baz", np.nan],
+       dtype="string"
+   ).str.replace(pat, repl, regex=True)
 
    # Using regex groups
    pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
@@ -320,7 +315,9 @@ positional argument (a regex object) and return a string.
        return m.group("two").swapcase()
 
 
-   pd.Series(["Foo Bar Baz", np.nan], dtype="string").str.replace(pat, repl)
+   pd.Series(["Foo Bar Baz", np.nan], dtype="string").str.replace(
+       pat, repl, regex=True
+   )
 
 The ``replace`` method also accepts a compiled regular expression object
 from :func:`re.compile` as a pattern. All flags should be included in the
@@ -331,7 +328,7 @@ compiled regular expression object.
    import re
 
    regex_pat = re.compile(r"^.a|dog", flags=re.IGNORECASE)
-   s3.str.replace(regex_pat, "XX-XX ")
+   s3.str.replace(regex_pat, "XX-XX ", regex=True)
 
 Including a ``flags`` argument when calling ``replace`` with a compiled
 regular expression object will raise a ``ValueError``.
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -287,6 +287,7 @@ Deprecations
 - Deprecated indexing :class:`DataFrame` rows with datetime-like strings ``df[string]``, use ``df.loc[string]`` instead (:issue:`36179`)
 - Deprecated casting an object-dtype index of ``datetime`` objects to :class:`DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`)
 - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`)
+- The default value of ``regex`` for :meth:`Series.str.replace` will change from ``True`` to ``False`` in a future release. In addition, single character regular expressions will *not* be treated as literal strings when ``regex=True`` is set (:issue:`24804`)
 - Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`)
 - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`)
 - Deprecated slice-indexing on timezone-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`)
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
@@ -483,7 +483,7 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str):
             var_name=j,
         )
         newdf[j] = Categorical(newdf[j])
-        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
+        newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)
 
         # GH17627 Cast numerics suffixes to int/float
         newdf[j] = to_numeric(newdf[j], errors="ignore")
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -1178,7 +1178,7 @@ def fullmatch(self, pat, case=True, flags=0, na=None):
         return self._wrap_result(result, fill_value=na, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
-    def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
+    def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None):
         r"""
         Replace each occurrence of pattern/regex in the Series/Index.
 
@@ -1296,6 +1296,20 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
         2    NaN
         dtype: object
         """
+        if regex is None:
+            if isinstance(pat, str) and any(c in pat for c in ".+*|^$?[](){}\\"):
+                # warn only in cases where regex behavior would differ from literal
+                msg = (
+                    "The default value of regex will change from True to False "
+                    "in a future version."
+                )
+                if len(pat) == 1:
+                    msg += (
+                        " In addition, single character regular expressions will "
+                        "*not* be treated as literal strings when regex=True."
+                    )
+                warnings.warn(msg, FutureWarning, stacklevel=3)
+            regex = True
         result = self._array._str_replace(
             pat, repl, n=n, case=case, flags=flags, regex=regex
         )
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
@@ -449,3 +449,14 @@ def test_replace_with_compiled_regex(self):
         result = s.replace({regex: "z"}, regex=True)
         expected = pd.Series(["z", "b", "c"])
         tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("pattern", ["^.$", "."])
+    def test_str_replace_regex_default_raises_warning(self, pattern):
+        # https://github.com/pandas-dev/pandas/pull/24809
+        s = pd.Series(["a", "b", "c"])
+        msg = r"The default value of regex will change from True to False"
+        if len(pattern) == 1:
+            msg += r".*single character regular expressions.*not.*literal strings"
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w:
+            s.str.replace(pattern, "")
+            assert re.match(msg, str(w[0].message))
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -984,11 +984,11 @@ def test_casemethods(self):
     def test_replace(self):
         values = Series(["fooBAD__barBAD", np.nan])
 
-        result = values.str.replace("BAD[_]*", "")
+        result = values.str.replace("BAD[_]*", "", regex=True)
         exp = Series(["foobar", np.nan])
         tm.assert_series_equal(result, exp)
 
-        result = values.str.replace("BAD[_]*", "", n=1)
+        result = values.str.replace("BAD[_]*", "", n=1, regex=True)
         exp = Series(["foobarBAD", np.nan])
         tm.assert_series_equal(result, exp)
 
@@ -997,15 +997,17 @@ def test_replace(self):
             ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
         )
 
-        rs = Series(mixed).str.replace("BAD[_]*", "")
+        rs = Series(mixed).str.replace("BAD[_]*", "", regex=True)
         xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
         assert isinstance(rs, Series)
         tm.assert_almost_equal(rs, xp)
 
         # flags + unicode
         values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
         exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
-        result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE)
+        result = values.str.replace(
+            r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True
+        )
         tm.assert_series_equal(result, exp)
 
         # GH 13438
@@ -1023,7 +1025,7 @@ def test_replace_callable(self):
 
         # test with callable
         repl = lambda m: m.group(0).swapcase()
-        result = values.str.replace("[a-z][A-Z]{2}", repl, n=2)
+        result = values.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
         exp = Series(["foObaD__baRbaD", np.nan])
         tm.assert_series_equal(result, exp)
 
@@ -1049,7 +1051,7 @@ def test_replace_callable(self):
         values = Series(["Foo Bar Baz", np.nan])
         pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
         repl = lambda m: m.group("middle").swapcase()
-        result = values.str.replace(pat, repl)
+        result = values.str.replace(pat, repl, regex=True)
         exp = Series(["bAR", np.nan])
         tm.assert_series_equal(result, exp)
 
@@ -1059,11 +1061,11 @@ def test_replace_compiled_regex(self):
 
         # test with compiled regex
         pat = re.compile(r"BAD[_]*")
-        result = values.str.replace(pat, "")
+        result = values.str.replace(pat, "", regex=True)
         exp = Series(["foobar", np.nan])
         tm.assert_series_equal(result, exp)
 
-        result = values.str.replace(pat, "", n=1)
+        result = values.str.replace(pat, "", n=1, regex=True)
         exp = Series(["foobarBAD", np.nan])
         tm.assert_series_equal(result, exp)
 
@@ -1072,7 +1074,7 @@ def test_replace_compiled_regex(self):
             ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
         )
 
-        rs = Series(mixed).str.replace(pat, "")
+        rs = Series(mixed).str.replace(pat, "", regex=True)
         xp = Series(["a", np.nan, "b", np.nan, np.nan, "foo", np.nan, np.nan, np.nan])
         assert isinstance(rs, Series)
         tm.assert_almost_equal(rs, xp)
@@ -1110,7 +1112,7 @@ def test_replace_literal(self):
         # GH16808 literal replace (regex=False vs regex=True)
         values = Series(["f.o", "foo", np.nan])
         exp = Series(["bao", "bao", np.nan])
-        result = values.str.replace("f.", "ba")
+        result = values.str.replace("f.", "ba", regex=True)
         tm.assert_series_equal(result, exp)
 
         exp = Series(["bao", "foo", np.nan])
@@ -3044,7 +3046,7 @@ def test_pipe_failures(self):
 
         tm.assert_series_equal(result, exp)
 
-        result = s.str.replace("|", " ")
+        result = s.str.replace("|", " ", regex=False)
         exp = Series(["A B C"])
 
         tm.assert_series_equal(result, exp)
@@ -3345,7 +3347,7 @@ def test_replace_moar(self):
         )
         tm.assert_series_equal(result, expected)
 
-        result = s.str.replace("^.a|dog", "XX-XX ", case=False)
+        result = s.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
         expected = Series(
             [
                 "A",

Original file line number	Diff line number	Diff line change
`@@ -483,7 +483,7 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str):`
`483`	`483`	`var_name=j,`
`484`	`484`	`)`
`485`	`485`	`newdf[j] = Categorical(newdf[j])`
`486`		`- newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")`
	`486`	`+ newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)`
`487`	`487`
`488`	`488`	`# GH17627 Cast numerics suffixes to int/float`
`489`	`489`	`newdf[j] = to_numeric(newdf[j], errors="ignore")`