diff --git a/doc/source/text.rst b/doc/source/text.rst index 52e05c5d511bc..2b2520cb6100f 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -164,6 +164,27 @@ positional argument (a regex object) and return a string. repl = lambda m: m.group('two').swapcase() pd.Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl) +The ``replace`` method also accepts a compiled regular expression object +from :func:`re.compile` as a pattern. All flags should be included in the +compiled regular expression object. + +.. versionadded:: 0.20.0 + +.. ipython:: python + + import re + regex_pat = re.compile(r'^.a|dog', flags=re.IGNORECASE) + s3.str.replace(regex_pat, 'XX-XX ') + +Including a ``flags`` argument when calling ``replace`` with a compiled +regular expression object will raise a ``ValueError``. + +.. ipython:: + + @verbatim + In [1]: s3.str.replace(regex_pat, 'XX-XX ', flags=re.IGNORECASE) + --------------------------------------------------------------------------- + ValueError: case and flags cannot be set when pat is a compiled regex Indexing with ``.str`` ---------------------- diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f51ff4cd0c908..a15eef0aa53a8 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -28,7 +28,8 @@ New features ~~~~~~~~~~~~ - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here <io.feather>`.
-- ``.str.replace`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) +- ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) +- ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ac8d1db6a0bf3..46ba48b4cd846 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -9,7 +9,8 @@ is_string_like, is_list_like, is_scalar, - is_integer) + is_integer, + is_re) from pandas.core.common import _values_from_object from pandas.core.algorithms import take_1d @@ -303,7 +304,7 @@ def str_endswith(arr, pat, na=np.nan): return _na_map(f, arr, na, dtype=bool) -def str_replace(arr, pat, repl, n=-1, case=True, flags=0): +def str_replace(arr, pat, repl, n=-1, case=None, flags=0): """ Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to :meth:`str.replace` or @@ -311,8 +312,12 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): Parameters ---------- - pat : string - Character sequence or regular expression + pat : string or compiled regex + String can be a character sequence or regular expression. + + .. versionadded:: 0.20.0 + `pat` also accepts a compiled regex. + repl : string or callable Replacement string or a callable. The callable is passed the regex match object and must return a replacement string to be used. @@ -323,15 +328,24 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): n : int, default -1 (all) Number of replacements to make from start - case : boolean, default True - If True, case sensitive + case : boolean, default None + - If True, case sensitive (the default if `pat` is a string) + - Set to False for case insensitive + - Cannot be set if `pat` is a compiled regex flags : int, default 0 (no flags) - re module flags, e.g. re.IGNORECASE + - re module flags, e.g. 
re.IGNORECASE + - Cannot be set if `pat` is a compiled regex Returns ------- replaced : Series/Index of objects + Notes + ----- + When `pat` is a compiled regex, all flags should be included in the + compiled regex. Use of `case` or `flags` with a compiled regex will + raise an error. + Examples -------- When `repl` is a string, every `pat` is replaced as with @@ -372,21 +386,42 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): 0 tWO 1 bAR dtype: object + + Using a compiled regex with flags + + >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar') + 0 foo + 1 bar + 2 NaN + dtype: object """ # Check whether repl is valid (GH 13438, GH 15055) if not (is_string_like(repl) or callable(repl)): raise TypeError("repl must be a string or callable") - use_re = not case or len(pat) > 1 or flags or callable(repl) - if use_re: - if not case: + is_compiled_re = is_re(pat) + if is_compiled_re: + if (case is not None) or (flags != 0): + raise ValueError("case and flags cannot be set" + " when pat is a compiled regex") + else: + # not a compiled regex + # set default case + if case is None: + case = True + + # add case flag, if provided + if case is False: flags |= re.IGNORECASE - regex = re.compile(pat, flags=flags) - n = n if n >= 0 else 0 - def f(x): - return regex.sub(repl, x, count=n) + use_re = is_compiled_re or len(pat) > 1 or flags or callable(repl) + + if use_re: + n = n if n >= 0 else 0 + regex = re.compile(pat, flags=flags) + f = lambda x: regex.sub(repl=repl, string=x, count=n) else: f = lambda x: x.replace(pat, repl, n) @@ -1558,7 +1593,7 @@ def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=False): return self._wrap_result(result) @copy(str_replace) - def replace(self, pat, repl, n=-1, case=True, flags=0): + def replace(self, pat, repl, n=-1, case=None, flags=0): result = str_replace(self._data, pat, repl, n=n, case=case, flags=flags) return self._wrap_result(result) diff 
--git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index ce97b09b7e3ca..f98cabbb70477 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -469,6 +469,65 @@ def test_replace_callable(self):
         exp = Series(['bAR', NA])
         tm.assert_series_equal(result, exp)
 
+    def test_replace_compiled_regex(self):
+        # GH 15446: str.replace accepts a compiled regex as `pat`
+        values = Series(['fooBAD__barBAD', NA])
+
+        # test with compiled regex
+        pat = re.compile(r'BAD[_]*')
+        result = values.str.replace(pat, '')
+        exp = Series(['foobar', NA])
+        tm.assert_series_equal(result, exp)
+
+        # mixed: non-string elements come back as NA
+        mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
+                        None, 1, 2.])
+
+        rs = Series(mixed).str.replace(pat, '')
+        xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
+        tm.assertIsInstance(rs, Series)
+        tm.assert_almost_equal(rs, xp)
+
+        # unicode
+        values = Series([u('fooBAD__barBAD'), NA])
+
+        result = values.str.replace(pat, '')
+        exp = Series([u('foobar'), NA])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.replace(pat, '', n=1)
+        exp = Series([u('foobarBAD'), NA])
+        tm.assert_series_equal(result, exp)
+
+        # flags + unicode: flags are carried by the compiled pattern itself
+        values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
+        exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
+        pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
+        result = values.str.replace(pat, ", ")
+        tm.assert_series_equal(result, exp)
+
+        # case and flags cannot be combined with a compiled regex;
+        # passing either one raises a ValueError
+        values = Series(['fooBAD__barBAD__bad', NA])
+        pat = re.compile(r'BAD[_]*')
+
+        with tm.assertRaisesRegexp(ValueError, "case and flags cannot be"):
+            result = values.str.replace(pat, '', flags=re.IGNORECASE)
+
+        with tm.assertRaisesRegexp(ValueError, "case and flags cannot be"):
+            result = values.str.replace(pat, '', case=False)
+
+        with tm.assertRaisesRegexp(ValueError, "case and flags cannot be"):
+            result = values.str.replace(pat, '', case=True)
+
+        # test with callable
+        values = Series(['fooBAD__barBAD', NA])
+        repl = lambda m: m.group(0).swapcase()
+        pat = re.compile('[a-z][A-Z]{2}')
+        result = values.str.replace(pat, repl, n=2)
+        exp = Series(['foObaD__baRbaD', NA])
+        tm.assert_series_equal(result, exp)
+
     def test_repeat(self):
         values = Series(['a', 'b', NA, 'c', NA, 'd'])
 