Skip to content

Commit f6a381e

Browse files
committed
ENH: str.replace accepts a compiled expression
- Series.str.replace now accepts a compiled regular expression for `pat`. - Signature for .str.replace changed, but remains backwards compatible. See #15446
1 parent d652485 commit f6a381e

File tree

4 files changed

+132
-16
lines changed

4 files changed

+132
-16
lines changed

doc/source/text.rst

+21
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,27 @@ positional argument (a regex object) and return a string.
164164
repl = lambda m: m.group('two').swapcase()
165165
pd.Series(['Foo Bar Baz', np.nan]).str.replace(pat, repl)
166166
167+
The ``replace`` method also accepts a compiled regular expression object
168+
from :func:`re.compile` as a pattern. All flags should be included in the
169+
compiled regular expression object.
170+
171+
.. versionadded:: 0.20.0
172+
173+
.. ipython:: python
174+
175+
import re
176+
regex_pat = re.compile(r'^.a|dog', flags=re.IGNORECASE)
177+
s3.str.replace(regex_pat, 'XX-XX ')
178+
179+
Passing a non-default ``case`` or ``flags`` argument when calling ``replace``
180+
with a compiled regular expression object will raise a ``ValueError``.
181+
182+
.. ipython::
183+
184+
@verbatim
185+
In [1]: s3.str.replace(regex_pat, 'XX-XX ', flags=re.IGNORECASE)
186+
---------------------------------------------------------------------------
187+
ValueError: case and flags must be default values when pat is a compiled regex
167188

168189
Indexing with ``.str``
169190
----------------------

doc/source/whatsnew/v0.20.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ New features
2828
~~~~~~~~~~~~
2929

3030
- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here <io.feather>`.
31-
- ``.str.replace`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`)
31+
- ``Series.str.replace()`` now accepts a callable as the replacement, which is passed to ``re.sub`` (:issue:`15055`)
32+
- ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`)
3233

3334

3435

pandas/core/strings.py

+50-15
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
is_string_like,
1010
is_list_like,
1111
is_scalar,
12-
is_integer)
12+
is_integer,
13+
is_re)
1314
from pandas.core.common import _values_from_object
1415

1516
from pandas.core.algorithms import take_1d
@@ -303,16 +304,20 @@ def str_endswith(arr, pat, na=np.nan):
303304
return _na_map(f, arr, na, dtype=bool)
304305

305306

306-
def str_replace(arr, pat, repl, n=-1, case=True, flags=0):
307+
def str_replace(arr, pat, repl, n=-1, case=None, flags=0):
307308
"""
308309
Replace occurrences of pattern/regex in the Series/Index with
309310
some other string. Equivalent to :meth:`str.replace` or
310311
:func:`re.sub`.
311312
312313
Parameters
313314
----------
314-
pat : string
315-
Character sequence or regular expression
315+
pat : string or compiled regex
316+
String can be a character sequence or regular expression.
317+
318+
.. versionadded:: 0.20.0
319+
`pat` also accepts a compiled regex.
320+
316321
repl : string or callable
317322
Replacement string or a callable. The callable is passed the regex
318323
match object and must return a replacement string to be used.
@@ -323,15 +328,24 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0):
323328
324329
n : int, default -1 (all)
325330
Number of replacements to make from start
326-
case : boolean, default True
327-
If True, case sensitive
331+
case : boolean, default None
332+
- If True, case sensitive
333+
- Defaults to True if `pat` is a string
334+
- Must be None if `pat` is a compiled regex
328335
flags : int, default 0 (no flags)
329-
re module flags, e.g. re.IGNORECASE
336+
- re module flags, e.g. re.IGNORECASE
337+
- Must be 0 if `pat` is a compiled regex
330338
331339
Returns
332340
-------
333341
replaced : Series/Index of objects
334342
343+
Notes
344+
-----
345+
When `pat` is a compiled regex, all flags should be included in the
346+
compiled regex. Use of `case` or `flags` with a compiled regex will
347+
raise a ``ValueError``.
348+
335349
Examples
336350
--------
337351
When `repl` is a string, every `pat` is replaced as with
@@ -372,21 +386,42 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0):
372386
0 tWO
373387
1 bAR
374388
dtype: object
389+
390+
Using a compiled regex with flags
391+
392+
>>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
393+
>>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
394+
0 foo
395+
1 bar
396+
2 NaN
397+
dtype: object
375398
"""
376399

377400
# Check whether repl is valid (GH 13438, GH 15055)
378401
if not (is_string_like(repl) or callable(repl)):
379402
raise TypeError("repl must be a string or callable")
380-
use_re = not case or len(pat) > 1 or flags or callable(repl)
381403

382-
if use_re:
383-
if not case:
404+
is_compiled_re = is_re(pat)
405+
if is_compiled_re:
406+
if (case is not None) or (flags != 0):
407+
raise ValueError("case and flags must be default values"
408+
" when pat is a compiled regex")
409+
else:
410+
# not a compiled regex
411+
# set default case
412+
if case is None:
413+
case = True
414+
415+
# add case flag, if provided
416+
if case is False:
384417
flags |= re.IGNORECASE
385-
regex = re.compile(pat, flags=flags)
386-
n = n if n >= 0 else 0
387418

388-
def f(x):
389-
return regex.sub(repl, x, count=n)
419+
use_re = is_compiled_re or len(pat) > 1 or flags or callable(repl)
420+
421+
if use_re:
422+
n = n if n >= 0 else 0
423+
f = lambda x: re.sub(pattern=pat, repl=repl, string=x,
424+
count=n, flags=flags)
390425
else:
391426
f = lambda x: x.replace(pat, repl, n)
392427

@@ -1558,7 +1593,7 @@ def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=False):
15581593
return self._wrap_result(result)
15591594

15601595
@copy(str_replace)
1561-
def replace(self, pat, repl, n=-1, case=True, flags=0):
1596+
def replace(self, pat, repl, n=-1, case=None, flags=0):
15621597
result = str_replace(self._data, pat, repl, n=n, case=case,
15631598
flags=flags)
15641599
return self._wrap_result(result)

pandas/tests/test_strings.py

+59
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,65 @@ def test_replace_callable(self):
469469
exp = Series(['bAR', NA])
470470
tm.assert_series_equal(result, exp)
471471

472+
def test_replace_compiled_regex(self):
473+
# GH 15446
474+
values = Series(['fooBAD__barBAD', NA])
475+
476+
# test with compiled regex
477+
pat = re.compile(r'BAD[_]*')
478+
result = values.str.replace(pat, '')
479+
exp = Series(['foobar', NA])
480+
tm.assert_series_equal(result, exp)
481+
482+
# mixed
483+
mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
484+
None, 1, 2.])
485+
486+
rs = Series(mixed).str.replace(pat, '')
487+
xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
488+
tm.assertIsInstance(rs, Series)
489+
tm.assert_almost_equal(rs, xp)
490+
491+
# unicode
492+
values = Series([u('fooBAD__barBAD'), NA])
493+
494+
result = values.str.replace(pat, '')
495+
exp = Series([u('foobar'), NA])
496+
tm.assert_series_equal(result, exp)
497+
498+
result = values.str.replace(pat, '', n=1)
499+
exp = Series([u('foobarBAD'), NA])
500+
tm.assert_series_equal(result, exp)
501+
502+
# flags + unicode
503+
values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
504+
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
505+
pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
506+
result = values.str.replace(pat, ", ")
507+
tm.assert_series_equal(result, exp)
508+
509+
# case and flags provided to str.replace together with a compiled
510+
# regex are rejected and will raise a ValueError
511+
values = Series(['fooBAD__barBAD__bad', NA])
512+
pat = re.compile(r'BAD[_]*')
513+
514+
with tm.assertRaisesRegexp(ValueError, "case and flags must be"):
515+
result = values.str.replace(pat, '', flags=re.IGNORECASE)
516+
517+
with tm.assertRaisesRegexp(ValueError, "case and flags must be"):
518+
result = values.str.replace(pat, '', case=False)
519+
520+
with tm.assertRaisesRegexp(ValueError, "case and flags must be"):
521+
result = values.str.replace(pat, '', case=True)
522+
523+
# test with callable
524+
values = Series(['fooBAD__barBAD', NA])
525+
repl = lambda m: m.group(0).swapcase()
526+
pat = re.compile('[a-z][A-Z]{2}')
527+
result = values.str.replace(pat, repl, n=2)
528+
exp = Series(['foObaD__baRbaD', NA])
529+
tm.assert_series_equal(result, exp)
530+
472531
def test_repeat(self):
473532
values = Series(['a', 'b', NA, 'c', NA, 'd'])
474533

0 commit comments

Comments
 (0)