Skip to content

Commit 07b2a9d

Browse files
committed
ENH: str.replace accepts a compiled regular expression
.str.replace now accepts a compiled regular expression. See pandas-dev#15446
1 parent e1d5407 commit 07b2a9d

File tree

3 files changed

+99
-11
lines changed

3 files changed

+99
-11
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ New features
2828

2929
- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here <io.feather>`.
3030
- ``.str.replace`` now accepts a callable as the replacement, which is passed to ``re.sub`` (:issue:`15055`)
31+
- ``.str.replace`` now accepts a compiled regular expression as the pattern (:issue:`15446`)
3132

3233

3334

pandas/core/strings.py

+38-11
Original file line numberDiff line numberDiff line change
@@ -311,22 +311,25 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0):
311311
312312
Parameters
313313
----------
314-
pat : string
315-
Character sequence or regular expression
314+
pat : string or compiled regex
315+
String can be a character sequence or regular expression. Also, a
316+
compiled regex may be used.
316317
repl : string or callable
317318
Replacement string or a callable. The callable is passed the regex
318319
match object and must return a replacement string to be used.
319320
See :func:`re.sub`.
320321
321322
.. versionadded:: 0.20.0
323+
`pat` also accepts a compiled regex.
322324
`repl` also accepts a callable.
323325
324326
n : int, default -1 (all)
325327
Number of replacements to make from start
326328
case : boolean, default True
327-
If True, case sensitive
329+
If True, case sensitive. Ignored if `pat` is a compiled regex.
328330
flags : int, default 0 (no flags)
329-
re module flags, e.g. re.IGNORECASE
331+
re module flags, e.g. re.IGNORECASE. Ignored if `pat` is a compiled
332+
regex.
330333
331334
Returns
332335
-------
@@ -372,27 +375,51 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0):
372375
0 tWO
373376
1 bAR
374377
dtype: object
378+
379+
When `pat` is a compiled regex, all flags should be included in the
380+
compiled regex. `case` and `flags` are otherwise ignored.
381+
382+
>>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
383+
>>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
384+
0 foo
385+
1 bar
386+
2 NaN
387+
dtype: object
375388
"""
376389

377390
# Check whether repl is valid (GH 13438, GH 15055)
378391
if not (is_string_like(repl) or callable(repl)):
379392
raise TypeError("repl must be a string or callable")
380-
use_re = not case or len(pat) > 1 or flags or callable(repl)
381-
382-
if use_re:
393+
# Check whether pat is a compiled regex or should be compiled
394+
is_re = isinstance(pat, type(re.compile('')))
395+
build_re = not is_re and (
396+
not case or len(pat) > 1 or flags or callable(repl))
397+
398+
if is_re:
399+
if not case or flags:
400+
warnings.warn(
401+
"case and flags are ignored when providing a compiled regex.",
402+
UserWarning, stacklevel=3)
403+
f = _str_replace_regex_func(pat, repl, n)
404+
elif build_re:
383405
if not case:
384406
flags |= re.IGNORECASE
385407
regex = re.compile(pat, flags=flags)
386-
n = n if n >= 0 else 0
387-
388-
def f(x):
389-
return regex.sub(repl, x, count=n)
408+
f = _str_replace_regex_func(regex, repl, n)
390409
else:
391410
f = lambda x: x.replace(pat, repl, n)
392411

393412
return _na_map(f, arr)
394413

395414

415+
def _str_replace_regex_func(regex, repl, n=-1):
    """
    Build a one-argument function that applies ``regex.sub`` to a string.

    Parameters
    ----------
    regex : compiled regex
        Object whose ``sub`` method performs the substitution.
    repl : string or callable
        Replacement, passed through unchanged to ``regex.sub``.
    n : int, default -1 (all)
        Maximum number of replacements. Negative values are mapped to 0,
        which ``re`` interprets as "replace every occurrence".

    Returns
    -------
    callable
        Function taking a single string and returning the substituted
        result.
    """
    # ``re`` spells "no limit" as count=0, while the default here is -1.
    if n >= 0:
        count = n
    else:
        count = 0

    def substitute(value):
        return regex.sub(repl, value, count=count)

    return substitute
421+
422+
396423
def str_repeat(arr, repeats):
397424
"""
398425
Duplicate each string in the Series/Index by indicated number

pandas/tests/test_strings.py

+60
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,66 @@ def test_replace_callable(self):
469469
exp = Series(['bAR', NA])
470470
tm.assert_series_equal(result, exp)
471471

472+
def test_replace_regex(self):
    """``str.replace`` accepts a compiled regex as ``pat`` (GH 15446)."""
    # GH 15446
    values = Series(['fooBAD__barBAD', NA])

    # test with compiled regex
    pat = re.compile(r'BAD[_]*')
    result = values.str.replace(pat, '')
    exp = Series(['foobar', NA])
    tm.assert_series_equal(result, exp)

    # mixed: non-string entries come back as NA
    mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
                    None, 1, 2.])

    rs = mixed.str.replace(pat, '')
    xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
    tm.assertIsInstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u('fooBAD__barBAD'), NA])

    result = values.str.replace(pat, '')
    exp = Series([u('foobar'), NA])
    tm.assert_series_equal(result, exp)

    result = values.str.replace(pat, '', n=1)
    exp = Series([u('foobarBAD'), NA])
    tm.assert_series_equal(result, exp)

    # flags + unicode
    values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
    exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
    pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
    result = values.str.replace(pat, ", ")
    tm.assert_series_equal(result, exp)

    # case and flags provided to str.replace will have no effect
    # and will produce warnings
    values = Series(['fooBAD__barBAD__bad', NA])
    pat = re.compile(r'BAD[_]*')

    # Pin the category: the default (any Warning) would also pass on an
    # unrelated warning raised somewhere along the call path.
    with tm.assert_produces_warning(UserWarning):
        result = values.str.replace(pat, '', flags=re.IGNORECASE)
    exp = Series(['foobarbad', NA])
    tm.assert_series_equal(result, exp)

    with tm.assert_produces_warning(UserWarning):
        result = values.str.replace(pat, '', case=False)
    exp = Series(['foobarbad', NA])
    tm.assert_series_equal(result, exp)

    # test with callable
    values = Series(['fooBAD__barBAD', NA])
    repl = lambda m: m.group(0).swapcase()
    pat = re.compile(r'[a-z][A-Z]{2}')
    result = values.str.replace(pat, repl, n=2)
    exp = Series(['foObaD__baRbaD', NA])
    tm.assert_series_equal(result, exp)
531+
472532
def test_repeat(self):
473533
values = Series(['a', 'b', NA, 'c', NA, 'd'])
474534

0 commit comments

Comments
 (0)