-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: str.replace accepts a compiled expression #15456
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,8 @@ | |
is_string_like, | ||
is_list_like, | ||
is_scalar, | ||
is_integer) | ||
is_integer, | ||
is_re) | ||
from pandas.core.common import _values_from_object | ||
|
||
from pandas.core.algorithms import take_1d | ||
|
@@ -303,16 +304,20 @@ def str_endswith(arr, pat, na=np.nan): | |
return _na_map(f, arr, na, dtype=bool) | ||
|
||
|
||
def str_replace(arr, pat, repl, n=-1, case=True, flags=0): | ||
def str_replace(arr, pat, repl, n=-1, case=None, flags=0): | ||
""" | ||
Replace occurrences of pattern/regex in the Series/Index with | ||
some other string. Equivalent to :meth:`str.replace` or | ||
:func:`re.sub`. | ||
|
||
Parameters | ||
---------- | ||
pat : string | ||
Character sequence or regular expression | ||
pat : string or compiled regex | ||
String can be a character sequence or regular expression. | ||
|
||
.. versionadded:: 0.20.0 | ||
`pat` also accepts a compiled regex. | ||
|
||
repl : string or callable | ||
Replacement string or a callable. The callable is passed the regex | ||
match object and must return a replacement string to be used. | ||
|
@@ -323,15 +328,24 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): | |
|
||
n : int, default -1 (all) | ||
Number of replacements to make from start | ||
case : boolean, default True | ||
If True, case sensitive | ||
case : boolean, default None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. another possibility here is to make There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like the idea, especially because of the The code is much simpler: if not case:
flags |= re.IGNORECASE
is_compiled_re = is_re(pat)
if is_compiled_re and flags:
raise ValueError("case and flags must be default values"
" when pat is a compiled regex")
use_re = is_compiled_re or len(pat) > 1 or flags or callable(repl) Here's the documentation I'd imagine works well with that. @jreback, can you please let me know what you think before I run the whole thing through again with the other changes above? case : boolean, default True (case sensitive)
Must be True if `pat` is a compiled regex There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i'll let joris comment but i think it should work There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know using such a sentinel is a way to go when you want to be able to distinguish None (a passed None as a value for the arg you want to catch), but this is not the case here. We can use None to distinguish a passed True or the default None as True. We don't need to be able to catch an explicitly passed None. So I don't understand why using As far as I see it, we just need to choose between those options:
Code-wise the first is the simplest. |
||
- If True, case sensitive (the default if `pat` is a string) | ||
- Set to False for case insensitive | ||
- Cannot be set if `pat` is a compiled regex | ||
flags : int, default 0 (no flags) | ||
re module flags, e.g. re.IGNORECASE | ||
- re module flags, e.g. re.IGNORECASE | ||
- Cannot be set if `pat` is a compiled regex | ||
|
||
Returns | ||
------- | ||
replaced : Series/Index of objects | ||
|
||
Notes | ||
----- | ||
When `pat` is a compiled regex, all flags should be included in the | ||
compiled regex. Use of `case` or `flags` with a compiled regex will | ||
raise an error. | ||
|
||
Examples | ||
-------- | ||
When `repl` is a string, every `pat` is replaced as with | ||
|
@@ -372,21 +386,42 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): | |
0 tWO | ||
1 bAR | ||
dtype: object | ||
|
||
Using a compiled regex with flags | ||
|
||
>>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) | ||
>>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar') | ||
0 foo | ||
1 bar | ||
2 NaN | ||
dtype: object | ||
""" | ||
|
||
# Check whether repl is valid (GH 13438, GH 15055) | ||
if not (is_string_like(repl) or callable(repl)): | ||
raise TypeError("repl must be a string or callable") | ||
use_re = not case or len(pat) > 1 or flags or callable(repl) | ||
|
||
if use_re: | ||
if not case: | ||
is_compiled_re = is_re(pat) | ||
if is_compiled_re: | ||
if (case is not None) or (flags != 0): | ||
raise ValueError("case and flags cannot be set" | ||
" when pat is a compiled regex") | ||
else: | ||
# not a compiled regex | ||
# set default case | ||
if case is None: | ||
case = True | ||
|
||
# add case flag, if provided | ||
if case is False: | ||
flags |= re.IGNORECASE | ||
regex = re.compile(pat, flags=flags) | ||
n = n if n >= 0 else 0 | ||
|
||
def f(x): | ||
return regex.sub(repl, x, count=n) | ||
use_re = is_compiled_re or len(pat) > 1 or flags or callable(repl) | ||
|
||
if use_re: | ||
n = n if n >= 0 else 0 | ||
regex = re.compile(pat, flags=flags) | ||
f = lambda x: regex.sub(repl=repl, string=x, count=n) | ||
else: | ||
f = lambda x: x.replace(pat, repl, n) | ||
|
||
|
@@ -1558,7 +1593,7 @@ def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=False): | |
return self._wrap_result(result) | ||
|
||
@copy(str_replace) | ||
def replace(self, pat, repl, n=-1, case=True, flags=0): | ||
def replace(self, pat, repl, n=-1, case=None, flags=0): | ||
result = str_replace(self._data, pat, repl, n=n, case=case, | ||
flags=flags) | ||
return self._wrap_result(result) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -469,6 +469,65 @@ def test_replace_callable(self): | |
exp = Series(['bAR', NA]) | ||
tm.assert_series_equal(result, exp) | ||
|
||
def test_replace_compiled_regex(self): | ||
# GH 15446 | ||
values = Series(['fooBAD__barBAD', NA]) | ||
|
||
# test with compiled regex | ||
pat = re.compile(r'BAD[_]*') | ||
result = values.str.replace(pat, '') | ||
exp = Series(['foobar', NA]) | ||
tm.assert_series_equal(result, exp) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so looks like this is failing one of the builds: https://circleci.com/gh/pandas-dev/pandas/188?utm_campaign=vcs-integration-link&utm_medium=referral&utm_source=github-build-link This is a build where we override LOCALE='C' You can test locally by adding
at the top of and running with 3.5 not entirely sure what is happening There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for tracing it out. I believe the issue is related to |
||
|
||
# mixed | ||
mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD', | ||
None, 1, 2.]) | ||
|
||
rs = Series(mixed).str.replace(pat, '') | ||
xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA]) | ||
tm.assertIsInstance(rs, Series) | ||
tm.assert_almost_equal(rs, xp) | ||
|
||
# unicode | ||
values = Series([u('fooBAD__barBAD'), NA]) | ||
|
||
result = values.str.replace(pat, '') | ||
exp = Series([u('foobar'), NA]) | ||
tm.assert_series_equal(result, exp) | ||
|
||
result = values.str.replace(pat, '', n=1) | ||
exp = Series([u('foobarBAD'), NA]) | ||
tm.assert_series_equal(result, exp) | ||
|
||
# flags + unicode | ||
values = Series([b"abcd,\xc3\xa0".decode("utf-8")]) | ||
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")]) | ||
pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) | ||
result = values.str.replace(pat, ", ") | ||
tm.assert_series_equal(result, exp) | ||
|
||
# case and flags provided to str.replace together with a compiled | ||
# regex are rejected with a ValueError | ||
values = Series(['fooBAD__barBAD__bad', NA]) | ||
pat = re.compile(r'BAD[_]*') | ||
|
||
with tm.assertRaisesRegexp(ValueError, "case and flags must be"): | ||
result = values.str.replace(pat, '', flags=re.IGNORECASE) | ||
|
||
with tm.assertRaisesRegexp(ValueError, "case and flags must be"): | ||
result = values.str.replace(pat, '', case=False) | ||
|
||
with tm.assertRaisesRegexp(ValueError, "case and flags must be"): | ||
result = values.str.replace(pat, '', case=True) | ||
|
||
# test with callable | ||
values = Series(['fooBAD__barBAD', NA]) | ||
repl = lambda m: m.group(0).swapcase() | ||
pat = re.compile('[a-z][A-Z]{2}') | ||
result = values.str.replace(pat, repl, n=2) | ||
exp = Series(['foObaD__baRbaD', NA]) | ||
tm.assert_series_equal(result, exp) | ||
|
||
def test_repeat(self): | ||
values = Series(['a', 'b', NA, 'c', NA, 'd']) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the signature needs to change for
case=None
and flags=None
(and then set them if not is_compiled_re and they are None)