Skip to content

Commit 75dd0f2

Browse files
committed
ENH/CLN: Redefine str.match, and issue a warning on deprecated default behavior.
1 parent d686154 commit 75dd0f2

File tree

2 files changed

+118
-23
lines changed

2 files changed

+118
-23
lines changed

pandas/core/strings.py

+56-17
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pandas.compat as compat
88
import re
99
import pandas.lib as lib
10-
10+
import warnings
1111

1212
def _get_array_list(arr, others):
1313
if isinstance(others[0], (list, np.ndarray)):
@@ -169,6 +169,10 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan):
169169

170170
regex = re.compile(pat, flags=flags)
171171

172+
if regex.groups > 0:
173+
warnings.warn("""This pattern has match groups. To actually get the
174+
groups, use str.extract.""", UserWarning)
175+
172176
f = lambda x: bool(regex.search(x))
173177
return _na_map(f, arr, na)
174178

@@ -303,35 +307,70 @@ def rep(x, r):
303307
return result
304308

305309

306-
def str_match(arr, pat, flags=0):
310+
def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
    """
    Deprecated: Find groups in each string using passed regular expression.
    If as_indexer=True, determine if each string matches a regular expression.

    Parameters
    ----------
    arr : array or Series
        Values to match; handed to _na_map together with the matcher.
    pat : string
        Character sequence or regular expression
    case : boolean, default True
        If True, case sensitive
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE
    na : default NaN
        Fill value for missing values.
    as_indexer : boolean, default False
        False gives the deprecated behavior (better achieved using
        str_extract) when the pattern has groups. True returns a boolean
        indexer.

    Returns
    -------
    matches : boolean array (if as_indexer=True, or if pat has no groups)
    matches : array of tuples (if as_indexer=False and pat has groups;
        default but deprecated)

    Notes
    -----
    To extract matched groups, which is the deprecated behavior of match, use
    str.extract.
    """
    if not case:
        flags |= re.IGNORECASE

    regex = re.compile(pat, flags=flags)
    has_groups = regex.groups > 0

    if has_groups and not as_indexer:
        # Deprecated path: still returns the matched groups, but warn that
        # the return type is going to change.
        warnings.warn("""In future versions of pandas, match will change
to always return a bool indexer.""", UserWarning)

        def f(x):
            m = regex.match(x)
            if m:
                return m.groups()
            else:
                return []
    else:
        # New behavior of str_match: a boolean per element. It is also used
        # when as_indexer is False but the pattern has no groups, because the
        # old behavior would only produce empty results there (no warning in
        # that case).
        if has_groups:
            warnings.warn("""This pattern has match groups. To actually get the
groups, use str.extract.""", UserWarning)

        f = lambda x: bool(regex.match(x))

    # Forward the caller's fill value; previously `na` was accepted but
    # silently dropped here.
    return _na_map(f, arr, na)
331369

370+
332371
def str_extract(arr, pat, flags=0):
333372
"""
334-
Find groups in each string (from beginning) using passed regular expression
373+
Find groups in each string using passed regular expression
335374
336375
Parameters
337376
----------
@@ -358,7 +397,7 @@ def str_extract(arr, pat, flags=0):
358397
def f(x):
359398
if not isinstance(x, compat.string_types):
360399
return None
361-
m = regex.match(x)
400+
m = regex.search(x)
362401
if m:
363402
return m.groups()[0] # may be None
364403
else:
@@ -368,7 +407,7 @@ def f(x):
368407
def f(x):
369408
if not isinstance(x, compat.string_types):
370409
return empty_row
371-
m = regex.match(x)
410+
m = regex.search(x)
372411
if m:
373412
return Series(list(m.groups())) # may contain None
374413
else:
@@ -668,13 +707,13 @@ def wrapper(self):
668707
return wrapper
669708

670709

671-
def _pat_wrapper(f, flags=False, na=False):
710+
def _pat_wrapper(f, flags=False, na=False, **kwargs):
672711
def wrapper1(self, pat):
673712
result = f(self.series, pat)
674713
return self._wrap_result(result)
675714

676-
def wrapper2(self, pat, flags=0):
677-
result = f(self.series, pat, flags=flags)
715+
def wrapper2(self, pat, flags=0, **kwargs):
716+
result = f(self.series, pat, flags=flags, **kwargs)
678717
return self._wrap_result(result)
679718

680719
def wrapper3(self, pat, na=np.nan):

pandas/tests/test_strings.py

+62-6
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import operator
66
import re
77
import unittest
8+
import warnings
89

910
import nose
1011

@@ -392,29 +393,78 @@ def test_repeat(self):
392393
u('dddddd')])
393394
tm.assert_series_equal(result, exp)
394395

395-
def test_match(self):
396+
def test_deprecated_match(self):
    # Old match behavior, deprecated (but still default) in 0.13: a pattern
    # with groups returns the matched groups and must emit a UserWarning.
    pattern = '.*(BAD[_]+).*(BAD)'

    def match_expecting_warning(series):
        # Run str.match with every warning recorded and check that the most
        # recent one is a UserWarning.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter('always')
            matched = series.str.match(pattern)
            assert issubclass(caught[-1].category, UserWarning)
        return matched

    values = Series(['fooBAD__barBAD', NA, 'foo'])
    result = match_expecting_warning(values)
    exp = Series([('BAD__', 'BAD'), NA, []])
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
                    'foo', None, 1, 2.])
    rs = match_expecting_warning(Series(mixed))
    xp = [('BAD_', 'BAD'), NA, ('BAD_', 'BAD'), NA, NA, [], NA, NA, NA]
    tm.assert_isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u('fooBAD__barBAD'), NA, u('foo')])
    result = match_expecting_warning(values)
    exp = Series([(u('BAD__'), u('BAD')), NA, []])
    tm.assert_series_equal(result, exp)
417428

429+
def test_match(self):
    # New match behavior introduced in 0.13: as_indexer=True yields a
    # boolean result, and a pattern with groups still emits a UserWarning.
    grouped = '.*(BAD[_]+).*(BAD)'

    def indexer_expecting_warning(series):
        # Run str.match(as_indexer=True) with every warning recorded and
        # check that the most recent one is a UserWarning.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter('always')
            matched = series.str.match(grouped, as_indexer=True)
            assert issubclass(caught[-1].category, UserWarning)
        return matched

    values = Series(['fooBAD__barBAD', NA, 'foo'])
    result = indexer_expecting_warning(values)
    exp = Series([True, NA, False])
    tm.assert_series_equal(result, exp)

    # If no groups, use new behavior even when as_indexer is False.
    # (Old behavior is pretty much useless in this case.)
    values = Series(['fooBAD__barBAD', NA, 'foo'])
    result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
    exp = Series([True, NA, False])
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
                    'foo', None, 1, 2.])
    rs = indexer_expecting_warning(Series(mixed))
    xp = [True, NA, True, NA, NA, False, NA, NA, NA]
    tm.assert_isinstance(rs, Series)
    tm.assert_almost_equal(rs, xp)

    # unicode
    values = Series([u('fooBAD__barBAD'), NA, u('foo')])
    result = indexer_expecting_warning(values)
    exp = Series([True, NA, False])
    tm.assert_series_equal(result, exp)
467+
418468
def test_extract(self):
419469
# Contains tests like those in test_match and some others.
420470

@@ -966,7 +1016,10 @@ def test_match_findall_flags(self):
9661016

9671017
pat = pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
9681018

969-
result = data.str.match(pat, flags=re.IGNORECASE)
1019+
with warnings.catch_warnings(record=True) as w:
1020+
warnings.simplefilter('always')
1021+
result = data.str.match(pat, flags=re.IGNORECASE)
1022+
assert issubclass(w[-1].category, UserWarning)
9701023
self.assertEquals(result[0], ('dave', 'google', 'com'))
9711024

9721025
result = data.str.findall(pat, flags=re.IGNORECASE)
@@ -975,7 +1028,10 @@ def test_match_findall_flags(self):
9751028
result = data.str.count(pat, flags=re.IGNORECASE)
9761029
self.assertEquals(result[0], 1)
9771030

978-
result = data.str.contains(pat, flags=re.IGNORECASE)
1031+
with warnings.catch_warnings(record=True) as w:
1032+
warnings.simplefilter('always')
1033+
result = data.str.contains(pat, flags=re.IGNORECASE)
1034+
assert issubclass(w[-1].category, UserWarning)
9791035
self.assertEquals(result[0], True)
9801036

9811037
def test_encode_decode(self):

0 commit comments

Comments
 (0)