Skip to content

Commit 0788de2

Browse files
API: change default behaviour of str.match from deprecated extract to match (GH5224)
1 parent fb7af6e commit 0788de2

File tree

4 files changed

+34
-94
lines changed

4 files changed

+34
-94
lines changed

doc/source/text.rst

-12
Original file line numberDiff line numberDiff line change
@@ -385,18 +385,6 @@ or match a pattern:
385385
The distinction between ``match`` and ``contains`` is strictness: ``match``
386386
relies on strict ``re.match``, while ``contains`` relies on ``re.search``.
387387

388-
.. warning::
389-
390-
In previous versions, ``match`` was for *extracting* groups,
391-
returning a not-so-convenient Series of tuples. The new method ``extract``
392-
(described in the previous section) is now preferred.
393-
394-
This old, deprecated behavior of ``match`` is still the default. As
395-
demonstrated above, use the new behavior by setting ``as_indexer=True``.
396-
In this mode, ``match`` is analogous to ``contains``, returning a boolean
397-
Series. The new behavior will become the default behavior in a future
398-
release.
399-
400388
Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
401389
an extra ``na`` argument so missing values can be considered True or False:
402390

doc/source/whatsnew/v0.20.0.txt

+5
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,11 @@ Other API Changes
729729
- ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`)
730730
- ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)
731731
- ``SparseDataFrame.default_fill_value`` will be 0, previously was ``nan`` in the return from ``pd.get_dummies(..., sparse=True)`` (:issue:`15594`)
732+
- The default behaviour of ``Series.str.match`` has changed from extracting
733+
groups to matching the pattern. The extracting behaviour has been deprecated
734+
since pandas version 0.13.0 and is now provided by the ``Series.str.extract``
735+
method (:issue:`5224`).
736+
732737

733738
.. _whatsnew_0200.deprecations:
734739

pandas/core/strings.py

+12-41
Original file line numberDiff line numberDiff line change
@@ -464,11 +464,9 @@ def rep(x, r):
464464
return result
465465

466466

467-
def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
467+
def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=None):
468468
"""
469-
Deprecated: Find groups in each string in the Series/Index
470-
using passed regular expression.
471-
If as_indexer=True, determine if each string matches a regular expression.
469+
Determine if each string matches a regular expression.
472470
473471
Parameters
474472
----------
@@ -479,60 +477,33 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
479477
flags : int, default 0 (no flags)
480478
re module flags, e.g. re.IGNORECASE
481479
na : default NaN, fill value for missing values.
482-
as_indexer : False, by default, gives deprecated behavior better achieved
483-
using str_extract. True return boolean indexer.
480+
as_indexer : ignored; kept for backward compatibility (a UserWarning is emitted if it is not None)
484481
485482
Returns
486483
-------
487484
Series/array of boolean values
488-
if as_indexer=True
489-
Series/Index of tuples
490-
if as_indexer=False, default but deprecated
491485
492486
See Also
493487
--------
494488
contains : analogous, but less strict, relying on re.search instead of
495489
re.match
496-
extract : now preferred to the deprecated usage of match (as_indexer=False)
490+
extract : extract matched groups
497491
498-
Notes
499-
-----
500-
To extract matched groups, which is the deprecated behavior of match, use
501-
str.extract.
502492
"""
503-
504493
if not case:
505494
flags |= re.IGNORECASE
506495

507496
regex = re.compile(pat, flags=flags)
508497

509-
if (not as_indexer) and regex.groups > 0:
510-
# Do this first, to make sure it happens even if the re.compile
511-
# raises below.
512-
warnings.warn("In future versions of pandas, match will change to"
513-
" always return a bool indexer.", FutureWarning,
514-
stacklevel=3)
515-
516-
if as_indexer and regex.groups > 0:
517-
warnings.warn("This pattern has match groups. To actually get the"
518-
" groups, use str.extract.", UserWarning, stacklevel=3)
498+
if as_indexer is not None:
499+
# Previously, this keyword was used for changing the default but
500+
# deprecated behaviour. This keyword is now no longer needed.
501+
warnings.warn("'as_indexer' keyword was specified but will be ignored;"
502+
" match now returns a boolean indexer by default.",
503+
UserWarning, stacklevel=3)
519504

520-
# If not as_indexer and regex.groups == 0, this returns empty lists
521-
# and is basically useless, so we will not warn.
522-
523-
if (not as_indexer) and regex.groups > 0:
524-
dtype = object
525-
526-
def f(x):
527-
m = regex.match(x)
528-
if m:
529-
return m.groups()
530-
else:
531-
return []
532-
else:
533-
# This is the new behavior of str_match.
534-
dtype = bool
535-
f = lambda x: bool(regex.match(x))
505+
dtype = bool
506+
f = lambda x: bool(regex.match(x))
536507

537508
return _na_map(f, arr, na, dtype=dtype)
538509

pandas/tests/test_strings.py

+17-41
Original file line numberDiff line numberDiff line change
@@ -559,64 +559,39 @@ def test_repeat(self):
559559
exp = Series([u('a'), u('bb'), NA, u('cccc'), NA, u('dddddd')])
560560
tm.assert_series_equal(result, exp)
561561

562-
def test_deprecated_match(self):
563-
# Old match behavior, deprecated (but still default) in 0.13
562+
def test_match(self):
563+
# New match behavior introduced in 0.13
564564
values = Series(['fooBAD__barBAD', NA, 'foo'])
565-
566-
with tm.assert_produces_warning():
567-
result = values.str.match('.*(BAD[_]+).*(BAD)')
568-
exp = Series([('BAD__', 'BAD'), NA, []])
569-
tm.assert_series_equal(result, exp)
570-
571-
# mixed
572-
mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
573-
'foo', None, 1, 2.])
574-
575-
with tm.assert_produces_warning():
576-
rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
577-
xp = Series([('BAD_', 'BAD'), NA, ('BAD_', 'BAD'),
578-
NA, NA, [], NA, NA, NA])
579-
tm.assertIsInstance(rs, Series)
580-
tm.assert_series_equal(rs, xp)
581-
582-
# unicode
583-
values = Series([u('fooBAD__barBAD'), NA, u('foo')])
584-
585-
with tm.assert_produces_warning():
586-
result = values.str.match('.*(BAD[_]+).*(BAD)')
587-
exp = Series([(u('BAD__'), u('BAD')), NA, []])
565+
result = values.str.match('.*(BAD[_]+).*(BAD)')
566+
exp = Series([True, NA, False])
588567
tm.assert_series_equal(result, exp)
589568

590-
def test_match(self):
591-
# New match behavior introduced in 0.13
592569
values = Series(['fooBAD__barBAD', NA, 'foo'])
593-
with tm.assert_produces_warning():
594-
result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
570+
result = values.str.match('.*BAD[_]+.*BAD')
595571
exp = Series([True, NA, False])
596572
tm.assert_series_equal(result, exp)
597573

598-
# If no groups, use new behavior even when as_indexer is False.
599-
# (Old behavior is pretty much useless in this case.)
574+
# test passing as_indexer still works but is ignored
600575
values = Series(['fooBAD__barBAD', NA, 'foo'])
601-
result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
602576
exp = Series([True, NA, False])
577+
with tm.assert_produces_warning(UserWarning):
578+
result = values.str.match('.*BAD[_]+.*BAD', as_indexer=True)
579+
tm.assert_series_equal(result, exp)
580+
with tm.assert_produces_warning(UserWarning):
581+
result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
603582
tm.assert_series_equal(result, exp)
604583

605584
# mixed
606585
mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
607586
'foo', None, 1, 2.])
608-
609-
with tm.assert_produces_warning():
610-
rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
587+
rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
611588
xp = Series([True, NA, True, NA, NA, False, NA, NA, NA])
612589
tm.assertIsInstance(rs, Series)
613590
tm.assert_series_equal(rs, xp)
614591

615592
# unicode
616593
values = Series([u('fooBAD__barBAD'), NA, u('foo')])
617-
618-
with tm.assert_produces_warning():
619-
result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
594+
result = values.str.match('.*(BAD[_]+).*(BAD)')
620595
exp = Series([True, NA, False])
621596
tm.assert_series_equal(result, exp)
622597

@@ -2610,11 +2585,12 @@ def test_match_findall_flags(self):
26102585

26112586
pat = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
26122587

2613-
with tm.assert_produces_warning(FutureWarning):
2614-
result = data.str.match(pat, flags=re.IGNORECASE)
2615-
2588+
result = data.str.extract(pat, flags=re.IGNORECASE)
26162589
self.assertEqual(result[0], ('dave', 'google', 'com'))
26172590

2591+
result = data.str.match(pat, flags=re.IGNORECASE)
2592+
self.assertEqual(result[0], True)
2593+
26182594
result = data.str.findall(pat, flags=re.IGNORECASE)
26192595
self.assertEqual(result[0][0], ('dave', 'google', 'com'))
26202596

0 commit comments

Comments
 (0)