Merge pull request pandas-dev#5224 from daniel-ballan/redefine-match

jtratner · jtratner · commit 3ebd7699280e · 2013-10-30T22:13:01.000-04:00
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -960,6 +960,9 @@ importantly, these methods exclude missing/NA values automatically. These are
 accessed via the Series's ``str`` attribute and generally have names matching
 the equivalent (scalar) build-in string methods:
 
+Splitting and Replacing Strings
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 .. ipython:: python
 
    s = Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
@@ -990,11 +993,12 @@ Methods like ``replace`` and ``findall`` take regular expressions, too:
    s3
    s3.str.replace('^.a|dog', 'XX-XX ', case=False)
 
-The method ``match`` returns the groups in a regular expression in one tuple.
-Starting in pandas version 0.13.0, the method ``extract`` is available to
-accomplish this more conveniently.
+Extracting Substrings
+~~~~~~~~~~~~~~~~~~~~~
 
-Extracting a regular expression with one group returns a Series of strings.
+The method ``extract`` (introduced in version 0.13) accepts regular expressions
+with match groups. Extracting a regular expression with one group returns 
+a Series of strings.
 
 .. ipython:: python
 
@@ -1016,18 +1020,34 @@ Named groups like
 
 .. ipython:: python
 
-   Series(['a1', 'b2', 'c3']).str.match('(?P<letter>[ab])(?P<digit>\d)')
+   Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')
 
 and optional groups like
 
 .. ipython:: python
 
-   Series(['a1', 'b2', '3']).str.match('(?P<letter>[ab])?(?P<digit>\d)')
+   Series(['a1', 'b2', '3']).str.extract('(?P<letter>[ab])?(?P<digit>\d)')
 
 can also be used.
 
-Methods like ``contains``, ``startswith``, and ``endswith`` takes an extra
-``na`` arguement so missing values can be considered True or False:
+Testing for Strings that Match or Contain a Pattern 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In previous versions, *extracting* match groups was accomplished by ``match``,
+which returned a not-so-convenient Series of tuples. Starting in version 0.14,
+the default behavior of match will change. It will return a boolean
+indexer, analogous to the method ``contains``.
+
+The distinction between
+``match`` and ``contains`` is strictness: ``match`` relies on
+strict ``re.match`` while ``contains`` relies on ``re.search``.
+
+In version 0.13, ``match`` performs its old, deprecated behavior by default, 
+but the new behavior is available through the keyword argument
+``as_indexer=True``.
+
+Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
+an extra ``na`` argument so missing values can be considered True or False:
 
 .. ipython:: python
 
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -102,6 +102,14 @@ Deprecated in 0.13.0
 - deprecated ``iterkv``, which will be removed in a future release (this was
   an alias of iteritems used to bypass ``2to3``'s changes).
   (:issue:`4384`, :issue:`4375`, :issue:`4372`)
+- deprecated the string method ``match``, whose role is now performed more
+  idiomatically by ``extract``. In a future release, the default behavior
+  of ``match`` will change to become analogous to ``contains``, which returns
+  a boolean indexer. (Their
+  distinction is strictness: ``match`` relies on ``re.match`` while
+  ``contains`` relies on ``re.search``.) In this release, the deprecated
+  behavior is the default, but the new behavior is available through the
+  keyword argument ``as_indexer=True``.
 
 Indexing API Changes
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -7,7 +7,7 @@
 import pandas.compat as compat
 import re
 import pandas.lib as lib
-
+import warnings
 
 def _get_array_list(arr, others):
     if isinstance(others[0], (list, np.ndarray)):
@@ -169,6 +169,10 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan):
 
     regex = re.compile(pat, flags=flags)
 
+    if regex.groups > 0:
+        warnings.warn("This pattern has match groups. To actually get the"
+                      " groups, use str.extract.", UserWarning)
+
     f = lambda x: bool(regex.search(x))
     return _na_map(f, arr, na)
 
@@ -303,35 +307,70 @@ def rep(x, r):
         return result
 
 
-def str_match(arr, pat, flags=0):
+def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
     """
-    Find groups in each string (from beginning) using passed regular expression
+    Deprecated: Find groups in each string using passed regular expression.
+    If as_indexer=True, determine if each string matches a regular expression.
 
     Parameters
     ----------
     pat : string
-        Pattern or regular expression
+        Character sequence or regular expression
+    case : boolean, default True
+        If True, case sensitive
     flags : int, default 0 (no flags)
         re module flags, e.g. re.IGNORECASE
+    na : default NaN, fill value for missing values.
+    as_indexer : False, by default, gives deprecated behavior better achieved
+        using str_extract. True return boolean indexer.
+
 
     Returns
     -------
-    matches : array
+    matches : boolean array (if as_indexer=True)
+    matches : array of tuples (if as_indexer=False, default but deprecated)
+
+    Note
+    ----
+    To extract matched groups, which is the deprecated behavior of match, use
+    str.extract.
     """
+
+    if not case:
+        flags |= re.IGNORECASE
+
     regex = re.compile(pat, flags=flags)
 
-    def f(x):
-        m = regex.match(x)
-        if m:
-            return m.groups()
-        else:
-            return []
+    if (not as_indexer) and regex.groups > 0:
+        # Do this first, to make sure it happens even if the re.compile
+        # raises below.
+        warnings.warn("In future versions of pandas, match will change to"
+                      " always return a bool indexer.""", UserWarning)
+
+    if as_indexer and regex.groups > 0:
+        warnings.warn("This pattern has match groups. To actually get the"
+                      " groups, use str.extract.""", UserWarning)
+
+    # If not as_indexer and regex.groups == 0, this returns empty lists
+    # and is basically useless, so we will not warn.
+
+    if (not as_indexer) and regex.groups > 0:
+        def f(x):
+            m = regex.match(x)
+            if m:
+                return m.groups()
+            else:
+                return []
+    else:
+        # This is the new behavior of str_match.
+        f = lambda x: bool(regex.match(x))
 
     return _na_map(f, arr)
 
+
 def str_extract(arr, pat, flags=0):
     """
-    Find groups in each string (from beginning) using passed regular expression
+    Find groups in each string using passed regular expression
 
     Parameters
     ----------
@@ -358,7 +397,7 @@ def str_extract(arr, pat, flags=0):
         def f(x):
             if not isinstance(x, compat.string_types):
                 return None
-            m = regex.match(x)
+            m = regex.search(x)
             if m:
                 return m.groups()[0] # may be None
             else:
@@ -368,7 +407,7 @@ def f(x):
         def f(x):
             if not isinstance(x, compat.string_types):
                 return empty_row
-            m = regex.match(x)
+            m = regex.search(x)
             if m:
                 return Series(list(m.groups())) # may contain None
             else:
@@ -668,13 +707,13 @@ def wrapper(self):
     return wrapper
 
 
-def _pat_wrapper(f, flags=False, na=False):
+def _pat_wrapper(f, flags=False, na=False, **kwargs):
     def wrapper1(self, pat):
         result = f(self.series, pat)
         return self._wrap_result(result)
 
-    def wrapper2(self, pat, flags=0):
-        result = f(self.series, pat, flags=flags)
+    def wrapper2(self, pat, flags=0, **kwargs):
+        result = f(self.series, pat, flags=flags, **kwargs)
         return self._wrap_result(result)
 
     def wrapper3(self, pat, na=np.nan):
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -5,6 +5,7 @@
 import operator
 import re
 import unittest
+import warnings
 
 import nose
 
@@ -392,29 +393,66 @@ def test_repeat(self):
                       u('dddddd')])
         tm.assert_series_equal(result, exp)
 
-    def test_match(self):
+    def test_deprecated_match(self):
+        # Old match behavior, deprecated (but still default) in 0.13
         values = Series(['fooBAD__barBAD', NA, 'foo'])
 
-        result = values.str.match('.*(BAD[_]+).*(BAD)')
+        with tm.assert_produces_warning():
+            result = values.str.match('.*(BAD[_]+).*(BAD)')
         exp = Series([('BAD__', 'BAD'), NA, []])
         tm.assert_series_equal(result, exp)
 
         # mixed
         mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
                         'foo', None, 1, 2.])
 
-        rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
+        with tm.assert_produces_warning():
+            rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
         xp = [('BAD_', 'BAD'), NA, ('BAD_', 'BAD'), NA, NA, [], NA, NA, NA]
         tm.assert_isinstance(rs, Series)
         tm.assert_almost_equal(rs, xp)
 
         # unicode
         values = Series([u('fooBAD__barBAD'), NA, u('foo')])
 
-        result = values.str.match('.*(BAD[_]+).*(BAD)')
+        with tm.assert_produces_warning():
+            result = values.str.match('.*(BAD[_]+).*(BAD)')
         exp = Series([(u('BAD__'), u('BAD')), NA, []])
         tm.assert_series_equal(result, exp)
 
+    def test_match(self):
+        # New match behavior introduced in 0.13
+        values = Series(['fooBAD__barBAD', NA, 'foo'])
+        with tm.assert_produces_warning():
+            result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
+        exp = Series([True, NA, False])
+        tm.assert_series_equal(result, exp)
+
+        # If no groups, use new behavior even when as_indexer is False.
+        # (Old behavior is pretty much useless in this case.)
+        values = Series(['fooBAD__barBAD', NA, 'foo'])
+        result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
+        exp = Series([True, NA, False])
+        tm.assert_series_equal(result, exp)
+
+        # mixed
+        mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
+                        'foo', None, 1, 2.])
+
+        with tm.assert_produces_warning():
+            rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
+        xp = [True, NA, True, NA, NA, False, NA, NA, NA]
+        tm.assert_isinstance(rs, Series)
+        tm.assert_almost_equal(rs, xp)
+
+        # unicode
+        values = Series([u('fooBAD__barBAD'), NA, u('foo')])
+
+        with tm.assert_produces_warning():
+            result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
+        exp = Series([True, NA, False])
+        tm.assert_series_equal(result, exp)
+
     def test_extract(self):
         # Contains tests like those in test_match and some others.
 
@@ -966,7 +1004,10 @@ def test_match_findall_flags(self):
 
         pat = pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
 
-        result = data.str.match(pat, flags=re.IGNORECASE)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            result = data.str.match(pat, flags=re.IGNORECASE)
+            assert issubclass(w[-1].category, UserWarning)
         self.assertEquals(result[0], ('dave', 'google', 'com'))
 
         result = data.str.findall(pat, flags=re.IGNORECASE)
@@ -975,7 +1016,10 @@ def test_match_findall_flags(self):
         result = data.str.count(pat, flags=re.IGNORECASE)
         self.assertEquals(result[0], 1)
 
-        result = data.str.contains(pat, flags=re.IGNORECASE)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            result = data.str.contains(pat, flags=re.IGNORECASE)
+            assert issubclass(w[-1].category, UserWarning)
         self.assertEquals(result[0], True)
 
     def test_encode_decode(self):