ENH/CLN: Redefine str.match, and issue a warning on deprecated default behavior.

danielballan · danielballan · commit 75dd0f200179 · 2013-10-30T20:07:03.000-04:00
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -7,7 +7,7 @@
 import pandas.compat as compat
 import re
 import pandas.lib as lib
-
+import warnings
 
 def _get_array_list(arr, others):
     if isinstance(others[0], (list, np.ndarray)):
@@ -169,6 +169,10 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan):
 
     regex = re.compile(pat, flags=flags)
 
+    if regex.groups > 0:
+        warnings.warn("""This pattern has match groups. To actually get the 
+groups, use str.extract.""", UserWarning)
+
     f = lambda x: bool(regex.search(x))
     return _na_map(f, arr, na)
 
@@ -303,35 +307,70 @@ def rep(x, r):
         return result
 
 
-def str_match(arr, pat, flags=0):
+def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):
     """
-    Find groups in each string (from beginning) using passed regular expression
+    Deprecated: Find groups in each string using passed regular expression.
+    If as_indexer=True, determine if each string matches a regular expression.
 
     Parameters
     ----------
     pat : string
-        Pattern or regular expression
+        Character sequence or regular expression
+    case : boolean, default True
+        If True, case sensitive
     flags : int, default 0 (no flags)
         re module flags, e.g. re.IGNORECASE
+    na : default NaN, fill value for missing values.
+    as_indexer : False, by default, gives deprecated behavior better achieved
+        using str_extract. True return boolean indexer.
+
 
     Returns
     -------
-    matches : array
+    matches : boolean array (if as_indexer=True)
+    matches : array of tuples (if as_indexer=False, default but deprecated)
+    
+    Note
+    ----
+    To extract matched groups, which is the deprecated behavior of match, use
+    str.extract.
     """
+
+    if not case:
+        flags |= re.IGNORECASE
+
     regex = re.compile(pat, flags=flags)
 
-    def f(x):
-        m = regex.match(x)
-        if m:
-            return m.groups()
-        else:
-            return []
+    if (not as_indexer) and regex.groups > 0:
+        # Do this first, to make sure it happens even if the re.compile
+        # raises below.
+        warnings.warn("""In future versions of pandas, match will change 
+to always return a bool indexer.""", UserWarning)
+
+    if as_indexer and regex.groups > 0:
+        warnings.warn("""This pattern has match groups. To actually get the 
+groups, use str.extract.""", UserWarning)
+
+    # If not as_indexer and regex.groups == 0, this returns empty lists
+    # and is basically useless, so we will not warn.
+
+    if (not as_indexer) and regex.groups > 0:
+        def f(x):
+            m = regex.match(x)
+            if m:
+                return m.groups()
+            else:
+                return []
+    else:
+        # This is the new behavior of str_match.
+        f = lambda x: bool(regex.match(x))
 
     return _na_map(f, arr)
 
+
 def str_extract(arr, pat, flags=0):
     """
-    Find groups in each string (from beginning) using passed regular expression
+    Find groups in each string using passed regular expression
 
     Parameters
     ----------
@@ -358,7 +397,7 @@ def str_extract(arr, pat, flags=0):
         def f(x):
             if not isinstance(x, compat.string_types):
                 return None
-            m = regex.match(x)
+            m = regex.search(x)
             if m:
                 return m.groups()[0] # may be None
             else:
@@ -368,7 +407,7 @@ def f(x):
         def f(x):
             if not isinstance(x, compat.string_types):
                 return empty_row
-            m = regex.match(x)
+            m = regex.search(x)
             if m:
                 return Series(list(m.groups())) # may contain None
             else:
@@ -668,13 +707,13 @@ def wrapper(self):
     return wrapper
 
 
-def _pat_wrapper(f, flags=False, na=False):
+def _pat_wrapper(f, flags=False, na=False, **kwargs):
     def wrapper1(self, pat):
         result = f(self.series, pat)
         return self._wrap_result(result)
 
-    def wrapper2(self, pat, flags=0):
-        result = f(self.series, pat, flags=flags)
+    def wrapper2(self, pat, flags=0, **kwargs):
+        result = f(self.series, pat, flags=flags, **kwargs)
         return self._wrap_result(result)
 
     def wrapper3(self, pat, na=np.nan):
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -5,6 +5,7 @@
 import operator
 import re
 import unittest
+import warnings
 
 import nose
 
@@ -392,29 +393,78 @@ def test_repeat(self):
                       u('dddddd')])
         tm.assert_series_equal(result, exp)
 
-    def test_match(self):
+    def test_deprecated_match(self):
+        # Old match behavior, deprecated (but still default) in 0.13
         values = Series(['fooBAD__barBAD', NA, 'foo'])
 
-        result = values.str.match('.*(BAD[_]+).*(BAD)')
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            result = values.str.match('.*(BAD[_]+).*(BAD)')
+            assert issubclass(w[-1].category, UserWarning)
         exp = Series([('BAD__', 'BAD'), NA, []])
         tm.assert_series_equal(result, exp)
 
         # mixed
         mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
                         'foo', None, 1, 2.])
 
-        rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)')
+            assert issubclass(w[-1].category, UserWarning)
         xp = [('BAD_', 'BAD'), NA, ('BAD_', 'BAD'), NA, NA, [], NA, NA, NA]
         tm.assert_isinstance(rs, Series)
         tm.assert_almost_equal(rs, xp)
 
         # unicode
         values = Series([u('fooBAD__barBAD'), NA, u('foo')])
 
-        result = values.str.match('.*(BAD[_]+).*(BAD)')
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            result = values.str.match('.*(BAD[_]+).*(BAD)')
+            assert issubclass(w[-1].category, UserWarning)
         exp = Series([(u('BAD__'), u('BAD')), NA, []])
         tm.assert_series_equal(result, exp)
 
+    def test_match(self):
+        # New match behavior introduced in 0.13
+        values = Series(['fooBAD__barBAD', NA, 'foo'])
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
+            assert issubclass(w[-1].category, UserWarning)
+        exp = Series([True, NA, False])
+        tm.assert_series_equal(result, exp)
+
+        # If no groups, use new behavior even when as_indexer is False.
+        # (Old behavior is pretty much useless in this case.)
+        values = Series(['fooBAD__barBAD', NA, 'foo'])
+        result = values.str.match('.*BAD[_]+.*BAD', as_indexer=False)
+        exp = Series([True, NA, False])
+        tm.assert_series_equal(result, exp)
+
+        # mixed
+        mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
+                        'foo', None, 1, 2.])
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            rs = Series(mixed).str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
+            assert issubclass(w[-1].category, UserWarning)
+        xp = [True, NA, True, NA, NA, False, NA, NA, NA]
+        tm.assert_isinstance(rs, Series)
+        tm.assert_almost_equal(rs, xp)
+
+        # unicode
+        values = Series([u('fooBAD__barBAD'), NA, u('foo')])
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            result = values.str.match('.*(BAD[_]+).*(BAD)', as_indexer=True)
+            assert issubclass(w[-1].category, UserWarning)
+        exp = Series([True, NA, False])
+        tm.assert_series_equal(result, exp)
+
     def test_extract(self):
         # Contains tests like those in test_match and some others.
 
@@ -966,7 +1016,10 @@ def test_match_findall_flags(self):
 
         pat = pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
 
-        result = data.str.match(pat, flags=re.IGNORECASE)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            result = data.str.match(pat, flags=re.IGNORECASE)
+            assert issubclass(w[-1].category, UserWarning)
         self.assertEquals(result[0], ('dave', 'google', 'com'))
 
         result = data.str.findall(pat, flags=re.IGNORECASE)
@@ -975,7 +1028,10 @@ def test_match_findall_flags(self):
         result = data.str.count(pat, flags=re.IGNORECASE)
         self.assertEquals(result[0], 1)
 
-        result = data.str.contains(pat, flags=re.IGNORECASE)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            result = data.str.contains(pat, flags=re.IGNORECASE)
+            assert issubclass(w[-1].category, UserWarning)
         self.assertEquals(result[0], True)
 
     def test_encode_decode(self):