Merge pull request #4696 from danielballan/str_extract

jreback · jreback · commit 351ba922f603 · 2013-09-20T07:37:39.000-07:00
ENH: Series.str.extract returns regex matches more conveniently
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -975,6 +975,42 @@ Methods like ``replace`` and ``findall`` take regular expressions, too:
    s3
    s3.str.replace('^.a|dog', 'XX-XX ', case=False)
 
+The method ``match`` returns the groups in a regular expression in one tuple.
+Starting in pandas version 0.13, the method ``extract`` is available to
+accomplish this more conveniently.
+
+Extracting a regular expression with one group returns a Series of strings.
+
+.. ipython:: python
+
+   Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)')
+
+Elements that do not match return ``NaN``. Extracting a regular expression 
+with more than one group returns a DataFrame with one column per group.
+
+.. ipython:: python
+
+   Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')
+
+Elements that do not match return a row filled with ``NaN``.
+Thus, a Series of messy strings can be "converted" into a 
+like-indexed Series or DataFrame of cleaned-up or more useful strings, 
+without necessitating ``get()`` to access tuples or ``re.match`` objects.
+
+Named groups like
+
+.. ipython:: python
+
+   Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')
+
+and optional groups like
+
+.. ipython:: python
+
+   Series(['a1', 'b2', '3']).str.extract('(?P<letter>[ab])?(?P<digit>\d)')
+
+can also be used.
+
 Methods like ``contains``, ``startswith``, and ``endswith`` takes an extra
 ``na`` arguement so missing values can be considered True or False:
 
@@ -1003,6 +1039,7 @@ Methods like ``contains``, ``startswith``, and ``endswith`` takes an extra
     ``endswidth``,Equivalent to ``str.endswith(pat)`` for each element
     ``findall``,Compute list of all occurrences of pattern/regex for each string
     ``match``,"Call ``re.match`` on each element, returning matched groups as list"
+    ``extract``,"Call ``re.match`` on each element, as ``match`` does, but return matched groups as strings for convenience."
     ``len``,Compute string lengths
     ``strip``,Equivalent to ``str.strip``
     ``rstrip``,Equivalent to ``str.rstrip``
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -59,6 +59,7 @@ New features
 
   - Added ``isin`` method to DataFrame (:issue:`4211`)
   - Clipboard functionality now works with PySide (:issue:`4282`)
+  - New ``extract`` string method returns regex matches more conveniently (:issue:`4685`)
 
 Improvements to existing features
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -297,6 +297,38 @@ Enhancements
     the bandwidth, and to gkde.evaluate() to specify the indicies at which it
     is evaluated, respecttively. See scipy docs.
   - DataFrame constructor now accepts a numpy masked record array (:issue:`3478`)
+  - The new vectorized string method ``extract`` returns regular expression
+    matches more conveniently.
+
+    .. ipython:: python
+
+       Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)')
+
+    Elements that do not match return ``NaN``. Extracting a regular expression
+    with more than one group returns a DataFrame with one column per group.
+
+
+    .. ipython:: python
+
+       Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')
+
+    Elements that do not match return a row filled with ``NaN``.
+    Thus, a Series of messy strings can be "converted" into a 
+    like-indexed Series or DataFrame of cleaned-up or more useful strings, 
+    without necessitating ``get()`` to access tuples or ``re.match`` objects.
+
+    Named groups like
+
+    .. ipython:: python
+
+       Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')
+
+    and optional groups like
+
+    .. ipython:: python
+
+       Series(['a1', 'b2', '3']).str.extract('(?P<letter>[ab])?(?P<digit>\d)')
+
+    can also be used.
 
 
 .. _whatsnew_0130.experimental:
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -3,6 +3,7 @@
 from pandas.compat import zip
 from pandas.core.common import isnull, _values_from_object
 from pandas.core.series import Series
+from pandas.core.frame import DataFrame
 import pandas.compat as compat
 import re
 import pandas.lib as lib
@@ -328,6 +329,59 @@ def f(x):
 
     return _na_map(f, arr)
 
+def str_extract(arr, pat, flags=0):
+    """
+    Extract regex capture groups from each string, matching from its beginning
+
+    Parameters
+    ----------
+    pat : string
+        Pattern or regular expression with at least one capture group
+    flags : int, default 0 (no flags)
+        re module flags, e.g. re.IGNORECASE
+
+    Returns
+    -------
+    extracted groups : Series (one group) or DataFrame (multiple groups)
+        NaN marks non-string elements, failed matches and unmatched groups
+
+    Raises
+    ------
+    ValueError
+        If pat contains no capture groups
+
+    Note
+    ----
+    Compare to the string method match, which returns the matched groups of
+    each element together in one tuple.
+    """
+    regex = re.compile(pat, flags=flags)
+
+    if regex.groups == 0:
+        raise ValueError("This pattern contains no groups to capture.")
+    elif regex.groups == 1:
+        def f(x):
+            if not isinstance(x, compat.string_types):
+                return None
+            m = regex.match(x)
+            return m.groups()[0] if m else None # the group itself may be None
+    else:
+        empty_row = Series(regex.groups*[None])
+        def f(x):
+            if not isinstance(x, compat.string_types):
+                return empty_row
+            m = regex.match(x)
+            return Series(list(m.groups())) if m else empty_row # may hold None
+    result = arr.apply(f)
+    result.replace({None: np.nan}, inplace=True)
+    # groupindex maps group name -> 1-based group number; invert it for lookup.
+    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
+    if regex.groups > 1:
+        result = DataFrame(result) # Don't rely on the wrapper; name columns.
+        result.columns = [names.get(1 + i, i) for i in range(regex.groups)]
+    else:
+        result.name = names.get(1) # name of the sole group; None if unnamed
+    return result
 
 def str_join(arr, sep):
     """
@@ -675,8 +729,12 @@ def __iter__(self):
             g = self.get(i)
 
     def _wrap_result(self, result):
-        return Series(result, index=self.series.index,
-                      name=self.series.name)
+        assert result.ndim < 3  # only 1-D and 2-D results can be wrapped
+        if result.ndim == 1:  # 1-D: Series with the original index and name
+            return Series(result, index=self.series.index,
+                          name=self.series.name)
+        else:  # 2-D: DataFrame on the original index (no columns passed here)
+            return DataFrame(result, index=self.series.index)
 
     @copy(str_cat)
     def cat(self, others=None, sep=None, na_rep=None):
@@ -764,6 +822,7 @@ def rstrip(self, to_strip=None):
     endswith = _pat_wrapper(str_endswith, na=True)
     findall = _pat_wrapper(str_findall, flags=True)
     match = _pat_wrapper(str_match, flags=True)
+    extract = _pat_wrapper(str_extract, flags=True)
 
     len = _noarg_wrapper(str_len)
     lower = _noarg_wrapper(str_lower)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -415,6 +415,94 @@ def test_match(self):
         exp = Series([(u('BAD__'), u('BAD')), NA, []])
         tm.assert_series_equal(result, exp)
 
+    def test_extract(self):
+        # Covers cases like those in test_match, plus extract-specific ones.
+
+        values = Series(['fooBAD__barBAD', NA, 'foo'])
+        er = [NA, NA] # empty row: expected wherever a two-group match fails
+
+        result = values.str.extract('.*(BAD[_]+).*(BAD)')
+        exp = DataFrame([['BAD__', 'BAD'], er, er])
+        tm.assert_frame_equal(result, exp)
+
+        # mixed: non-string elements are expected to give an empty row
+        mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
+                        'foo', None, 1, 2.])
+
+        rs = Series(mixed).str.extract('.*(BAD[_]+).*(BAD)')
+        exp = DataFrame([['BAD_', 'BAD'], er, ['BAD_', 'BAD'], er, er,
+                         er, er, er, er])
+        tm.assert_frame_equal(rs, exp)
+
+        # unicode
+        values = Series([u('fooBAD__barBAD'), NA, u('foo')])
+
+        result = values.str.extract('.*(BAD[_]+).*(BAD)')
+        exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
+        tm.assert_frame_equal(result, exp)
+
+        # no groups: there is nothing to extract, so ValueError is expected
+        s = Series(['A1', 'B2', 'C3']) 
+        f = lambda: s.str.extract('[ABC][123]')
+        self.assertRaises(ValueError, f)
+
+        # only non-capturing groups: also expected to raise ValueError
+        f = lambda: s.str.extract('(?:[AB]).*')
+        self.assertRaises(ValueError, f)
+
+        # one group, no matches: an all-NA Series
+        result = s.str.extract('(_)')
+        exp = Series([NA, NA, NA])
+        tm.assert_series_equal(result, exp)
+ 
+        # two groups, no matches: an all-NA DataFrame
+        result = s.str.extract('(_)(_)')
+        exp = DataFrame([[NA, NA], [NA, NA], [NA, NA]])
+        tm.assert_frame_equal(result, exp)
+
+        # one group, some matches
+        result = s.str.extract('([AB])[123]')
+        exp = Series(['A', 'B', NA])
+        tm.assert_series_equal(result, exp)
+
+        # two groups, some matches
+        result = s.str.extract('([AB])([123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
+        tm.assert_frame_equal(result, exp)
+
+        # named group/groups: names become the column labels / Series name
+        result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
+        tm.assert_frame_equal(result, exp)
+        result = s.str.extract('(?P<letter>[AB])')
+        exp = Series(['A', 'B', NA], name='letter')
+        tm.assert_series_equal(result, exp)
+
+        # mix named and unnamed groups: the unnamed column is labelled 0
+        result = s.str.extract('([AB])(?P<number>[123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=[0, 'number'])
+        tm.assert_frame_equal(result, exp)
+
+        # one normal group, one non-capturing group: still a Series
+        result = s.str.extract('([AB])(?:[123])')
+        exp = Series(['A', 'B', NA])
+        tm.assert_series_equal(result, exp)
+
+        # two normal groups, one non-capturing group: two columns only
+        result = Series(['A11', 'B22', 'C33']).str.extract('([AB])([123])(?:[123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
+        tm.assert_frame_equal(result, exp)
+
+        # one optional group followed by one normal group: unmatched group is NA
+        result = Series(['A1', 'B2', '3']).str.extract('(?P<letter>[AB])?(?P<number>[123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, '3']], columns=['letter', 'number'])
+        tm.assert_frame_equal(result, exp)
+
+        # one normal group followed by one optional group: unmatched group is NA
+        result = Series(['A1', 'B2', 'C']).str.extract('(?P<letter>[ABC])(?P<number>[123])?')
+        exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
+        tm.assert_frame_equal(result, exp)
+
+
     def test_join(self):
         values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
         result = values.str.split('_').str.join('_')