ENH: improve extract and get_dummies methods for Index.str (fix for #9980)

mortada · jreback · commit e68638778b62 · 2015-05-07T20:14:35.000-04:00
simplify str_extract(), pass name into _wrap_result()
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -40,6 +40,7 @@ Enhancements
      Timestamp('2014-08-01 16:30') + BusinessHour()
 
 - ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`)
+
 - Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
 
 - ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`)
@@ -216,6 +217,8 @@ enhancements are performed to make string operation easier.
      idx.str.startswith('a')
      s[s.index.str.startswith('a')]
 
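+- Improved ``extract`` and ``get_dummies`` methods for ``Index.str``: ``extract`` returns an ``Index`` for a single regex group and raises ``ValueError`` for multiple groups, and ``get_dummies`` raises ``TypeError`` because its result is a ``DataFrame`` (:issue:`9980`)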
+
 .. _whatsnew_0161.api:
 
 API changes
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -466,6 +466,7 @@ def str_extract(arr, pat, flags=0):
     """
     from pandas.core.series import Series
     from pandas.core.frame import DataFrame
+    from pandas.core.index import Index
 
     regex = re.compile(pat, flags=flags)
     # just to be safe, check this
@@ -481,11 +482,14 @@ def f(x):
             return [np.nan if item is None else item for item in m.groups()]
         else:
             return empty_row
+
     if regex.groups == 1:
-        result = Series([f(val)[0] for val in arr],
-                        name=_get_single_group_name(regex),
-                        index=arr.index, dtype=object)
+        result = np.array([f(val)[0] for val in arr], dtype=object)
+        name = _get_single_group_name(regex)
     else:
+        if isinstance(arr, Index):
+            raise ValueError("only one regex group is supported with Index")
+        name = None
         names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
         columns = [names.get(1 + i, i) for i in range(regex.groups)]
         if arr.empty:
@@ -495,7 +499,7 @@ def f(x):
                                columns=columns,
                                index=arr.index,
                                dtype=object)
-    return result
+    return result, name
 
 
 def str_get_dummies(arr, sep='|'):
@@ -531,6 +535,11 @@ def str_get_dummies(arr, sep='|'):
     pandas.get_dummies
     """
     from pandas.core.frame import DataFrame
+    from pandas.core.index import Index
+
+    # GH9980, Index.str does not support get_dummies() as it returns a frame
+    if isinstance(arr, Index):
+        raise TypeError("get_dummies is not supported for string methods on Index")
 
     # TODO remove this hack?
     arr = arr.fillna('')
@@ -991,7 +1000,7 @@ def __iter__(self):
             i += 1
             g = self.get(i)
 
-    def _wrap_result(self, result):
+    def _wrap_result(self, result, **kwargs):
         # leave as it is to keep extract and get_dummies results
         # can be merged to _wrap_result_expand in v0.17
         from pandas.core.series import Series
@@ -1000,16 +1009,16 @@ def _wrap_result(self, result):
 
         if not hasattr(result, 'ndim'):
             return result
-        elif result.ndim == 1:
-            name = getattr(result, 'name', None)
+        name = kwargs.get('name') or getattr(result, 'name', None) or self.series.name
+
+        if result.ndim == 1:
             if isinstance(self.series, Index):
                 # if result is a boolean np.array, return the np.array
                 # instead of wrapping it into a boolean Index (GH 8875)
                 if is_bool_dtype(result):
                     return result
-                return Index(result, name=name or self.series.name)
-            return Series(result, index=self.series.index,
-                          name=name or self.series.name)
+                return Index(result, name=name)
+            return Series(result, index=self.series.index, name=name)
         else:
             assert result.ndim < 3
             return DataFrame(result, index=self.series.index)
@@ -1257,7 +1266,11 @@ def get_dummies(self, sep='|'):
     startswith = _pat_wrapper(str_startswith, na=True)
     endswith = _pat_wrapper(str_endswith, na=True)
     findall = _pat_wrapper(str_findall, flags=True)
-    extract = _pat_wrapper(str_extract, flags=True)
+
+    @copy(str_extract)
+    def extract(self, pat, flags=0):
+        result, name = str_extract(self.series, pat, flags=flags)
+        return self._wrap_result(result, name=name)
 
     _shared_docs['find'] = ("""
     Return %(side)s indexes in each strings in the Series/Index
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -516,7 +516,6 @@ def test_match(self):
 
     def test_extract(self):
         # Contains tests like those in test_match and some others.
-
         values = Series(['fooBAD__barBAD', NA, 'foo'])
         er = [NA, NA]  # empty row
 
@@ -540,15 +539,31 @@ def test_extract(self):
         exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
         tm.assert_frame_equal(result, exp)
 
-        # no groups
-        s = Series(['A1', 'B2', 'C3'])
-        f = lambda: s.str.extract('[ABC][123]')
-        self.assertRaises(ValueError, f)
-
-        # only non-capturing groups
-        f = lambda: s.str.extract('(?:[AB]).*')
-        self.assertRaises(ValueError, f)
+        # GH9980
+        # Index only works with one regex group since
+        # multi-group would expand to a frame
+        idx = Index(['A1', 'A2', 'A3', 'A4', 'B5'])
+        with tm.assertRaisesRegexp(ValueError, "supported"):
+            idx.str.extract('([AB])([123])')
+
+        # these should work for both Series and Index
+        for klass in [Series, Index]:
+            # no groups
+            s_or_idx = klass(['A1', 'B2', 'C3'])
+            f = lambda: s_or_idx.str.extract('[ABC][123]')
+            self.assertRaises(ValueError, f)
+
+            # only non-capturing groups
+            f = lambda: s_or_idx.str.extract('(?:[AB]).*')
+            self.assertRaises(ValueError, f)
+
+            # single group renames series/index properly
+            s_or_idx = klass(['A1', 'A2'])
+            result = s_or_idx.str.extract(r'(?P<uno>A)\d')
+            tm.assert_equal(result.name, 'uno')
+            tm.assert_array_equal(result, klass(['A', 'A']))
 
+        s = Series(['A1', 'B2', 'C3'])
         # one group, no matches
         result = s.str.extract('(_)')
         exp = Series([NA, NA, NA], dtype=object)
@@ -569,14 +584,16 @@ def test_extract(self):
         exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
         tm.assert_frame_equal(result, exp)
 
-        # named group/groups
-        result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
-        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
-        tm.assert_frame_equal(result, exp)
+        # one named group
         result = s.str.extract('(?P<letter>[AB])')
         exp = Series(['A', 'B', NA], name='letter')
         tm.assert_series_equal(result, exp)
 
+        # two named groups
+        result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
+        tm.assert_frame_equal(result, exp)
+
         # mix named and unnamed groups
         result = s.str.extract('([AB])(?P<number>[123])')
         exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=[0, 'number'])
@@ -602,11 +619,6 @@ def test_extract(self):
         exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
         tm.assert_frame_equal(result, exp)
 
-        # single group renames series properly
-        s = Series(['A1', 'A2'])
-        result = s.str.extract(r'(?P<uno>A)\d')
-        tm.assert_equal(result.name, 'uno')
-
         # GH6348
         # not passing index to the extractor
         def check_index(index):
@@ -761,6 +773,12 @@ def test_get_dummies(self):
                              columns=list('7ab'))
         tm.assert_frame_equal(result, expected)
 
+        # GH9980
+        # Index.str does not support get_dummies() as it returns a frame
+        with tm.assertRaisesRegexp(TypeError, "not supported"):
+            idx = Index(['a|b', 'a|c', 'b|c'])
+            idx.str.get_dummies('|')
+
     def test_join(self):
         values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
         result = values.str.split('_').str.join('_')