ENH: improve extract and get_dummies methods for Index.str (fix for #9980)

mortada · mortada · commit 11e3b5ab6617 · 2015-04-28T14:02:25.000-07:00
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -24,7 +24,6 @@ Enhancements
 
 - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`)
 - Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
-- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
 
   The ``.str`` accessor is now available for both ``Series`` and ``Index``.
 
@@ -46,6 +45,8 @@ Enhancements
      idx.str.startswith('a')
      s[s.index.str.startswith('a')]
 
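+- ``Index.str.extract`` now returns an ``Index`` when the pattern has a single capture group and raises ``ValueError`` for multiple groups; ``Index.str.get_dummies`` now raises ``TypeError``, because those results would have to be a ``DataFrame`` (:issue:`9980`)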
+- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
 - ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`)
 
 - ``drop`` function can now accept ``errors`` keyword to suppress ``ValueError`` raised when any of label does not exist in the target data. (:issue:`6736`)
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -435,12 +435,13 @@ def str_extract(arr, pat, flags=0):
     """
     from pandas.core.series import Series
     from pandas.core.frame import DataFrame
+    from pandas.core.index import Index
 
     regex = re.compile(pat, flags=flags)
     # just to be safe, check this
     if regex.groups == 0:
         raise ValueError("This pattern contains no groups to capture.")
-    empty_row = [np.nan]*regex.groups
+    empty_row = [np.nan] * regex.groups
     def f(x):
         if not isinstance(x, compat.string_types):
             return empty_row
@@ -449,11 +450,19 @@ def f(x):
             return [np.nan if item is None else item for item in m.groups()]
         else:
             return empty_row
+
     if regex.groups == 1:
-        result = Series([f(val)[0] for val in arr],
-                        name=_get_single_group_name(regex),
-                        index=arr.index, dtype=object)
+        if isinstance(arr, Index):
+            result = Index([f(val)[0] for val in arr],
+                           name=_get_single_group_name(regex),
+                           dtype=object)
+        else:
+            result = Series([f(val)[0] for val in arr],
+                            name=_get_single_group_name(regex),
+                            index=arr.index, dtype=object)
     else:
+        if isinstance(arr, Index):
+            raise ValueError("only one regex group is supported with Index")
         names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
         columns = [names.get(1 + i, i) for i in range(regex.groups)]
         if arr.empty:
@@ -488,6 +497,11 @@ def str_get_dummies(arr, sep='|'):
 
     """
     from pandas.core.frame import DataFrame
+    from pandas.core.index import Index
+
+    # GH9980, Index.str does not support get_dummies() as it returns a frame
+    if isinstance(arr, Index):
+        raise TypeError("get_dummies is not supported for string methods on Index")
 
     # TODO remove this hack?
     arr = arr.fillna('')
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -516,7 +516,6 @@ def test_match(self):
 
     def test_extract(self):
         # Contains tests like those in test_match and some others.
-
         values = Series(['fooBAD__barBAD', NA, 'foo'])
         er = [NA, NA]  # empty row
 
@@ -540,15 +539,31 @@ def test_extract(self):
         exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
         tm.assert_frame_equal(result, exp)
 
-        # no groups
-        s = Series(['A1', 'B2', 'C3'])
-        f = lambda: s.str.extract('[ABC][123]')
-        self.assertRaises(ValueError, f)
-
-        # only non-capturing groups
-        f = lambda: s.str.extract('(?:[AB]).*')
-        self.assertRaises(ValueError, f)
+        # GH9980
+        # Index only works with one regex group since
+        # multi-group would expand to a frame
+        idx = Index(['A1', 'A2', 'A3', 'A4', 'B5'])
+        with tm.assertRaisesRegexp(ValueError, "supported"):
+            idx.str.extract('([AB])([123])')
+
+        # these should work for both Series and Index
+        for klass in [Series, Index]:
+            # no groups
+            s_or_idx = klass(['A1', 'B2', 'C3'])
+            f = lambda: s_or_idx.str.extract('[ABC][123]')
+            self.assertRaises(ValueError, f)
+
+            # only non-capturing groups
+            f = lambda: s_or_idx.str.extract('(?:[AB]).*')
+            self.assertRaises(ValueError, f)
+
+            # single group renames series/index properly
+            s_or_idx = klass(['A1', 'A2'])
+            result = s_or_idx.str.extract(r'(?P<uno>A)\d')
+            tm.assert_equal(result.name, 'uno')
+            tm.assert_array_equal(result, klass(['A', 'A']))
 
+        s = Series(['A1', 'B2', 'C3'])
         # one group, no matches
         result = s.str.extract('(_)')
         exp = Series([NA, NA, NA], dtype=object)
@@ -569,14 +584,16 @@ def test_extract(self):
         exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
         tm.assert_frame_equal(result, exp)
 
-        # named group/groups
-        result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
-        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
-        tm.assert_frame_equal(result, exp)
+        # one named group
         result = s.str.extract('(?P<letter>[AB])')
         exp = Series(['A', 'B', NA], name='letter')
         tm.assert_series_equal(result, exp)
 
+        # two named groups
+        result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
+        exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
+        tm.assert_frame_equal(result, exp)
+
         # mix named and unnamed groups
         result = s.str.extract('([AB])(?P<number>[123])')
         exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=[0, 'number'])
@@ -602,11 +619,6 @@ def test_extract(self):
         exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
         tm.assert_frame_equal(result, exp)
 
-        # single group renames series properly
-        s = Series(['A1', 'A2'])
-        result = s.str.extract(r'(?P<uno>A)\d')
-        tm.assert_equal(result.name, 'uno')
-
         # GH6348
         # not passing index to the extractor
         def check_index(index):
@@ -752,6 +764,12 @@ def test_get_dummies(self):
                              columns=list('7ab'))
         tm.assert_frame_equal(result, expected)
 
+        # GH9980
+        # Index.str does not support get_dummies() as it returns a frame
+        with tm.assertRaisesRegexp(TypeError, "not supported"):
+            idx = Index(['a|b', 'a|c', 'b|c'])
+            idx.str.get_dummies('|')
+
     def test_join(self):
         values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
         result = values.str.split('_').str.join('_')