ENH: add StringMethods (.str accessor) to Index, fixes #9068

mortada · mortada · commit f98bcb8b0ccf · 2015-04-10T00:26:20.000-07:00
diff --git a/doc/source/text.rst b/doc/source/text.rst
@@ -17,10 +17,10 @@ Working with Text Data
 
 .. _text.string_methods:
 
-Series is equipped with a set of string processing methods
+Series and Index are equipped with a set of string processing methods
 that make it easy to operate on each element of the array. Perhaps most
 importantly, these methods exclude missing/NA values automatically. These are
-accessed via the Series's ``str`` attribute and generally have names matching
+accessed via the ``str`` attribute and generally have names matching
 the equivalent (scalar) built-in string methods:
 
 .. ipython:: python
@@ -30,6 +30,13 @@ the equivalent (scalar) built-in string methods:
    s.str.upper()
    s.str.len()
 
+.. ipython:: python
+
+   idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
+   idx.str.strip()
+   idx.str.lstrip()
+   idx.str.rstrip()
+
 Splitting and Replacing Strings
 -------------------------------
 
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -18,11 +18,28 @@ Enhancements
 ~~~~~~~~~~~~
 
 - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`)
+- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
 
-- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`)
+  The ``.str`` accessor is now available for both ``Series`` and ``Index``.
+
+  .. ipython:: python
 
+     idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
+     idx.str.strip()
 
+  One special case for the ``.str`` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor
+  will return a ``np.ndarray`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression
+  to work naturally:
 
+  .. ipython:: python
+
+     idx = Index(['a1', 'a2', 'b1', 'b2'])
+     s = Series(range(4), index=idx)
+     s
+     idx.str.startswith('a')
+     s[s.index.str.startswith('a')]
+
+- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`)
 - ``drop`` function can now accept ``errors`` keyword to suppress ValueError raised when any of label does not exist in the target data. (:issue:`6736`)
 
   .. ipython:: python
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -10,6 +10,7 @@
 import pandas.tslib as tslib
 import pandas.lib as lib
 from pandas.util.decorators import Appender, cache_readonly
+from pandas.core.strings import StringMethods
 
 
 _shared_docs = dict()
@@ -497,6 +498,24 @@ def searchsorted(self, key, side='left'):
         #### needs tests/doc-string
         return self.values.searchsorted(key, side=side)
 
+    # string methods: build the .str accessor shared by Series and Index
+    def _make_str_accessor(self):
+        from pandas.core.series import Series
+        from pandas.core.index import Index
+        if isinstance(self, Series) and not com.is_object_dtype(self.dtype):
+            # Only the dtype is checked here: ideally any Series holding
+            # non-string values would be rejected, but that isn't practical
+            # for performance reasons until we have a str dtype (GH 9343)
+            raise AttributeError("Can only use .str accessor with string "
+                                 "values, which use np.object_ dtype in "
+                                 "pandas")
+        elif isinstance(self, Index) and self.inferred_type != 'string':
+            raise AttributeError("Can only use .str accessor with string "
+                                 "values (i.e. inferred_type is 'string')")
+        return StringMethods(self)
+
+    str = AccessorProperty(StringMethods, _make_str_accessor)
+
     _shared_docs['drop_duplicates'] = (
         """Return %(klass)s with duplicate values removed
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -28,7 +28,6 @@
 from pandas.core import generic, base
 from pandas.core.internals import SingleBlockManager
 from pandas.core.categorical import Categorical, CategoricalAccessor
-from pandas.core.strings import StringMethods
 from pandas.tseries.common import (maybe_to_datetimelike,
                                    CombinedDatetimelikeProperties)
 from pandas.tseries.index import DatetimeIndex
@@ -2494,21 +2493,6 @@ def to_period(self, freq=None, copy=True):
         return self._constructor(new_values,
                                  index=new_index).__finalize__(self)
 
-    #------------------------------------------------------------------------------
-    # string methods
-
-    def _make_str_accessor(self):
-        if not com.is_object_dtype(self.dtype):
-            # this really should exclude all series with any non-string values,
-            # but that isn't practical for performance reasons until we have a
-            # str dtype (GH 9343)
-            raise AttributeError("Can only use .str accessor with string "
-                                 "values, which use np.object_ dtype in "
-                                 "pandas")
-        return StringMethods(self)
-
-    str = base.AccessorProperty(StringMethods, _make_str_accessor)
-
     #------------------------------------------------------------------------------
     # Datetimelike delegation methods
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 from pandas.compat import zip
-from pandas.core.common import isnull, _values_from_object
+from pandas.core.common import isnull, _values_from_object, is_bool_dtype
 import pandas.compat as compat
 from pandas.util.decorators import Appender
 import re
@@ -632,9 +632,10 @@ def str_split(arr, pat=None, n=None, return_type='series'):
     pat : string, default None
         String or regular expression to split on. If None, splits on whitespace
     n : int, default None (all)
-    return_type : {'series', 'frame'}, default 'series
+    return_type : {'series', 'index', 'frame'}, default 'series'
         If frame, returns a DataFrame (elements are strings)
-        If series, returns an Series (elements are lists of strings).
+        If series or index, returns the same type as the original object
+        (elements are lists of strings).
 
     Notes
     -----
@@ -646,9 +647,13 @@ def str_split(arr, pat=None, n=None, return_type='series'):
     """
     from pandas.core.series import Series
     from pandas.core.frame import DataFrame
+    from pandas.core.index import Index
 
-    if return_type not in ('series', 'frame'):
-        raise ValueError("return_type must be {'series', 'frame'}")
+    if return_type not in ('series', 'index', 'frame'):
+        raise ValueError("return_type must be {'series', 'index', 'frame'}")
+    if return_type == 'frame' and isinstance(arr, Index):
+        raise ValueError("return_type='frame' is not supported for string "
+                         "methods on Index")
     if pat is None:
         if n is None or n == 0:
             n = -1
@@ -928,9 +933,9 @@ def do_copy(target):
 class StringMethods(object):
 
     """
-    Vectorized string functions for Series. NAs stay NA unless handled
-    otherwise by a particular method. Patterned after Python's string methods,
-    with some inspiration from R's stringr package.
+    Vectorized string functions for Series and Index. NAs stay NA unless
+    handled otherwise by a particular method. Patterned after Python's string
+    methods, with some inspiration from R's stringr package.
 
     Examples
     --------
@@ -959,11 +964,18 @@ def __iter__(self):
     def _wrap_result(self, result):
         from pandas.core.series import Series
         from pandas.core.frame import DataFrame
+        from pandas.core.index import Index
 
         if not hasattr(result, 'ndim'):
             return result
         elif result.ndim == 1:
             name = getattr(result, 'name', None)
+            if isinstance(self.series, Index):
+                # if result is a boolean np.array, return the np.array
+                # instead of wrapping it into a boolean Index (GH 8875)
+                if is_bool_dtype(result):
+                    return result
+                return Index(result, name=name or self.series.name)
             return Series(result, index=self.series.index,
                           name=name or self.series.name)
         else:
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -1197,6 +1197,40 @@ def test_join_self(self):
             for kind in kinds:
                 joined = res.join(res, how=kind)
                 self.assertIs(res, joined)
+    def test_str_attribute(self):
+        # GH9068: Index.str results must match the built-in str methods
+        methods = ['strip', 'rstrip', 'lstrip']
+        idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
+        for method in methods:
+            expected = Index([getattr(str, method)(x) for x in idx.values])
+            tm.assert_index_equal(getattr(Index.str, method)(idx.str), expected)
+
+        # these index types must raise AttributeError when .str is accessed
+        indices = [Index(range(5)),
+                   tm.makeDateIndex(10),
+                   MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]),
+                   PeriodIndex(start='2000', end='2010', freq='A')]
+        for idx in indices:
+            with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'):
+                idx.str.repeat(2)
+
+        idx = Index(['a b c', 'd e', 'f'])
+        expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
+        tm.assert_index_equal(idx.str.split(), expected)
+        tm.assert_index_equal(idx.str.split(return_type='series'), expected)
+        # return_type 'index' is an alias for 'series'
+        tm.assert_index_equal(idx.str.split(return_type='index'), expected)
+        with self.assertRaisesRegexp(ValueError, 'not supported'):
+            idx.str.split(return_type='frame')
+
+        # boolean results come back as np.ndarray, not as a boolean Index
+        idx = Index(['a1', 'a2', 'b1', 'b2'])
+        expected = np.array([True, True, False, False])
+        self.assert_array_equal(idx.str.startswith('a'), expected)
+        self.assertIsInstance(idx.str.startswith('a'), np.ndarray)
+        s = Series(range(4), index=idx)
+        expected = Series(range(2), index=['a1', 'a2'])
+        tm.assert_series_equal(s[s.index.str.startswith('a')], expected)
 
     def test_indexing_doesnt_change_class(self):
         idx = Index([1, 2, 3, 'a', 'b', 'c'])
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -4933,6 +4933,19 @@ def test_to_csv_path_is_none(self):
         csv_str = s.to_csv(path=None)
         self.assertIsInstance(csv_str, str)
 
+    def test_str_attribute(self):
+        # GH9068: Series.str results must match the built-in str methods
+        methods = ['strip', 'rstrip', 'lstrip']
+        s = Series([' jack', 'jill ', ' jesse ', 'frank'])
+        for method in methods:
+            expected = Series([getattr(str, method)(x) for x in s.values])
+            assert_series_equal(getattr(Series.str, method)(s.str), expected)
+
+        # an integer Series must raise AttributeError when .str is accessed
+        s = Series(range(5))
+        with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'):
+            s.str.repeat(2)
+
     def test_clip(self):
         val = self.ts.median()