Merge pull request #10045 from mortada/str_index

jreback · jreback · commit 6178eaa9c93b · 2015-05-08T13:46:06.000-04:00
ENH: support StringMethods index and rindex
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -536,6 +536,7 @@ strings and apply several methods to it. These can be acccessed like
    Series.str.find
    Series.str.findall
    Series.str.get
+   Series.str.index
    Series.str.join
    Series.str.len
    Series.str.ljust
@@ -548,6 +549,7 @@ strings and apply several methods to it. These can be acccessed like
    Series.str.repeat
    Series.str.replace
    Series.str.rfind
+   Series.str.rindex
    Series.str.rjust
    Series.str.rpartition
    Series.str.rstrip
diff --git a/doc/source/text.rst b/doc/source/text.rst
@@ -268,6 +268,8 @@ Method Summary
     :meth:`~Series.str.upper`,Equivalent to ``str.upper``
     :meth:`~Series.str.find`,Equivalent to ``str.find``
     :meth:`~Series.str.rfind`,Equivalent to ``str.rfind``
+    :meth:`~Series.str.index`,Equivalent to ``str.index``
+    :meth:`~Series.str.rindex`,Equivalent to ``str.rindex``
     :meth:`~Series.str.capitalize`,Equivalent to ``str.capitalize``
     :meth:`~Series.str.swapcase`,Equivalent to ``str.swapcase``
     :meth:`~Series.str.normalize`,Return Unicode normal form. Equivalent to ``unicodedata.normalize``
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -189,12 +189,13 @@ String Methods Enhancements
 :ref:`Continuing from v0.16.0 <whatsnew_0160.enhancements.string>`, following
 enhancements are performed to make string operation easier.
 
-- Following new methods are accesible via ``.str`` accessor to apply the function to each values. This is intended to make it more consistent with standard methods on strings. (:issue:`9766`, :issue:`9773`, :issue:`10031`)
+- The following new methods are accessible via the ``.str`` accessor to apply the function to each value. This is intended to make it more consistent with standard methods on strings. (:issue:`9766`, :issue:`9773`, :issue:`10031`, :issue:`10045`)
 
   ================  ===============  ===============  ===============  ================
   ..                ..               Methods          ..               ..
   ================  ===============  ===============  ===============  ================
   ``capitalize()``  ``swapcase()``   ``normalize()``  ``partition()``  ``rpartition()``
+  ``index()``       ``rindex()``
   ================  ===============  ===============  ===============  ================
 
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -638,6 +638,26 @@ def str_find(arr, sub, start=0, end=None, side='left'):
     return _na_map(f, arr, dtype=int)
 
 
+def str_index(arr, sub, start=0, end=None, side='left'):
+    # Call ``str.index`` (side='left') or ``str.rindex`` (side='right') on
+    # each element via ``_na_map``; the builtin raises ValueError on a miss.
+    if not isinstance(sub, compat.string_types):
+        raise TypeError('expected a string object, not {0}'
+                        .format(type(sub).__name__))
+
+    if side not in ('left', 'right'):  # pragma: no cover
+        raise ValueError('Invalid side')
+    method = 'index' if side == 'left' else 'rindex'
+
+    # ``end`` is left out of the call altogether when it was not supplied.
+    bounds = (start,) if end is None else (start, end)
+
+    def f(x):
+        return getattr(x, method)(sub, *bounds)
+
+    return _na_map(f, arr, dtype=int)
+
+
 def str_pad(arr, width, side='left', fillchar=' '):
     """
     Pad strings in the Series/Index with an additional character to
@@ -1327,6 +1347,42 @@ def normalize(self, form):
         result = _na_map(f, self.series)
         return self._wrap_result(result)
 
+    _shared_docs['index'] = ("""
+    Return %(side)s indexes in each string where the substring is fully
+    contained between [start:end]. This is the same as ``str.%(similar)s``
+    except instead of returning -1, it raises a ValueError when the substring
+    is not found. Equivalent to standard ``str.%(method)s``.
+
+    Parameters
+    ----------
+    sub : str
+        Substring being searched
+    start : int, default 0
+        Left edge index
+    end : int, default None
+        Right edge index; None searches up to the end of each string
+
+    Returns
+    -------
+    found : Series/Index of objects
+
+    See Also
+    --------
+    %(also)s
+    """)
+
+    @Appender(_shared_docs['index'] % dict(side='lowest', similar='find', method='index',
+              also='rindex : Return highest indexes in each string'))
+    def index(self, sub, start=0, end=None):
+        result = str_index(self.series, sub, start=start, end=end, side='left')
+        return self._wrap_result(result)
+
+    @Appender(_shared_docs['index'] % dict(side='highest', similar='rfind', method='rindex',
+              also='index : Return lowest indexes in each string'))
+    def rindex(self, sub, start=0, end=None):
+        result = str_index(self.series, sub, start=start, end=end, side='right')
+        return self._wrap_result(result)
+
     _shared_docs['len'] = ("""
     Compute length of each string in the Series/Index.
 
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -908,6 +908,53 @@ def test_find_nan(self):
         result = values.str.rfind('EF', 3, 6)
         tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))
 
+    def test_index(self):
+        # Each row is (method name, positional arguments, expected result
+        # for the four sample strings below).
+        cases = [
+            # search the whole string
+            ('index', ('EF',), [4, 3, 1, 0]),
+            ('rindex', ('EF',), [4, 5, 7, 4]),
+            # search from ``start`` onwards
+            ('index', ('EF', 3), [4, 3, 7, 4]),
+            ('rindex', ('EF', 3), [4, 5, 7, 4]),
+            # search between ``start`` and ``end``
+            ('index', ('E', 4, 8), [4, 5, 7, 4]),
+            ('rindex', ('E', 0, 5), [4, 3, 1, 4]),
+        ]
+
+        for klass in [Series, Index]:
+            s = klass(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF'])
+
+            for name, args, positions in cases:
+                result = getattr(s.str, name)(*args)
+
+                # compare with the hand-written expectation ...
+                tm.assert_array_equal(result, klass(positions))
+
+                # ... and with the builtin method applied to each value
+                expected = np.array([getattr(v, name)(*args)
+                                     for v in s.values])
+                tm.assert_array_equal(result.values, expected)
+
+            # a substring missing from any element raises
+            with tm.assertRaisesRegexp(ValueError, "substring not found"):
+                result = s.str.index('DE')
+
+            # a non-string ``sub`` is rejected
+            msg = "expected a string object, not int"
+            with tm.assertRaisesRegexp(TypeError, msg):
+                result = s.str.index(0)
+
+        # test with nan
+        s = Series(['abcb', 'ab', 'bcbe', np.nan])
+
+        result = s.str.index('b')
+        tm.assert_array_equal(result, Series([1, 1, 0, np.nan]))
+
+        result = s.str.rindex('b')
+        tm.assert_array_equal(result, Series([3, 1, 2, np.nan]))
+
     def test_pad(self):
         values = Series(['a', 'b', NA, 'c', NA, 'eeeeee'])