Merge pull request #10303 from mortada/str_rsplit

jreback · jreback · commit b2065484a9e7 · 2015-06-08T20:38:32.000-04:00
ENH: added rsplit to StringMethods
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -567,6 +567,7 @@ strings and apply several methods to it. These can be acccessed like
    Series.str.slice
    Series.str.slice_replace
    Series.str.split
+   Series.str.rsplit
    Series.str.startswith
    Series.str.strip
    Series.str.swapcase
diff --git a/doc/source/text.rst b/doc/source/text.rst
@@ -88,6 +88,19 @@ Easy to expand this to return a DataFrame using ``expand``.
 
    s2.str.split('_', expand=True)
 
+It is also possible to limit the number of splits:
+
+.. ipython:: python
+
+   s2.str.split('_', expand=True, n=1)
+
+``rsplit`` is similar to ``split`` except it works in the reverse direction,
+i.e., from the end of the string to the beginning of the string:
+
+.. ipython:: python
+
+   s2.str.rsplit('_', expand=True, n=1)
+
 Methods like ``replace`` and ``findall`` take `regular expressions
 <https://docs.python.org/2/library/re.html>`__, too:
 
@@ -239,6 +252,7 @@ Method Summary
 
     :meth:`~Series.str.cat`,Concatenate strings
     :meth:`~Series.str.split`,Split strings on delimiter
+    :meth:`~Series.str.rsplit`,Split strings on delimiter working from the end of the string
     :meth:`~Series.str.get`,Index into each element (retrieve i-th element)
     :meth:`~Series.str.join`,Join strings in each element of the Series with passed separator
     :meth:`~Series.str.contains`,Return boolean array if each string contains pattern/regex
diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt
@@ -79,6 +79,8 @@ See the :ref:`documentation <basics.pipe>` for more. (:issue:`10129`)
 .. _magrittr: https://github.com/smbache/magrittr
 .. _R: http://www.r-project.org
 
+- Added ``rsplit`` to Index/Series StringMethods (:issue:`10303`)
+
 .. _whatsnew_0162.enhancements.other:
 
 Other enhancements
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -734,6 +734,35 @@ def str_split(arr, pat=None, n=None):
     return res
 
 
+def str_rsplit(arr, pat=None, n=None):
+    """
+    Split each string in the Series/Index by the given delimiter
+    string, starting at the end of the string and working to the front.
+    Equivalent to :meth:`str.rsplit`, so `pat` is literal, not a regex.
+
+    .. versionadded:: 0.16.2
+
+    Parameters
+    ----------
+    pat : string, default None
+        Separator to split on. If None, splits on whitespace.
+    n : int, default -1 (all)
+        Maximum number of splits; None, 0 and -1 all mean "no limit".
+    expand : bool, default False
+        * If True, return DataFrame/MultiIndex expanding dimensionality.
+        * If False, return Series/Index.
+
+    Returns
+    -------
+    split : Series/Index or DataFrame/MultiIndex of objects
+    """
+    if n is None or n == 0:
+        n = -1  # -1 tells str.rsplit to apply no limit
+    f = lambda x: x.rsplit(pat, n)
+    res = _na_map(f, arr)
+    return res
+
+
 def str_slice(arr, start=None, stop=None, step=None):
     """
     Slice substrings from each element in the Series/Index
@@ -1115,6 +1144,11 @@ def split(self, pat=None, n=-1, expand=False):
         result = str_split(self.series, pat, n=n)
         return self._wrap_result_expand(result, expand=expand)
 
+    @copy(str_rsplit)  # presumably reuses str_rsplit's docstring -- confirm
+    def rsplit(self, pat=None, n=-1, expand=False):
+        result = str_rsplit(self.series, pat, n=n)  # per-element lists
+        return self._wrap_result_expand(result, expand=expand)
+
     _shared_docs['str_partition'] = ("""
     Split the string at the %(side)s occurrence of `sep`, and return 3 elements
     containing the part before the separator, the separator itself,
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -676,6 +676,7 @@ def test_empty_str_methods(self):
         tm.assert_series_equal(empty_str, empty.str.pad(42))
         tm.assert_series_equal(empty_str, empty.str.center(42))
         tm.assert_series_equal(empty_list, empty.str.split('a'))
+        tm.assert_series_equal(empty_list, empty.str.rsplit('a'))
         tm.assert_series_equal(empty_list, empty.str.partition('a', expand=False))
         tm.assert_series_equal(empty_list, empty.str.rpartition('a', expand=False))
         tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
@@ -1212,15 +1213,15 @@ def test_split(self):
         # mixed
         mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(),
                         None, 1, 2.])
-        rs = mixed.str.split('_')
-        xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
+        result = mixed.str.split('_')
+        exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
                      NA, NA, NA])
-        tm.assert_isinstance(rs, Series)
-        tm.assert_almost_equal(rs, xp)
+        tm.assert_isinstance(result, Series)
+        tm.assert_almost_equal(result, exp)
 
-        rs = mixed.str.split('_', expand=False)
-        tm.assert_isinstance(rs, Series)
-        tm.assert_almost_equal(rs, xp)
+        result = mixed.str.split('_', expand=False)
+        tm.assert_isinstance(result, Series)
+        tm.assert_almost_equal(result, exp)
 
         # unicode
         values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
@@ -1234,12 +1235,75 @@ def test_split(self):
         result = values.str.split('_', expand=False)
         tm.assert_series_equal(result, exp)
 
+        # regex split
+        values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
+        result = values.str.split('[,_]')
+        exp = Series([[u('a'), u('b'), u('c')],
+                      [u('c'), u('d'), u('e')], NA,
+                      [u('f'), u('g'), u('h')]])
+        tm.assert_series_equal(result, exp)
+
+    def test_rsplit(self):
+        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+        result = values.str.rsplit('_')
+        exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']])
+        tm.assert_series_equal(result, exp)
+
+        # separator longer than one character
+        values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
+        result = values.str.rsplit('__')
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rsplit('__', expand=False)
+        tm.assert_series_equal(result, exp)
+
+        # mixed dtypes: non-string entries are expected to come back as NA
+        mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(),
+                        None, 1, 2.])
+        result = mixed.str.rsplit('_')
+        exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
+                     NA, NA, NA])
+        tm.assert_isinstance(result, Series)
+        tm.assert_almost_equal(result, exp)
+
+        result = mixed.str.rsplit('_', expand=False)
+        tm.assert_isinstance(result, Series)
+        tm.assert_almost_equal(result, exp)
+
+        # unicode input
+        values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
+        result = values.str.rsplit('_')
+        exp = Series([[u('a'), u('b'), u('c')],
+                      [u('c'), u('d'), u('e')], NA,
+                      [u('f'), u('g'), u('h')]])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rsplit('_', expand=False)
+        tm.assert_series_equal(result, exp)
+
+        # no regex support: '[,_]' is taken literally and never matches
+        values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
+        result = values.str.rsplit('[,_]')
+        exp = Series([[u('a,b_c')],
+                      [u('c_d,e')],
+                      NA,
+                      [u('f,g,h')]])
+        tm.assert_series_equal(result, exp)
+
+        # n caps the number of splits, counted from the right-hand end
+        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+        result = values.str.rsplit('_', n=1)
+        exp = Series([['a_b', 'c'], ['c_d', 'e'], NA, ['f_g', 'h']])
+        tm.assert_series_equal(result, exp)
+
     def test_split_noargs(self):
         # #1859
         s = Series(['Wes McKinney', 'Travis  Oliphant'])
-
         result = s.str.split()
-        self.assertEqual(result[1], ['Travis', 'Oliphant'])
+        expected = ['Travis', 'Oliphant']
+        self.assertEqual(result[1], expected)
+        result = s.str.rsplit()  # rsplit must also split on whitespace runs
+        self.assertEqual(result[1], expected)
 
     def test_split_maxsplit(self):
         # re.split 0, str.split -1
@@ -1348,6 +1412,55 @@ def test_split_to_multiindex_expand(self):
         with tm.assertRaisesRegexp(ValueError, "expand must be"):
             idx.str.split('_', return_type="some_invalid_type")
 
+    def test_rsplit_to_dataframe_expand(self):
+        s = Series(['nosplit', 'alsonosplit'])  # separator never occurs
+        result = s.str.rsplit('_', expand=True)
+        exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
+        tm.assert_frame_equal(result, exp)
+
+        s = Series(['some_equal_splits', 'with_no_nans'])
+        result = s.str.rsplit('_', expand=True)  # unlimited splits
+        exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'],
+                         2: ['splits', 'nans']})
+        tm.assert_frame_equal(result, exp)
+
+        result = s.str.rsplit('_', expand=True, n=2)  # n covers every '_'
+        exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'],
+                         2: ['splits', 'nans']})
+        tm.assert_frame_equal(result, exp)
+
+        result = s.str.rsplit('_', expand=True, n=1)  # one split, from right
+        exp = DataFrame({0: ['some_equal', 'with_no'],
+                         1: ['splits', 'nans']})
+        tm.assert_frame_equal(result, exp)
+
+        s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
+        result = s.str.rsplit('_', expand=True)
+        exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
+                        index=['preserve', 'me'])  # input index is kept
+        tm.assert_frame_equal(result, exp)
+
+    def test_rsplit_to_multiindex_expand(self):
+        idx = Index(['nosplit', 'alsonosplit'])  # separator never occurs
+        result = idx.str.rsplit('_', expand=True)
+        exp = Index([np.array(['nosplit']), np.array(['alsonosplit'])])
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.nlevels, 1)
+
+        idx = Index(['some_equal_splits', 'with_no_nans'])
+        result = idx.str.rsplit('_', expand=True)  # unlimited splits
+        exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
+                                      ('with', 'no', 'nans')])
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.nlevels, 3)
+
+        idx = Index(['some_equal_splits', 'with_no_nans'])
+        result = idx.str.rsplit('_', expand=True, n=1)  # one split, from right
+        exp = MultiIndex.from_tuples([('some_equal', 'splits'),
+                                      ('with_no', 'nans')])
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.nlevels, 2)
+
     def test_partition_series(self):
         values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])