From bc66f436c94557c7ecabc8a8679aeed7801782f3 Mon Sep 17 00:00:00 2001 From: Mortada Mehyar Date: Sat, 6 Jun 2015 11:22:48 -0700 Subject: [PATCH] ENH: added rsplit to StringMethods --- doc/source/api.rst | 1 + doc/source/text.rst | 14 ++++ doc/source/whatsnew/v0.16.2.txt | 2 + pandas/core/strings.py | 34 +++++++++ pandas/tests/test_strings.py | 131 +++++++++++++++++++++++++++++--- 5 files changed, 173 insertions(+), 9 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index f5ba03afc9f19..5b6e536af0501 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -563,6 +563,7 @@ strings and apply several methods to it. These can be acccessed like Series.str.slice Series.str.slice_replace Series.str.split + Series.str.rsplit Series.str.startswith Series.str.strip Series.str.swapcase diff --git a/doc/source/text.rst b/doc/source/text.rst index d40445d8490f7..9bbb152f5a69b 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -88,6 +88,19 @@ Easy to expand this to return a DataFrame using ``expand``. s2.str.split('_', expand=True) +It is also possible to limit the number of splits: + +.. ipython:: python + + s2.str.split('_', expand=True, n=1) + +``rsplit`` is similar to ``split`` except it works in the reverse direction, +i.e., from the end of the string to the beginning of the string: + +.. ipython:: python + + s2.str.rsplit('_', expand=True, n=1) + Methods like ``replace`` and ``findall`` take `regular expressions `__, too: @@ -239,6 +252,7 @@ Method Summary :meth:`~Series.str.cat`,Concatenate strings :meth:`~Series.str.split`,Split strings on delimiter + :meth:`~Series.str.rsplit`,Split strings on delimiter working from the end of the string :meth:`~Series.str.get`,Index into each element (retrieve i-th element) :meth:`~Series.str.join`,Join strings in each element of the Series with passed separator :meth:`~Series.str.contains`,Return boolean array if each string contains pattern/regex diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index 9421ab0f841ac..f9214c175ca24 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -79,6 +79,8 @@ See the :ref:`documentation ` for more. (:issue:`10129`) .. _magrittr: https://github.com/smbache/magrittr .. _R: http://www.r-project.org +- Added `rsplit` to Index/Series StringMethods (:issue:`10303`) + .. _whatsnew_0162.enhancements.other: Other enhancements diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 78ae4fba02033..59894d0800895 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -734,6 +734,35 @@ def str_split(arr, pat=None, n=None): return res +def str_rsplit(arr, pat=None, n=None): + """ + Split each string in the Series/Index by the given delimiter + string, starting at the end of the string and working to the front. + Equivalent to :meth:`str.rsplit`. + + .. versionadded:: 0.16.2 + + Parameters + ---------- + pat : string, default None + Separator to split on. If None, splits on whitespace + n : int, default -1 (all) + None, 0 and -1 will be interpreted as return all splits + expand : bool, default False + * If True, return DataFrame/MultiIndex expanding dimensionality. + * If False, return Series/Index. + + Returns + ------- + split : Series/Index or DataFrame/MultiIndex of objects + """ + if n is None or n == 0: + n = -1 + f = lambda x: x.rsplit(pat, n) + res = _na_map(f, arr) + return res + + def str_slice(arr, start=None, stop=None, step=None): """ Slice substrings from each element in the Series/Index @@ -1115,6 +1144,11 @@ def split(self, pat=None, n=-1, expand=False): result = str_split(self.series, pat, n=n) return self._wrap_result_expand(result, expand=expand) + @copy(str_rsplit) + def rsplit(self, pat=None, n=-1, expand=False): + result = str_rsplit(self.series, pat, n=n) + return self._wrap_result_expand(result, expand=expand) + _shared_docs['str_partition'] = (""" Split the string at the %(side)s occurrence of `sep`, and return 3 elements containing the part before the separator, the separator itself, diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index b0d8d89d65cf2..a66410320e816 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -676,6 +676,7 @@ def test_empty_str_methods(self): tm.assert_series_equal(empty_str, empty.str.pad(42)) tm.assert_series_equal(empty_str, empty.str.center(42)) tm.assert_series_equal(empty_list, empty.str.split('a')) + tm.assert_series_equal(empty_list, empty.str.rsplit('a')) tm.assert_series_equal(empty_list, empty.str.partition('a', expand=False)) tm.assert_series_equal(empty_list, empty.str.rpartition('a', expand=False)) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) @@ -1212,15 +1213,15 @@ def test_split(self): # mixed mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), None, 1, 2.]) - rs = mixed.str.split('_') - xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, + result = mixed.str.split('_') + exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, NA, NA, NA]) - tm.assert_isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) + tm.assert_isinstance(result, Series) + tm.assert_almost_equal(result, exp) - rs = mixed.str.split('_', expand=False) - tm.assert_isinstance(rs, Series) - tm.assert_almost_equal(rs, xp) + result = mixed.str.split('_', expand=False) + tm.assert_isinstance(result, Series) + tm.assert_almost_equal(result, exp) # unicode values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')]) @@ -1234,12 +1235,75 @@ def test_split(self): result = values.str.split('_', expand=False) tm.assert_series_equal(result, exp) + # regex split + values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')]) + result = values.str.split('[,_]') + exp = Series([[u('a'), u('b'), u('c')], + [u('c'), u('d'), u('e')], NA, + [u('f'), u('g'), u('h')]]) + tm.assert_series_equal(result, exp) + + def test_rsplit(self): + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + result = values.str.rsplit('_') + exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']]) + tm.assert_series_equal(result, exp) + + # more than one char + values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) + result = values.str.rsplit('__') + tm.assert_series_equal(result, exp) + + result = values.str.rsplit('__', expand=False) + tm.assert_series_equal(result, exp) + + # mixed + mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(), + None, 1, 2.]) + result = mixed.str.rsplit('_') + exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA, + NA, NA, NA]) + tm.assert_isinstance(result, Series) + tm.assert_almost_equal(result, exp) + + result = mixed.str.rsplit('_', expand=False) + tm.assert_isinstance(result, Series) + tm.assert_almost_equal(result, exp) + + # unicode + values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')]) + result = values.str.rsplit('_') + exp = Series([[u('a'), u('b'), u('c')], + [u('c'), u('d'), u('e')], NA, + [u('f'), u('g'), u('h')]]) + tm.assert_series_equal(result, exp) + + result = values.str.rsplit('_', expand=False) + tm.assert_series_equal(result, exp) + + # regex split is not supported by rsplit + values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')]) + result = values.str.rsplit('[,_]') + exp = Series([[u('a,b_c')], + [u('c_d,e')], + NA, + [u('f,g,h')]]) + tm.assert_series_equal(result, exp) + + # setting max number of splits, make sure it's from reverse + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + result = values.str.rsplit('_', n=1) + exp = Series([['a_b', 'c'], ['c_d', 'e'], NA, ['f_g', 'h']]) + tm.assert_series_equal(result, exp) + def test_split_noargs(self): # #1859 s = Series(['Wes McKinney', 'Travis Oliphant']) - result = s.str.split() - self.assertEqual(result[1], ['Travis', 'Oliphant']) + expected = ['Travis', 'Oliphant'] + self.assertEqual(result[1], expected) + result = s.str.rsplit() + self.assertEqual(result[1], expected) def test_split_maxsplit(self): # re.split 0, str.split -1 @@ -1348,6 +1412,55 @@ def test_split_to_multiindex_expand(self): with tm.assertRaisesRegexp(ValueError, "expand must be"): idx.str.split('_', return_type="some_invalid_type") + def test_rsplit_to_dataframe_expand(self): + s = Series(['nosplit', 'alsonosplit']) + result = s.str.rsplit('_', expand=True) + exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) + tm.assert_frame_equal(result, exp) + + s = Series(['some_equal_splits', 'with_no_nans']) + result = s.str.rsplit('_', expand=True) + exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'], + 2: ['splits', 'nans']}) + tm.assert_frame_equal(result, exp) + + result = s.str.rsplit('_', expand=True, n=2) + exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'], + 2: ['splits', 'nans']}) + tm.assert_frame_equal(result, exp) + + result = s.str.rsplit('_', expand=True, n=1) + exp = DataFrame({0: ['some_equal', 'with_no'], + 1: ['splits', 'nans']}) + tm.assert_frame_equal(result, exp) + + s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) + result = s.str.rsplit('_', expand=True) + exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, + index=['preserve', 'me']) + tm.assert_frame_equal(result, exp) + + def test_rsplit_to_multiindex_expand(self): + idx = Index(['nosplit', 'alsonosplit']) + result = idx.str.rsplit('_', expand=True) + exp = Index([np.array(['nosplit']), np.array(['alsonosplit'])]) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 1) + + idx = Index(['some_equal_splits', 'with_no_nans']) + result = idx.str.rsplit('_', expand=True) + exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), + ('with', 'no', 'nans')]) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 3) + + idx = Index(['some_equal_splits', 'with_no_nans']) + result = idx.str.rsplit('_', expand=True, n=1) + exp = MultiIndex.from_tuples([('some_equal', 'splits'), + ('with_no', 'nans')]) + tm.assert_index_equal(result, exp) + self.assertEqual(result.nlevels, 2) + def test_partition_series(self): values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])