Skip to content

Commit b206548

Browse files
committed
Merge pull request #10303 from mortada/str_rsplit
ENH: added rsplit to StringMethods
2 parents b0757dc + bc66f43 commit b206548

File tree

5 files changed

+173
-9
lines changed

5 files changed

+173
-9
lines changed

doc/source/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,7 @@ strings and apply several methods to it. These can be accessed like
567567
Series.str.slice
568568
Series.str.slice_replace
569569
Series.str.split
570+
Series.str.rsplit
570571
Series.str.startswith
571572
Series.str.strip
572573
Series.str.swapcase

doc/source/text.rst

+14
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,19 @@ Easy to expand this to return a DataFrame using ``expand``.
8888
8989
s2.str.split('_', expand=True)
9090
91+
It is also possible to limit the number of splits:
92+
93+
.. ipython:: python
94+
95+
s2.str.split('_', expand=True, n=1)
96+
97+
``rsplit`` is similar to ``split`` except it works in the reverse direction,
98+
i.e., from the end of the string to the beginning of the string:
99+
100+
.. ipython:: python
101+
102+
s2.str.rsplit('_', expand=True, n=1)
103+
91104
Methods like ``replace`` and ``findall`` take `regular expressions
92105
<https://docs.python.org/2/library/re.html>`__, too:
93106

@@ -239,6 +252,7 @@ Method Summary
239252

240253
:meth:`~Series.str.cat`,Concatenate strings
241254
:meth:`~Series.str.split`,Split strings on delimiter
255+
:meth:`~Series.str.rsplit`,Split strings on delimiter working from the end of the string
242256
:meth:`~Series.str.get`,Index into each element (retrieve i-th element)
243257
:meth:`~Series.str.join`,Join strings in each element of the Series with passed separator
244258
:meth:`~Series.str.contains`,Return boolean array if each string contains pattern/regex

doc/source/whatsnew/v0.16.2.txt

+2
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ See the :ref:`documentation <basics.pipe>` for more. (:issue:`10129`)
7979
.. _magrittr: https://github.com/smbache/magrittr
8080
.. _R: http://www.r-project.org
8181

82+
- Added ``rsplit`` to Index/Series StringMethods (:issue:`10303`)
83+
8284
.. _whatsnew_0162.enhancements.other:
8385

8486
Other enhancements

pandas/core/strings.py

+34
Original file line numberDiff line numberDiff line change
@@ -734,6 +734,35 @@ def str_split(arr, pat=None, n=None):
734734
return res
735735

736736

737+
def str_rsplit(arr, pat=None, n=None):
738+
"""
739+
Split each string in the Series/Index by the given delimiter
740+
string, starting at the end of the string and working to the front.
741+
Equivalent to :meth:`str.rsplit`.
742+
743+
.. versionadded:: 0.16.2
744+
745+
Parameters
746+
----------
747+
pat : string, default None
748+
Separator to split on. If None, splits on whitespace
749+
n : int, default -1 (all)
750+
None, 0 and -1 will be interpreted as return all splits
751+
expand : bool, default False
752+
* If True, return DataFrame/MultiIndex expanding dimensionality.
753+
* If False, return Series/Index.
754+
755+
Returns
756+
-------
757+
split : Series/Index or DataFrame/MultiIndex of objects
758+
"""
759+
if n is None or n == 0:
760+
n = -1
761+
f = lambda x: x.rsplit(pat, n)
762+
res = _na_map(f, arr)
763+
return res
764+
765+
737766
def str_slice(arr, start=None, stop=None, step=None):
738767
"""
739768
Slice substrings from each element in the Series/Index
@@ -1115,6 +1144,11 @@ def split(self, pat=None, n=-1, expand=False):
11151144
result = str_split(self.series, pat, n=n)
11161145
return self._wrap_result_expand(result, expand=expand)
11171146

1147+
@copy(str_rsplit)
1148+
def rsplit(self, pat=None, n=-1, expand=False):
1149+
result = str_rsplit(self.series, pat, n=n)
1150+
return self._wrap_result_expand(result, expand=expand)
1151+
11181152
_shared_docs['str_partition'] = ("""
11191153
Split the string at the %(side)s occurrence of `sep`, and return 3 elements
11201154
containing the part before the separator, the separator itself,

pandas/tests/test_strings.py

+122-9
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,7 @@ def test_empty_str_methods(self):
676676
tm.assert_series_equal(empty_str, empty.str.pad(42))
677677
tm.assert_series_equal(empty_str, empty.str.center(42))
678678
tm.assert_series_equal(empty_list, empty.str.split('a'))
679+
tm.assert_series_equal(empty_list, empty.str.rsplit('a'))
679680
tm.assert_series_equal(empty_list, empty.str.partition('a', expand=False))
680681
tm.assert_series_equal(empty_list, empty.str.rpartition('a', expand=False))
681682
tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
@@ -1212,15 +1213,15 @@ def test_split(self):
12121213
# mixed
12131214
mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(),
12141215
None, 1, 2.])
1215-
rs = mixed.str.split('_')
1216-
xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
1216+
result = mixed.str.split('_')
1217+
exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
12171218
NA, NA, NA])
1218-
tm.assert_isinstance(rs, Series)
1219-
tm.assert_almost_equal(rs, xp)
1219+
tm.assert_isinstance(result, Series)
1220+
tm.assert_almost_equal(result, exp)
12201221

1221-
rs = mixed.str.split('_', expand=False)
1222-
tm.assert_isinstance(rs, Series)
1223-
tm.assert_almost_equal(rs, xp)
1222+
result = mixed.str.split('_', expand=False)
1223+
tm.assert_isinstance(result, Series)
1224+
tm.assert_almost_equal(result, exp)
12241225

12251226
# unicode
12261227
values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
@@ -1234,12 +1235,75 @@ def test_split(self):
12341235
result = values.str.split('_', expand=False)
12351236
tm.assert_series_equal(result, exp)
12361237

1238+
# regex split
1239+
values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
1240+
result = values.str.split('[,_]')
1241+
exp = Series([[u('a'), u('b'), u('c')],
1242+
[u('c'), u('d'), u('e')], NA,
1243+
[u('f'), u('g'), u('h')]])
1244+
tm.assert_series_equal(result, exp)
1245+
1246+
def test_rsplit(self):
1247+
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
1248+
result = values.str.rsplit('_')
1249+
exp = Series([['a', 'b', 'c'], ['c', 'd', 'e'], NA, ['f', 'g', 'h']])
1250+
tm.assert_series_equal(result, exp)
1251+
1252+
# more than one char
1253+
values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
1254+
result = values.str.rsplit('__')
1255+
tm.assert_series_equal(result, exp)
1256+
1257+
result = values.str.rsplit('__', expand=False)
1258+
tm.assert_series_equal(result, exp)
1259+
1260+
# mixed
1261+
mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(),
1262+
None, 1, 2.])
1263+
result = mixed.str.rsplit('_')
1264+
exp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
1265+
NA, NA, NA])
1266+
tm.assert_isinstance(result, Series)
1267+
tm.assert_almost_equal(result, exp)
1268+
1269+
result = mixed.str.rsplit('_', expand=False)
1270+
tm.assert_isinstance(result, Series)
1271+
tm.assert_almost_equal(result, exp)
1272+
1273+
# unicode
1274+
values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
1275+
result = values.str.rsplit('_')
1276+
exp = Series([[u('a'), u('b'), u('c')],
1277+
[u('c'), u('d'), u('e')], NA,
1278+
[u('f'), u('g'), u('h')]])
1279+
tm.assert_series_equal(result, exp)
1280+
1281+
result = values.str.rsplit('_', expand=False)
1282+
tm.assert_series_equal(result, exp)
1283+
1284+
# regex split is not supported by rsplit
1285+
values = Series([u('a,b_c'), u('c_d,e'), NA, u('f,g,h')])
1286+
result = values.str.rsplit('[,_]')
1287+
exp = Series([[u('a,b_c')],
1288+
[u('c_d,e')],
1289+
NA,
1290+
[u('f,g,h')]])
1291+
tm.assert_series_equal(result, exp)
1292+
1293+
# setting max number of splits, make sure it's from reverse
1294+
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
1295+
result = values.str.rsplit('_', n=1)
1296+
exp = Series([['a_b', 'c'], ['c_d', 'e'], NA, ['f_g', 'h']])
1297+
tm.assert_series_equal(result, exp)
1298+
12371299
def test_split_noargs(self):
12381300
# #1859
12391301
s = Series(['Wes McKinney', 'Travis Oliphant'])
1240-
12411302
result = s.str.split()
1242-
self.assertEqual(result[1], ['Travis', 'Oliphant'])
1303+
expected = ['Travis', 'Oliphant']
1304+
self.assertEqual(result[1], expected)
1305+
result = s.str.rsplit()
1306+
self.assertEqual(result[1], expected)
12431307

12441308
def test_split_maxsplit(self):
12451309
# re.split 0, str.split -1
@@ -1348,6 +1412,55 @@ def test_split_to_multiindex_expand(self):
13481412
with tm.assertRaisesRegexp(ValueError, "expand must be"):
13491413
idx.str.split('_', return_type="some_invalid_type")
13501414

1415+
def test_rsplit_to_dataframe_expand(self):
1416+
s = Series(['nosplit', 'alsonosplit'])
1417+
result = s.str.rsplit('_', expand=True)
1418+
exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
1419+
tm.assert_frame_equal(result, exp)
1420+
1421+
s = Series(['some_equal_splits', 'with_no_nans'])
1422+
result = s.str.rsplit('_', expand=True)
1423+
exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'],
1424+
2: ['splits', 'nans']})
1425+
tm.assert_frame_equal(result, exp)
1426+
1427+
result = s.str.rsplit('_', expand=True, n=2)
1428+
exp = DataFrame({0: ['some', 'with'], 1: ['equal', 'no'],
1429+
2: ['splits', 'nans']})
1430+
tm.assert_frame_equal(result, exp)
1431+
1432+
result = s.str.rsplit('_', expand=True, n=1)
1433+
exp = DataFrame({0: ['some_equal', 'with_no'],
1434+
1: ['splits', 'nans']})
1435+
tm.assert_frame_equal(result, exp)
1436+
1437+
s = Series(['some_splits', 'with_index'], index=['preserve', 'me'])
1438+
result = s.str.rsplit('_', expand=True)
1439+
exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
1440+
index=['preserve', 'me'])
1441+
tm.assert_frame_equal(result, exp)
1442+
1443+
def test_rsplit_to_multiindex_expand(self):
1444+
idx = Index(['nosplit', 'alsonosplit'])
1445+
result = idx.str.rsplit('_', expand=True)
1446+
exp = Index([np.array(['nosplit']), np.array(['alsonosplit'])])
1447+
tm.assert_index_equal(result, exp)
1448+
self.assertEqual(result.nlevels, 1)
1449+
1450+
idx = Index(['some_equal_splits', 'with_no_nans'])
1451+
result = idx.str.rsplit('_', expand=True)
1452+
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
1453+
('with', 'no', 'nans')])
1454+
tm.assert_index_equal(result, exp)
1455+
self.assertEqual(result.nlevels, 3)
1456+
1457+
idx = Index(['some_equal_splits', 'with_no_nans'])
1458+
result = idx.str.rsplit('_', expand=True, n=1)
1459+
exp = MultiIndex.from_tuples([('some_equal', 'splits'),
1460+
('with_no', 'nans')])
1461+
tm.assert_index_equal(result, exp)
1462+
self.assertEqual(result.nlevels, 2)
1463+
13511464
def test_partition_series(self):
13521465
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
13531466

0 commit comments

Comments
 (0)