Skip to content

Commit 1b4a0f0

Browse files
committed
ENH: Add StringMethods.partition and rpartition
1 parent 0222024 commit 1b4a0f0

File tree

5 files changed

+177
-11
lines changed

5 files changed

+177
-11
lines changed

doc/source/api.rst

+2
Original file line numberDiff line numberDiff line change
@@ -540,10 +540,12 @@ strings and apply several methods to it. These can be acccessed like
540540
Series.str.lstrip
541541
Series.str.match
542542
Series.str.pad
543+
Series.str.partition
543544
Series.str.repeat
544545
Series.str.replace
545546
Series.str.rfind
546547
Series.str.rjust
548+
Series.str.rpartition
547549
Series.str.rstrip
548550
Series.str.slice
549551
Series.str.slice_replace

doc/source/text.rst

+2
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,8 @@ Method Summary
229229
:meth:`~Series.str.strip`,Equivalent to ``str.strip``
230230
:meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip``
231231
:meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip``
232+
:meth:`~Series.str.partition`,Equivalent to ``str.partition``
233+
:meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition``
232234
:meth:`~Series.str.lower`,Equivalent to ``str.lower``
233235
:meth:`~Series.str.upper`,Equivalent to ``str.upper``
234236
:meth:`~Series.str.find`,Equivalent to ``str.find``

doc/source/whatsnew/v0.16.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ Enhancements
2121

2222

2323

24-
24+
- Added ``StringMethods.partition()`` and ``rpartition()``, which behave the same as the standard ``str`` methods (:issue:`9773`)
2525

2626

2727
.. _whatsnew_0161.api:

pandas/core/strings.py

+72-10
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,19 @@ def str_pad(arr, width, side='left', fillchar=' '):
622622
return _na_map(f, arr)
623623

624624

625+
def _return_type_wrapper(f, arr, return_type):
    """
    Map ``f`` over ``arr`` via ``_na_map`` and shape the result as requested.

    Parameters
    ----------
    f : callable
        Function handed to ``_na_map`` together with ``arr``.
    arr : Series-like
        Input values; its ``index`` is reused when a frame is built.
    return_type : {'series', 'frame'}
        'series' returns the ``_na_map`` result untouched; 'frame' wraps each
        mapped element in a Series and collects them into a DataFrame.

    Returns
    -------
    The ``_na_map`` result ('series') or a DataFrame built from it ('frame').

    Raises
    ------
    ValueError
        If ``return_type`` is neither 'series' nor 'frame'.
    """
    if return_type == 'series':
        return _na_map(f, arr)
    if return_type != 'frame':
        raise ValueError("return_type must be {'series', 'frame'}")

    # Function-local imports (presumably to avoid an import cycle between
    # the strings module and the core containers -- TODO confirm).
    from pandas.core.frame import DataFrame
    from pandas.core.series import Series

    mapped = _na_map(f, arr)
    rows = (Series(element) for element in mapped)
    return DataFrame(rows, index=arr.index)
636+
637+
625638
def str_split(arr, pat=None, n=None, return_type='series'):
626639
"""
627640
Split each string (a la re.split) in array by given pattern, propagating NA
@@ -644,11 +657,6 @@ def str_split(arr, pat=None, n=None, return_type='series'):
644657
-------
645658
split : array
646659
"""
647-
from pandas.core.series import Series
648-
from pandas.core.frame import DataFrame
649-
650-
if return_type not in ('series', 'frame'):
651-
raise ValueError("return_type must be {'series', 'frame'}")
652660
if pat is None:
653661
if n is None or n == 0:
654662
n = -1
@@ -663,11 +671,7 @@ def str_split(arr, pat=None, n=None, return_type='series'):
663671
n = 0
664672
regex = re.compile(pat)
665673
f = lambda x: regex.split(x, maxsplit=n)
666-
if return_type == 'frame':
667-
res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index)
668-
else:
669-
res = _na_map(f, arr)
670-
return res
674+
return _return_type_wrapper(f, arr, return_type)
671675

672676

673677
def str_slice(arr, start=None, stop=None, step=None):
@@ -978,6 +982,64 @@ def split(self, pat=None, n=-1, return_type='series'):
978982
result = str_split(self.series, pat, n=n, return_type=return_type)
979983
return self._wrap_result(result)
980984

985+
# Shared docstring template for ``partition`` / ``rpartition``.  It is
# %-formatted with the keys 'side', 'return' and 'also' before being attached
# through ``Appender``, so any literal percent sign added here must be
# written as ``%%``.
_shared_docs['str_partition'] = ("""
Split the string at the %(side)s occurrence of `pat`, and return a 3-tuple
containing the part before the separator, the separator itself, and the
part after the separator.
If the separator is not found, return %(return)s.

Parameters
----------
pat : string, default ' ' (a single space)
    String to split on.
return_type : {'series', 'frame'}, default 'frame'
    If frame, returns a DataFrame (elements are strings)
    If series, returns a Series (elements are 3-tuples of strings).

Returns
-------
split : DataFrame or Series, depending on ``return_type``

See Also
--------
%(also)s

Examples
--------

>>> s = Series(['A_B_C', 'D_E_F', 'X'])
>>> s
0    A_B_C
1    D_E_F
2        X
dtype: object

>>> s.str.partition('_')
   0  1    2
0  A  _  B_C
1  D  _  E_F
2  X

>>> s.str.rpartition('_')
     0  1  2
0  A_B  _  C
1  D_E  _  F
2          X
""")


@Appender(_shared_docs['str_partition'] % {
    'side': 'first',
    'return': ('a 3-tuple containing the string itself, '
               'followed by two empty strings'),
    'also': 'rpartition : Split the string at the last occurrence of `pat`'})
def partition(self, pat=' ', return_type='frame'):
    # The per-element work is delegated to the builtin ``str.partition``;
    # ``_return_type_wrapper`` validates ``return_type`` and builds the
    # series/frame container.
    f = lambda x: x.partition(pat)
    result = _return_type_wrapper(f, self.series, return_type)
    return self._wrap_result(result)


@Appender(_shared_docs['str_partition'] % {
    'side': 'last',
    'return': ('a 3-tuple containing two empty strings, '
               'followed by the string itself'),
    'also': 'partition : Split the string at the first occurrence of `pat`'})
def rpartition(self, pat=' ', return_type='frame'):
    # Mirror image of ``partition``: splits at the last occurrence by
    # delegating to the builtin ``str.rpartition``.
    f = lambda x: x.rpartition(pat)
    result = _return_type_wrapper(f, self.series, return_type)
    return self._wrap_result(result)
1042+
9811043
@copy(str_get)
9821044
def get(self, i):
9831045
result = str_get(self.series, i)

pandas/tests/test_strings.py

+100
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,8 @@ def test_empty_str_methods(self):
664664
tm.assert_series_equal(empty_str, empty.str.pad(42))
665665
tm.assert_series_equal(empty_str, empty.str.center(42))
666666
tm.assert_series_equal(empty_list, empty.str.split('a'))
667+
tm.assert_series_equal(empty_list, empty.str.partition('a', return_type='series'))
668+
tm.assert_series_equal(empty_list, empty.str.rpartition('a', return_type='series'))
667669
tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
668670
tm.assert_series_equal(empty_str, empty.str.slice(step=1))
669671
tm.assert_series_equal(empty_str, empty.str.strip())
@@ -686,6 +688,13 @@ def test_empty_str_methods(self):
686688
tm.assert_series_equal(empty_str, empty.str.capitalize())
687689
tm.assert_series_equal(empty_str, empty.str.swapcase())
688690

691+
def test_empty_str_methods_to_frame(self):
    # With the default return_type ('frame'), partitioning an empty
    # Series is expected to give back an empty DataFrame.
    empty = Series(dtype=str)
    empty_df = DataFrame([])

    tm.assert_frame_equal(empty_df, empty.str.partition('a'))
    tm.assert_frame_equal(empty_df, empty.str.rpartition('a'))
697+
689698
def test_ismethods(self):
690699
values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' ']
691700
str_s = Series(values)
@@ -1174,6 +1183,97 @@ def test_split_to_dataframe(self):
11741183
with tm.assertRaisesRegexp(ValueError, "return_type must be"):
11751184
s.str.split('_', return_type="some_invalid_type")
11761185

1186+
def test_partition_series(self):
    # Each case is (input data, positional args for the accessor call,
    # expected partition result, expected rpartition result).  NA inputs
    # are expected to stay NA in the output.
    cases = [
        # single-character separator
        (['a_b_c', 'c_d_e', NA, 'f_g_h'], ('_',),
         [['a', '_', 'b_c'], ['c', '_', 'd_e'], NA, ['f', '_', 'g_h']],
         [['a_b', '_', 'c'], ['c_d', '_', 'e'], NA, ['f_g', '_', 'h']]),
        # separator longer than one character
        (['a__b__c', 'c__d__e', NA, 'f__g__h'], ('__',),
         [['a', '__', 'b__c'], ['c', '__', 'd__e'], NA,
          ['f', '__', 'g__h']],
         [['a__b', '__', 'c'], ['c__d', '__', 'e'], NA,
          ['f__g', '__', 'h']]),
        # no separator passed: the default pat (a single space) is used
        (['a b c', 'c d e', NA, 'f g h'], (),
         [['a', ' ', 'b c'], ['c', ' ', 'd e'], NA, ['f', ' ', 'g h']],
         [['a b', ' ', 'c'], ['c d', ' ', 'e'], NA, ['f g', ' ', 'h']]),
        # separator absent from every string
        (['abc', 'cde', NA, 'fgh'], ('_',),
         [['abc', '', ''], ['cde', '', ''], NA, ['fgh', '', '']],
         [['', '', 'abc'], ['', '', 'cde'], NA, ['', '', 'fgh']]),
        # unicode input
        ([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')], ('_',),
         [[u('a'), u('_'), u('b_c')], [u('c'), u('_'), u('d_e')],
          NA, [u('f'), u('_'), u('g_h')]],
         [[u('a_b'), u('_'), u('c')], [u('c_d'), u('_'), u('e')],
          NA, [u('f_g'), u('_'), u('h')]]),
    ]

    for data, args, expected_first, expected_last in cases:
        values = Series(data)

        result = values.str.partition(*args, return_type='series')
        tm.assert_series_equal(result, Series(expected_first))

        result = values.str.rpartition(*args, return_type='series')
        tm.assert_series_equal(result, Series(expected_last))

    # element-wise agreement with the builtin str methods
    values = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF'])
    from_accessor = values.str.partition('_', return_type='series').tolist()
    self.assertEqual(from_accessor, [v.partition('_') for v in values])
    from_accessor = values.str.rpartition('_', return_type='series').tolist()
    self.assertEqual(from_accessor, [v.rpartition('_') for v in values])
1246+
1247+
def test_partition_to_dataframe(self):
    values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])

    # Expected frames: one column per element of the 3-tuple, with the
    # NA input row carried through as NaN in every column.
    expected_first = DataFrame({0: ['a', 'c', np.nan, 'f'],
                                1: ['_', '_', np.nan, '_'],
                                2: ['b_c', 'd_e', np.nan, 'g_h']})
    expected_last = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
                               1: ['_', '_', np.nan, '_'],
                               2: ['c', 'e', np.nan, 'h']})

    # First rely on the default return_type, then pass 'frame' explicitly;
    # both spellings must give the same frames.
    for kwargs in ({}, {'return_type': 'frame'}):
        result = values.str.partition('_', **kwargs)
        tm.assert_frame_equal(result, expected_first)

        result = values.str.rpartition('_', **kwargs)
        tm.assert_frame_equal(result, expected_last)

    # Anything other than 'series' / 'frame' is rejected.
    with tm.assertRaisesRegexp(ValueError, "return_type must be"):
        values.str.partition('_', return_type="some_invalid_type")
1276+
11771277
def test_pipe_failures(self):
11781278
# #2119
11791279
s = Series(['A|B|C'])

0 commit comments

Comments
 (0)