Skip to content

Commit b6b05a9

Browse files
committed
ENH: Add StringMethods.partition and rpartition
1 parent 8f0f417 commit b6b05a9

File tree

5 files changed

+216
-0
lines changed

5 files changed

+216
-0
lines changed

doc/source/api.rst

+2
Original file line numberDiff line numberDiff line change
@@ -541,10 +541,12 @@ strings and apply several methods to it. These can be accessed like
541541
Series.str.lstrip
542542
Series.str.match
543543
Series.str.pad
544+
Series.str.partition
544545
Series.str.repeat
545546
Series.str.replace
546547
Series.str.rfind
547548
Series.str.rjust
549+
Series.str.rpartition
548550
Series.str.rstrip
549551
Series.str.slice
550552
Series.str.slice_replace

doc/source/text.rst

+2
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,8 @@ Method Summary
262262
:meth:`~Series.str.strip`,Equivalent to ``str.strip``
263263
:meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip``
264264
:meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip``
265+
:meth:`~Series.str.partition`,Equivalent to ``str.partition``
266+
:meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition``
265267
:meth:`~Series.str.lower`,Equivalent to ``str.lower``
266268
:meth:`~Series.str.upper`,Equivalent to ``str.upper``
267269
:meth:`~Series.str.find`,Equivalent to ``str.find``

doc/source/whatsnew/v0.16.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Enhancements
2626
- Added ``StringMethods.capitalize()`` and ``swapcase()``, which behave the same as the standard ``str`` methods (:issue:`9766`)
2727
- ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`)
2828
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
29+
- Added ``StringMethods.partition()`` and ``rpartition()``, which behave the same as the standard ``str`` methods (:issue:`9773`)
2930
- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
3031

3132
The ``.str`` accessor is now available for both ``Series`` and ``Index``.

pandas/core/strings.py

+90
Original file line numberDiff line numberDiff line change
@@ -962,6 +962,8 @@ def __iter__(self):
962962
g = self.get(i)
963963

964964
def _wrap_result(self, result):
965+
# leave as it is to keep extract and get_dummies results
966+
# can be merged to _wrap_result_expand in v0.17
965967
from pandas.core.series import Series
966968
from pandas.core.frame import DataFrame
967969
from pandas.core.index import Index
@@ -982,6 +984,34 @@ def _wrap_result(self, result):
982984
assert result.ndim < 3
983985
return DataFrame(result, index=self.series.index)
984986

987+
def _wrap_result_expand(self, result, expand=False):
    """
    Wrap a raw string-method result in a container matching the caller.

    Parameters
    ----------
    result : object
        Raw result of a string operation. Anything without an ``ndim``
        attribute is returned unchanged.
    expand : bool, default False
        If True, each element of ``result`` is spread out: the elements
        are passed as a list to ``Index`` when the accessor wraps an
        Index, or turned into one row each of the expanded-dimension
        container otherwise. If False, ``result`` is wrapped as-is.

    Returns
    -------
    ``result`` itself (no ``ndim``, or a boolean result on an Index),
    an ``Index``, or whatever ``self.series._constructor`` /
    ``self.series._constructor_expanddim`` build.
    """
    from pandas.core.index import Index

    if not hasattr(result, 'ndim'):
        return result

    if isinstance(self.series, Index):
        name = getattr(result, 'name', None)
        # if result is a boolean np.array, return the np.array
        # instead of wrapping it into a boolean Index (GH 8875)
        if hasattr(result, 'dtype') and is_bool_dtype(result):
            return result

        values = list(result) if expand else result
        return Index(values, name=name)

    index = self.series.index
    if not expand:
        name = getattr(result, 'name', None)
        return self.series._constructor(result, name=name, index=index)

    # One row object per element, assembled into the higher-dimensional
    # container that shares the original index.
    make_row = self.series._constructor
    make_expanded = self.series._constructor_expanddim
    rows = [make_row(x) for x in result]
    return make_expanded(rows, index=index)
1014+
9851015
@copy(str_cat)
9861016
def cat(self, others=None, sep=None, na_rep=None):
9871017
result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
@@ -992,6 +1022,65 @@ def split(self, pat=None, n=-1, return_type='series'):
9921022
result = str_split(self.series, pat, n=n, return_type=return_type)
9931023
return self._wrap_result(result)
9941024

1025+
# Docstring template shared by ``partition`` and ``rpartition``; the
# ``%(side)s``, ``%(return)s`` and ``%(also)s`` placeholders are filled in
# where each method is decorated.
_shared_docs['str_partition'] = ("""
Split the string at the %(side)s occurrence of the separator ``pat``, and
return a 3-tuple containing the part before the separator, the separator
itself, and the part after the separator.
If the separator is not found, return %(return)s.

Parameters
----------
pat : string, default ' '
    String to split on.
expand : bool, default True
    * If True, return DataFrame/MultiIndex expanding dimensionality.
    * If False, return Series/Index.

Returns
-------
split : DataFrame/MultiIndex or Series/Index of objects

See Also
--------
%(also)s

Examples
--------

>>> s = Series(['A_B_C', 'D_E_F', 'X'])
>>> s
0    A_B_C
1    D_E_F
2        X
dtype: object

>>> s.str.partition('_')
   0  1    2
0  A  _  B_C
1  D  _  E_F
2  X

>>> s.str.rpartition('_')
     0  1  2
0  A_B  _  C
1  D_E  _  F
2          X
""")
1067+
@Appender(_shared_docs['str_partition'] % {
    'side': 'first',
    'return': 'a 3-tuple containing the string itself, followed by two '
              'empty strings',
    'also': 'rpartition : Split the string at the last occurrence of pat'})
def partition(self, pat=' ', expand=True):
    # The user-facing docstring is attached by the ``Appender`` decorator
    # above, so only implementation notes live here.
    # ``str.partition`` is applied to each element through ``_na_map``;
    # ``_wrap_result_expand`` then either spreads the three parts into
    # separate columns/levels (expand=True) or keeps one element per row.
    f = lambda x: x.partition(pat)
    result = _na_map(f, self.series)
    return self._wrap_result_expand(result, expand=expand)
1075+
1076+
@Appender(_shared_docs['str_partition'] % {
    'side': 'last',
    'return': 'a 3-tuple containing two empty strings, followed by the '
              'string itself',
    'also': 'partition : Split the string at the first occurrence of pat'})
def rpartition(self, pat=' ', expand=True):
    # The user-facing docstring is attached by the ``Appender`` decorator
    # above, so only implementation notes live here.
    # ``str.rpartition`` is applied to each element through ``_na_map``;
    # ``_wrap_result_expand`` then either spreads the three parts into
    # separate columns/levels (expand=True) or keeps one element per row.
    f = lambda x: x.rpartition(pat)
    result = _na_map(f, self.series)
    return self._wrap_result_expand(result, expand=expand)
1083+
9951084
@copy(str_get)
9961085
def get(self, i):
9971086
result = str_get(self.series, i)
@@ -1124,6 +1213,7 @@ def get_dummies(self, sep='|'):
11241213
startswith = _pat_wrapper(str_startswith, na=True)
11251214
endswith = _pat_wrapper(str_endswith, na=True)
11261215
findall = _pat_wrapper(str_findall, flags=True)
11271217
extract = _pat_wrapper(str_extract, flags=True)
11281218

11291219
_shared_docs['find'] = ("""

pandas/tests/test_strings.py

+121
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,8 @@ def test_empty_str_methods(self):
664664
tm.assert_series_equal(empty_str, empty.str.pad(42))
665665
tm.assert_series_equal(empty_str, empty.str.center(42))
666666
tm.assert_series_equal(empty_list, empty.str.split('a'))
667+
tm.assert_series_equal(empty_list, empty.str.partition('a', expand=False))
668+
tm.assert_series_equal(empty_list, empty.str.rpartition('a', expand=False))
667669
tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
668670
tm.assert_series_equal(empty_str, empty.str.slice(step=1))
669671
tm.assert_series_equal(empty_str, empty.str.strip())
@@ -686,6 +688,12 @@ def test_empty_str_methods(self):
686688
tm.assert_series_equal(empty_str, empty.str.capitalize())
687689
tm.assert_series_equal(empty_str, empty.str.swapcase())
688690

691+
def test_empty_str_methods_to_frame(self):
    # partition/rpartition are called with their default ``expand``, so
    # the result for an empty Series is compared against an empty
    # DataFrame.
    empty = Series(dtype=str)
    empty_df = DataFrame([])
    tm.assert_frame_equal(empty_df, empty.str.partition('a'))
    tm.assert_frame_equal(empty_df, empty.str.rpartition('a'))
696+
689697
def test_ismethods(self):
690698
values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' ']
691699
str_s = Series(values)
@@ -1174,6 +1182,119 @@ def test_split_to_dataframe(self):
11741182
with tm.assertRaisesRegexp(ValueError, "return_type must be"):
11751183
s.str.split('_', return_type="some_invalid_type")
11761184

1185+
def test_partition_series(self):
    # With expand=False every element stays a single 3-part value and
    # missing entries remain NA.
    ser = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])

    got = ser.str.partition('_', expand=False)
    expected = Series([['a', '_', 'b_c'], ['c', '_', 'd_e'], NA,
                       ['f', '_', 'g_h']])
    tm.assert_series_equal(got, expected)

    got = ser.str.rpartition('_', expand=False)
    expected = Series([['a_b', '_', 'c'], ['c_d', '_', 'e'], NA,
                       ['f_g', '_', 'h']])
    tm.assert_series_equal(got, expected)

    # separator longer than one character
    ser = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
    got = ser.str.partition('__', expand=False)
    expected = Series([['a', '__', 'b__c'], ['c', '__', 'd__e'], NA,
                       ['f', '__', 'g__h']])
    tm.assert_series_equal(got, expected)

    got = ser.str.rpartition('__', expand=False)
    expected = Series([['a__b', '__', 'c'], ['c__d', '__', 'e'], NA,
                       ['f__g', '__', 'h']])
    tm.assert_series_equal(got, expected)

    # no separator given: the default (a single space) is used
    ser = Series(['a b c', 'c d e', NA, 'f g h'])
    got = ser.str.partition(expand=False)
    expected = Series([['a', ' ', 'b c'], ['c', ' ', 'd e'], NA,
                       ['f', ' ', 'g h']])
    tm.assert_series_equal(got, expected)

    got = ser.str.rpartition(expand=False)
    expected = Series([['a b', ' ', 'c'], ['c d', ' ', 'e'], NA,
                       ['f g', ' ', 'h']])
    tm.assert_series_equal(got, expected)

    # separator absent from every string, so nothing is split
    ser = Series(['abc', 'cde', NA, 'fgh'])
    got = ser.str.partition('_', expand=False)
    expected = Series([['abc', '', ''], ['cde', '', ''], NA,
                       ['fgh', '', '']])
    tm.assert_series_equal(got, expected)

    got = ser.str.rpartition('_', expand=False)
    expected = Series([['', '', 'abc'], ['', '', 'cde'], NA,
                       ['', '', 'fgh']])
    tm.assert_series_equal(got, expected)

    # unicode
    ser = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])

    got = ser.str.partition('_', expand=False)
    expected = Series([[u('a'), u('_'), u('b_c')],
                       [u('c'), u('_'), u('d_e')],
                       NA,
                       [u('f'), u('_'), u('g_h')]])
    tm.assert_series_equal(got, expected)

    got = ser.str.rpartition('_', expand=False)
    expected = Series([[u('a_b'), u('_'), u('c')],
                       [u('c_d'), u('_'), u('e')],
                       NA,
                       [u('f_g'), u('_'), u('h')]])
    tm.assert_series_equal(got, expected)

    # results must agree with the built-in str methods
    ser = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF'])
    got = ser.str.partition('_', expand=False).tolist()
    self.assertEqual(got, [v.partition('_') for v in ser])
    got = ser.str.rpartition('_', expand=False).tolist()
    self.assertEqual(got, [v.rpartition('_') for v in ser])
1245+
1246+
def test_partition_index(self):
    idx = Index(['a_b_c', 'c_d_e', 'f_g_h'])

    # expand=False: the result is checked to have a single level
    res = idx.str.partition('_', expand=False)
    expected = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'),
                               ('f', '_', 'g_h')]))
    tm.assert_index_equal(res, expected)
    self.assertEqual(res.nlevels, 1)

    res = idx.str.rpartition('_', expand=False)
    expected = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'),
                               ('f_g', '_', 'h')]))
    tm.assert_index_equal(res, expected)
    self.assertEqual(res.nlevels, 1)

    # default expand: the result is checked to be a 3-level MultiIndex
    res = idx.str.partition('_')
    expected = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'),
                      ('f', '_', 'g_h')])
    tm.assert_index_equal(res, expected)
    self.assertTrue(isinstance(res, MultiIndex))
    self.assertEqual(res.nlevels, 3)

    res = idx.str.rpartition('_')
    expected = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'),
                      ('f_g', '_', 'h')])
    tm.assert_index_equal(res, expected)
    self.assertTrue(isinstance(res, MultiIndex))
    self.assertEqual(res.nlevels, 3)
1270+
1271+
def test_partition_to_dataframe(self):
    # The same expectations are checked twice: first relying on the
    # default ``expand`` and then passing ``expand=True`` explicitly.
    for kwargs in ({}, {'expand': True}):
        ser = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])

        res = ser.str.partition('_', **kwargs)
        expected = DataFrame({0: ['a', 'c', np.nan, 'f'],
                              1: ['_', '_', np.nan, '_'],
                              2: ['b_c', 'd_e', np.nan, 'g_h']})
        tm.assert_frame_equal(res, expected)

        res = ser.str.rpartition('_', **kwargs)
        expected = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
                              1: ['_', '_', np.nan, '_'],
                              2: ['c', 'e', np.nan, 'h']})
        tm.assert_frame_equal(res, expected)
1297+
11771298
def test_pipe_failures(self):
11781299
# #2119
11791300
s = Series(['A|B|C'])

0 commit comments

Comments
 (0)