Skip to content

Commit 4a2ac33

Browse files
committed
PERF: increase performance of string split when expand=True
1 parent 1e2d44e commit 4a2ac33

File tree

1 file changed

+124
-32
lines changed

1 file changed

+124
-32
lines changed

pandas/core/strings.py

+124-32
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from pandas.compat import zip
44
from pandas.core.common import isnull, _values_from_object, is_bool_dtype
55
import pandas.compat as compat
6-
from pandas.util.decorators import Appender
6+
from pandas.util.decorators import Appender, deprecate_kwarg
77
import re
88
import pandas.lib as lib
99
import warnings
@@ -638,6 +638,26 @@ def str_find(arr, sub, start=0, end=None, side='left'):
638638
return _na_map(f, arr, dtype=int)
639639

640640

641+
def str_index(arr, sub, start=0, end=None, side='left'):
    """
    Look up ``sub`` in each element of ``arr`` using ``str.index`` or
    ``str.rindex``.

    Parameters
    ----------
    arr : object handed to ``_na_map``
        The values to search; each one is expected to expose the
        ``index`` / ``rindex`` string methods.
    sub : str
        Substring being searched.
    start : int, default 0
        Left edge index forwarded to the string method.
    end : int, default None
        Right edge index. When None, only ``start`` is forwarded.
    side : {'left', 'right'}, default 'left'
        'left' selects ``index``, 'right' selects ``rindex``.

    Returns
    -------
    Whatever ``_na_map`` returns for the per-element lookup, requested
    with ``dtype=int``.

    Raises
    ------
    TypeError
        If ``sub`` is not a string.
    ValueError
        If ``side`` is neither 'left' nor 'right'.
    """
    if not isinstance(sub, compat.string_types):
        raise TypeError(
            'expected a string object, not {0}'.format(type(sub).__name__))

    # Resolve the name of the string method to call for the requested side.
    method = ('index' if side == 'left'
              else 'rindex' if side == 'right'
              else None)
    if method is None:  # pragma: no cover
        raise ValueError('Invalid side')

    # Only forward ``end`` when the caller actually supplied one.
    bounds = (start,) if end is None else (start, end)

    def f(x):
        return getattr(x, method)(sub, *bounds)

    return _na_map(f, arr, dtype=int)
659+
660+
641661
def str_pad(arr, width, side='left', fillchar=' '):
642662
"""
643663
Pad strings in the Series/Index with an additional character to
@@ -676,7 +696,7 @@ def str_pad(arr, width, side='left', fillchar=' '):
676696
return _na_map(f, arr)
677697

678698

679-
def str_split(arr, pat=None, n=None, return_type='series'):
699+
def str_split(arr, pat=None, n=None):
680700
"""
681701
Split each string (a la re.split) in the Series/Index by given
682702
pattern, propagating NA values. Equivalent to :meth:`str.split`.
@@ -685,29 +705,17 @@ def str_split(arr, pat=None, n=None, return_type='series'):
685705
----------
686706
pat : string, default None
687707
String or regular expression to split on. If None, splits on whitespace
688-
n : int, default None (all)
689-
return_type : {'series', 'index', 'frame'}, default 'series'
690-
If frame, returns a DataFrame (elements are strings)
691-
If series or index, returns the same type as the original object
692-
(elements are lists of strings).
693-
694-
Notes
695-
-----
696-
Both 0 and -1 will be interpreted as return all splits
708+
n : int, default -1 (all)
709+
None, 0 and -1 will be interpreted as return all splits
710+
expand : bool, default False
711+
* If True, return DataFrame/MultiIndex expanding dimensionality.
712+
* If False, return Series/Index.
713+
return_type : deprecated, use `expand`
697714
698715
Returns
699716
-------
700-
split : Series/Index of objects or DataFrame
717+
split : Series/Index or DataFrame/MultiIndex of objects
701718
"""
702-
from pandas.core.series import Series
703-
from pandas.core.frame import DataFrame
704-
from pandas.core.index import Index
705-
706-
if return_type not in ('series', 'index', 'frame'):
707-
raise ValueError("return_type must be {'series', 'index', 'frame'}")
708-
if return_type == 'frame' and isinstance(arr, Index):
709-
raise ValueError("return_type='frame' is not supported for string "
710-
"methods on Index")
711719
if pat is None:
712720
if n is None or n == 0:
713721
n = -1
@@ -722,10 +730,7 @@ def str_split(arr, pat=None, n=None, return_type='series'):
722730
n = 0
723731
regex = re.compile(pat)
724732
f = lambda x: regex.split(x, maxsplit=n)
725-
if return_type == 'frame':
726-
res = DataFrame((x for x in _na_map(f, arr)), index=arr.index)
727-
else:
728-
res = _na_map(f, arr)
733+
res = _na_map(f, arr)
729734
return res
730735

731736

@@ -808,7 +813,7 @@ def str_strip(arr, to_strip=None, side='both'):
808813

809814

810815
def str_wrap(arr, width, **kwargs):
811-
"""
816+
r"""
812817
Wrap long strings in the Series/Index to be formatted in
813818
paragraphs with length less than a given width.
814819
@@ -870,6 +875,44 @@ def str_wrap(arr, width, **kwargs):
870875
return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr)
871876

872877

878+
def str_translate(arr, table, deletechars=None):
    """
    Map all characters in the string through the given mapping table.
    Equivalent to standard :meth:`str.translate`. The optional argument
    ``deletechars`` can only be used on python 2; on python 3 character
    deletion has to be expressed through ``table`` itself.

    Parameters
    ----------
    table : dict (python 3), str or None (python 2)
        In python 3, table is a mapping of Unicode ordinals to Unicode
        ordinals, strings, or None. Unmapped characters are left untouched
        and characters mapped to None are deleted. :meth:`str.maketrans`
        is a helper function for making translation tables.
        In python 2, table is either a string of length 256 or None. If
        the table argument is None, no translation is applied and the
        operation simply removes the characters in deletechars.
        :func:`string.maketrans` is a helper function for making
        translation tables.
    deletechars : str, optional (python 2)
        A string of characters to delete. This argument is only valid
        in python 2.

    Returns
    -------
    translated : Series/Index of objects

    Raises
    ------
    ValueError
        If ``deletechars`` is given while running on python 3.
    """
    if deletechars is not None:
        from pandas import compat
        # The two-argument form of ``translate`` is rejected on python 3.
        if compat.PY3:
            raise ValueError("deletechars is not a valid argument for "
                             "str.translate in python 3. You should simply "
                             "specify character deletions in the table argument")

        def f(x):
            return x.translate(table, deletechars)
    else:
        def f(x):
            return x.translate(table)

    return _na_map(f, arr)
914+
915+
873916
def str_get(arr, i):
874917
"""
875918
Extract element from lists, tuples, or strings in each element in the
@@ -1001,6 +1044,7 @@ def __iter__(self):
10011044
g = self.get(i)
10021045

10031046
def _wrap_result(self, result, **kwargs):
1047+
10041048
# leave as it is to keep extract and get_dummies results
10051049
# can be merged to _wrap_result_expand in v0.17
10061050
from pandas.core.series import Series
@@ -1024,7 +1068,10 @@ def _wrap_result(self, result, **kwargs):
10241068
return DataFrame(result, index=self.series.index)
10251069

10261070
def _wrap_result_expand(self, result, expand=False):
1027-
from pandas.core.index import Index
1071+
if not isinstance(expand, bool):
1072+
raise ValueError("expand must be True or False")
1073+
1074+
from pandas.core.index import Index, MultiIndex
10281075
if not hasattr(result, 'ndim'):
10291076
return result
10301077

@@ -1037,7 +1084,9 @@ def _wrap_result_expand(self, result, expand=False):
10371084

10381085
if expand:
10391086
result = list(result)
1040-
return Index(result, name=name)
1087+
return MultiIndex.from_tuples(result, names=name)
1088+
else:
1089+
return Index(result, name=name)
10411090
else:
10421091
index = self.series.index
10431092
if expand:
@@ -1054,10 +1103,12 @@ def cat(self, others=None, sep=None, na_rep=None):
10541103
result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
10551104
return self._wrap_result(result)
10561105

1106+
# The deprecated ``return_type`` keyword is translated to ``expand``.
# The old API accepted 'series', 'index' and 'frame'; 'series' and 'index'
# both returned the same type as the original object, so both map to
# expand=False. Without the 'index' entry the raw string would presumably be
# forwarded as ``expand`` (see ``deprecate_kwarg``) and be rejected by the
# bool check in ``_wrap_result_expand``.
@deprecate_kwarg('return_type', 'expand',
                 mapping={'series': False, 'index': False, 'frame': True})
@copy(str_split)
def split(self, pat=None, n=-1, expand=False):
    # Split every element first, then let _wrap_result_expand decide,
    # based on ``expand``, how the pieces are wrapped for the caller.
    result = str_split(self.series, pat, n=n)
    return self._wrap_result_expand(result, expand=expand)
10611112

10621113
_shared_docs['str_partition'] = ("""
10631114
Split the string at the %(side)s occurrence of `sep`, and return 3 elements
@@ -1071,7 +1122,7 @@ def split(self, pat=None, n=-1, return_type='series'):
10711122
String to split on.
10721123
expand : bool, default True
10731124
* If True, return DataFrame/MultiIndex expanding dimensionality.
1074-
* If False, return Series/Index
1125+
* If False, return Series/Index.
10751126
10761127
Returns
10771128
-------
@@ -1261,6 +1312,11 @@ def get_dummies(self, sep='|'):
12611312
result = str_get_dummies(self.series, sep)
12621313
return self._wrap_result(result)
12631314

1315+
@copy(str_translate)
def translate(self, table, deletechars=None):
    # Delegate to the module-level str_translate on the wrapped series and
    # pass its output through _wrap_result before returning it.
    return self._wrap_result(
        str_translate(self.series, table, deletechars))
1319+
12641320
count = _pat_wrapper(str_count, flags=True)
12651321
startswith = _pat_wrapper(str_startswith, na=True)
12661322
endswith = _pat_wrapper(str_endswith, na=True)
@@ -1325,6 +1381,42 @@ def normalize(self, form):
13251381
result = _na_map(f, self.series)
13261382
return self._wrap_result(result)
13271383

1384+
# Docstring template used by the ``index`` and ``rindex`` methods: the
# %(side)s, %(similar)s, %(method)s and %(also)s placeholders are filled in
# with a dict via ``%`` formatting where the template is handed to Appender.
_shared_docs['index'] = ("""
1385+
Return %(side)s indexes in each strings where the substring is
1386+
fully contained between [start:end]. This is the same as ``str.%(similar)s``
1387+
except instead of returning -1, it raises a ValueError when the substring
1388+
is not found. Equivalent to standard ``str.%(method)s``.
1389+
1390+
Parameters
1391+
----------
1392+
sub : str
1393+
Substring being searched
1394+
start : int
1395+
Left edge index
1396+
end : int
1397+
Right edge index
1398+
1399+
Returns
1400+
-------
1401+
found : Series/Index of objects
1402+
1403+
See Also
1404+
--------
1405+
%(also)s
1406+
""")
1407+
1408+
@Appender(_shared_docs['index'] % dict(
    side='lowest', similar='find', method='index',
    also='rindex : Return highest indexes in each strings'))
def index(self, sub, start=0, end=None):
    # Left-hand search: str_index is called with side='left' on the wrapped
    # series and its output is passed through _wrap_result.
    return self._wrap_result(
        str_index(self.series, sub, start=start, end=end, side='left'))
1413+
1414+
@Appender(_shared_docs['index'] % dict(
    side='highest', similar='rfind', method='rindex',
    also='index : Return lowest indexes in each strings'))
def rindex(self, sub, start=0, end=None):
    # Right-hand search: str_index is called with side='right' on the wrapped
    # series and its output is passed through _wrap_result.
    return self._wrap_result(
        str_index(self.series, sub, start=start, end=end, side='right'))
1419+
13281420
_shared_docs['len'] = ("""
13291421
Compute length of each string in the Series/Index.
13301422

0 commit comments

Comments
 (0)