Skip to content

Commit 1e2d44e

Browse files
committed
# This is a combination of 2 commits.
# The first commit's message is:
PERF: increase performance of str_split when returning a frame
# The 2nd commit message will be skipped:
# PERF: new str.split performance improvement, handles NaN
1 parent 9b04bd0 commit 1e2d44e

File tree

1 file changed

+34
-127
lines changed

1 file changed

+34
-127
lines changed

pandas/core/strings.py

+34-127
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from pandas.compat import zip
44
from pandas.core.common import isnull, _values_from_object, is_bool_dtype
55
import pandas.compat as compat
6-
from pandas.util.decorators import Appender, deprecate_kwarg
6+
from pandas.util.decorators import Appender
77
import re
88
import pandas.lib as lib
99
import warnings
@@ -638,26 +638,6 @@ def str_find(arr, sub, start=0, end=None, side='left'):
638638
return _na_map(f, arr, dtype=int)
639639

640640

641-
def str_index(arr, sub, start=0, end=None, side='left'):
642-
if not isinstance(sub, compat.string_types):
643-
msg = 'expected a string object, not {0}'
644-
raise TypeError(msg.format(type(sub).__name__))
645-
646-
if side == 'left':
647-
method = 'index'
648-
elif side == 'right':
649-
method = 'rindex'
650-
else: # pragma: no cover
651-
raise ValueError('Invalid side')
652-
653-
if end is None:
654-
f = lambda x: getattr(x, method)(sub, start)
655-
else:
656-
f = lambda x: getattr(x, method)(sub, start, end)
657-
658-
return _na_map(f, arr, dtype=int)
659-
660-
661641
def str_pad(arr, width, side='left', fillchar=' '):
662642
"""
663643
Pad strings in the Series/Index with an additional character to
@@ -696,7 +676,7 @@ def str_pad(arr, width, side='left', fillchar=' '):
696676
return _na_map(f, arr)
697677

698678

699-
def str_split(arr, pat=None, n=None):
679+
def str_split(arr, pat=None, n=None, return_type='series'):
700680
"""
701681
Split each string (a la re.split) in the Series/Index by given
702682
pattern, propagating NA values. Equivalent to :meth:`str.split`.
@@ -705,17 +685,29 @@ def str_split(arr, pat=None, n=None):
705685
----------
706686
pat : string, default None
707687
String or regular expression to split on. If None, splits on whitespace
708-
n : int, default -1 (all)
709-
None, 0 and -1 will be interpreted as return all splits
710-
expand : bool, default False
711-
* If True, return DataFrame/MultiIndex expanding dimensionality.
712-
* If False, return Series/Index.
713-
return_type : deprecated, use `expand`
688+
n : int, default None (all)
689+
return_type : {'series', 'index', 'frame'}, default 'series'
690+
If frame, returns a DataFrame (elements are strings)
691+
If series or index, returns the same type as the original object
692+
(elements are lists of strings).
693+
694+
Notes
695+
-----
696+
Both 0 and -1 will be interpreted as "return all splits".
714697
715698
Returns
716699
-------
717-
split : Series/Index or DataFrame/MultiIndex of objects
700+
split : Series/Index of objects or DataFrame
718701
"""
702+
from pandas.core.series import Series
703+
from pandas.core.frame import DataFrame
704+
from pandas.core.index import Index
705+
706+
if return_type not in ('series', 'index', 'frame'):
707+
raise ValueError("return_type must be {'series', 'index', 'frame'}")
708+
if return_type == 'frame' and isinstance(arr, Index):
709+
raise ValueError("return_type='frame' is not supported for string "
710+
"methods on Index")
719711
if pat is None:
720712
if n is None or n == 0:
721713
n = -1
@@ -730,7 +722,10 @@ def str_split(arr, pat=None, n=None):
730722
n = 0
731723
regex = re.compile(pat)
732724
f = lambda x: regex.split(x, maxsplit=n)
733-
res = _na_map(f, arr)
725+
if return_type == 'frame':
726+
res = DataFrame((x for x in _na_map(f, arr)), index=arr.index)
727+
else:
728+
res = _na_map(f, arr)
734729
return res
735730

736731

@@ -813,7 +808,7 @@ def str_strip(arr, to_strip=None, side='both'):
813808

814809

815810
def str_wrap(arr, width, **kwargs):
816-
r"""
811+
"""
817812
Wrap long strings in the Series/Index to be formatted in
818813
paragraphs with length less than a given width.
819814
@@ -875,44 +870,6 @@ def str_wrap(arr, width, **kwargs):
875870
return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr)
876871

877872

878-
def str_translate(arr, table, deletechars=None):
879-
"""
880-
Map all characters in the string through the given mapping table.
881-
Equivalent to standard :meth:`str.translate`. Note that the optional
882-
argument deletechars is only valid if you are using python 2. For python 3,
883-
character deletion should be specified via the table argument.
884-
885-
Parameters
886-
----------
887-
table : dict (python 3), str or None (python 2)
888-
In python 3, table is a mapping of Unicode ordinals to Unicode ordinals,
889-
strings, or None. Unmapped characters are left untouched. Characters
890-
mapped to None are deleted. :meth:`str.maketrans` is a helper function
891-
for making translation tables.
892-
In python 2, table is either a string of length 256 or None. If the
893-
table argument is None, no translation is applied and the operation
894-
simply removes the characters in deletechars. :func:`string.maketrans`
895-
is a helper function for making translation tables.
896-
deletechars : str, optional (python 2)
897-
A string of characters to delete. This argument is only valid
898-
in python 2.
899-
900-
Returns
901-
-------
902-
translated : Series/Index of objects
903-
"""
904-
if deletechars is None:
905-
f = lambda x: x.translate(table)
906-
else:
907-
from pandas import compat
908-
if compat.PY3:
909-
raise ValueError("deletechars is not a valid argument for "
910-
"str.translate in python 3. You should simply "
911-
"specify character deletions in the table argument")
912-
f = lambda x: x.translate(table, deletechars)
913-
return _na_map(f, arr)
914-
915-
916873
def str_get(arr, i):
917874
"""
918875
Extract element from lists, tuples, or strings in each element in the
@@ -1044,7 +1001,6 @@ def __iter__(self):
10441001
g = self.get(i)
10451002

10461003
def _wrap_result(self, result, **kwargs):
1047-
10481004
# leave as it is to keep extract and get_dummies results
10491005
# can be merged to _wrap_result_expand in v0.17
10501006
from pandas.core.series import Series
@@ -1068,10 +1024,7 @@ def _wrap_result(self, result, **kwargs):
10681024
return DataFrame(result, index=self.series.index)
10691025

10701026
def _wrap_result_expand(self, result, expand=False):
1071-
if not isinstance(expand, bool):
1072-
raise ValueError("expand must be True or False")
1073-
1074-
from pandas.core.index import Index, MultiIndex
1027+
from pandas.core.index import Index
10751028
if not hasattr(result, 'ndim'):
10761029
return result
10771030

@@ -1084,16 +1037,13 @@ def _wrap_result_expand(self, result, expand=False):
10841037

10851038
if expand:
10861039
result = list(result)
1087-
return MultiIndex.from_tuples(result, names=name)
1088-
else:
1089-
return Index(result, name=name)
1040+
return Index(result, name=name)
10901041
else:
10911042
index = self.series.index
10921043
if expand:
1093-
cons_row = self.series._constructor
10941044
cons = self.series._constructor_expanddim
1095-
data = [cons_row(x) for x in result]
1096-
return cons(data, index=index)
1045+
data = [x if (x is not np.nan) else [None] for x in result]
1046+
return cons(data, index=index).fillna(np.nan)
10971047
else:
10981048
name = getattr(result, 'name', None)
10991049
cons = self.series._constructor
@@ -1104,12 +1054,10 @@ def cat(self, others=None, sep=None, na_rep=None):
11041054
result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
11051055
return self._wrap_result(result)
11061056

1107-
@deprecate_kwarg('return_type', 'expand',
1108-
mapping={'series': False, 'frame': True})
11091057
@copy(str_split)
1110-
def split(self, pat=None, n=-1, expand=False):
1111-
result = str_split(self.series, pat, n=n)
1112-
return self._wrap_result_expand(result, expand=expand)
1058+
def split(self, pat=None, n=-1, return_type='series'):
1059+
result = str_split(self.series, pat, n=n, return_type=return_type)
1060+
return self._wrap_result(result)
11131061

11141062
_shared_docs['str_partition'] = ("""
11151063
Split the string at the %(side)s occurrence of `sep`, and return 3 elements
@@ -1123,7 +1071,7 @@ def split(self, pat=None, n=-1, expand=False):
11231071
String to split on.
11241072
expand : bool, default True
11251073
* If True, return DataFrame/MultiIndex expanding dimensionality.
1126-
* If False, return Series/Index.
1074+
* If False, return Series/Index
11271075
11281076
Returns
11291077
-------
@@ -1313,11 +1261,6 @@ def get_dummies(self, sep='|'):
13131261
result = str_get_dummies(self.series, sep)
13141262
return self._wrap_result(result)
13151263

1316-
@copy(str_translate)
1317-
def translate(self, table, deletechars=None):
1318-
result = str_translate(self.series, table, deletechars)
1319-
return self._wrap_result(result)
1320-
13211264
count = _pat_wrapper(str_count, flags=True)
13221265
startswith = _pat_wrapper(str_startswith, na=True)
13231266
endswith = _pat_wrapper(str_endswith, na=True)
@@ -1382,42 +1325,6 @@ def normalize(self, form):
13821325
result = _na_map(f, self.series)
13831326
return self._wrap_result(result)
13841327

1385-
_shared_docs['index'] = ("""
1386-
Return %(side)s indexes in each string where the substring is
1387-
fully contained between [start:end]. This is the same as ``str.%(similar)s``
1388-
except instead of returning -1, it raises a ValueError when the substring
1389-
is not found. Equivalent to standard ``str.%(method)s``.
1390-
1391-
Parameters
1392-
----------
1393-
sub : str
1394-
Substring being searched
1395-
start : int
1396-
Left edge index
1397-
end : int
1398-
Right edge index
1399-
1400-
Returns
1401-
-------
1402-
found : Series/Index of objects
1403-
1404-
See Also
1405-
--------
1406-
%(also)s
1407-
""")
1408-
1409-
@Appender(_shared_docs['index'] % dict(side='lowest', similar='find', method='index',
1410-
also='rindex : Return highest indexes in each strings'))
1411-
def index(self, sub, start=0, end=None):
1412-
result = str_index(self.series, sub, start=start, end=end, side='left')
1413-
return self._wrap_result(result)
1414-
1415-
@Appender(_shared_docs['index'] % dict(side='highest', similar='rfind', method='rindex',
1416-
also='index : Return lowest indexes in each strings'))
1417-
def rindex(self, sub, start=0, end=None):
1418-
result = str_index(self.series, sub, start=start, end=end, side='right')
1419-
return self._wrap_result(result)
1420-
14211328
_shared_docs['len'] = ("""
14221329
Compute length of each string in the Series/Index.
14231330

0 commit comments

Comments
 (0)