Skip to content

Commit d688a65

Browse files
committed
PERF: improve .str perf for all-string values
1 parent b32f218 commit d688a65

File tree

1 file changed

+30
-11
lines changed

1 file changed

+30
-11
lines changed

pandas/core/strings.py

+30-11
Original file line numberDiff line numberDiff line change
@@ -116,12 +116,12 @@ def _length_check(others):
116116
return n
117117

118118

119-
def _na_map(f, arr, na_result=np.nan, dtype=object):
119+
def _na_map(f, arr, na_result=np.nan, dtype=object, np_f=None):
120120
# should really _check_ for NA
121-
return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
121+
return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype, np_f=np_f)
122122

123123

124-
def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
124+
def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object, np_f=None):
125125
from pandas.core.series import Series
126126

127127
if not len(arr):
@@ -131,6 +131,14 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
131131
arr = arr.values
132132
if not isinstance(arr, np.ndarray):
133133
arr = np.asarray(arr, dtype=object)
134+
135+
# short path for all-string array
136+
if np_f is not None and lib.is_string_array(arr):
137+
try:
138+
return np_f(arr.astype(unicode))
139+
except Exception:
140+
pass
141+
134142
if na_mask:
135143
mask = isnull(arr)
136144
try:
@@ -686,14 +694,17 @@ def str_pad(arr, width, side='left', fillchar=' '):
686694

687695
if side == 'left':
688696
f = lambda x: x.rjust(width, fillchar)
697+
np_f = lambda x: np.core.defchararray.rjust(x, width, fillchar)
689698
elif side == 'right':
690699
f = lambda x: x.ljust(width, fillchar)
700+
np_f = lambda x: np.core.defchararray.ljust(x, width, fillchar)
691701
elif side == 'both':
692702
f = lambda x: x.center(width, fillchar)
703+
np_f = lambda x: np.core.defchararray.center(x, width, fillchar)
693704
else: # pragma: no cover
694705
raise ValueError('Invalid side')
695706

696-
return _na_map(f, arr)
707+
return _na_map(f, arr, np_f=np_f)
697708

698709

699710
def str_split(arr, pat=None, n=None):
@@ -720,17 +731,21 @@ def str_split(arr, pat=None, n=None):
720731
if n is None or n == 0:
721732
n = -1
722733
f = lambda x: x.split(pat, n)
734+
np_f = lambda x: np.core.defchararray.split(x, pat, n)
723735
else:
724736
if len(pat) == 1:
725737
if n is None or n == 0:
726738
n = -1
727739
f = lambda x: x.split(pat, n)
740+
np_f = lambda x: np.core.defchararray.split(x, pat, n)
728741
else:
729742
if n is None or n == -1:
730743
n = 0
731744
regex = re.compile(pat)
732745
f = lambda x: regex.split(x, maxsplit=n)
733-
res = _na_map(f, arr)
746+
# numpy doesn't support regex
747+
np_f = None
748+
res = _na_map(f, arr, np_f=np_f)
734749
return res
735750

736751

@@ -946,7 +961,8 @@ def str_decode(arr, encoding, errors="strict"):
946961
decoded : Series/Index of objects
947962
"""
948963
f = lambda x: x.decode(encoding, errors)
949-
return _na_map(f, arr)
964+
np_f = lambda x: np.core.defchararray.decode(x, encoding, errors)
965+
return _na_map(f, arr, np_f=np_f)
950966

951967

952968
def str_encode(arr, encoding, errors="strict"):
@@ -964,12 +980,13 @@ def str_encode(arr, encoding, errors="strict"):
964980
encoded : Series/Index of objects
965981
"""
966982
f = lambda x: x.encode(encoding, errors)
967-
return _na_map(f, arr)
983+
np_f = lambda x: np.core.defchararray.encode(x, encoding, errors)
984+
return _na_map(f, arr, np_f=np_f)
968985

969986

970-
def _noarg_wrapper(f, docstring=None, **kargs):
987+
def _noarg_wrapper(f, docstring=None, np_f=None, **kargs):
971988
def wrapper(self):
972-
result = _na_map(f, self.series, **kargs)
989+
result = _na_map(f, self.series, np_f=np_f, **kargs)
973990
return self._wrap_result(result)
974991

975992
wrapper.__name__ = f.__name__
@@ -1443,7 +1460,8 @@ def rindex(self, sub, start=0, end=None):
14431460
_shared_docs['swapcase'] = dict(type='be swapcased', method='swapcase')
14441461
lower = _noarg_wrapper(lambda x: x.lower(),
14451462
docstring=_shared_docs['casemethods'] %
1446-
_shared_docs['lower'])
1463+
_shared_docs['lower'],
1464+
np_f=np.core.defchararray.lower)
14471465
upper = _noarg_wrapper(lambda x: x.upper(),
14481466
docstring=_shared_docs['casemethods'] %
14491467
_shared_docs['upper'])
@@ -1452,7 +1470,8 @@ def rindex(self, sub, start=0, end=None):
14521470
_shared_docs['title'])
14531471
capitalize = _noarg_wrapper(lambda x: x.capitalize(),
14541472
docstring=_shared_docs['casemethods'] %
1455-
_shared_docs['capitalize'])
1473+
_shared_docs['capitalize'],
1474+
np_f=np.core.defchararray.capitalize)
14561475
swapcase = _noarg_wrapper(lambda x: x.swapcase(),
14571476
docstring=_shared_docs['casemethods'] %
14581477
_shared_docs['swapcase'])

0 commit comments

Comments
 (0)