Skip to content

Commit 99910a3

Browse files
committed
Merge pull request #5944 from unutbu/str-extract
ENH: Improve perf of str_extract
2 parents 2081fcc + 6d9d67c commit 99910a3

File tree

2 files changed

+15
-27
lines changed

2 files changed

+15
-27
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ Improvements to existing features
7979
allow multiple axes to be used to operate on slabs of a ``Panel``
8080
- The ``ArrayFormatter``s for ``datetime`` and ``timedelta64`` now intelligently
8181
limit precision based on the values in the array (:issue:`3401`)
82+
- Performance improvements to ``Series.str.extract`` (:issue:`5944`)
8283

8384
.. _release.bug_fixes-0.13.1:
8485

pandas/core/strings.py

+14-27
Original file line numberDiff line numberDiff line change
@@ -439,41 +439,28 @@ def str_extract(arr, pat, flags=0):
439439
440440
"""
441441
regex = re.compile(pat, flags=flags)
442-
443442
# just to be safe, check this
444443
if regex.groups == 0:
445444
raise ValueError("This pattern contains no groups to capture.")
446-
elif regex.groups == 1:
447-
def f(x):
448-
if not isinstance(x, compat.string_types):
449-
return None
450-
m = regex.search(x)
451-
if m:
452-
return m.groups()[0] # may be None
453-
else:
454-
return None
445+
empty_row = [np.nan]*regex.groups
446+
def f(x):
447+
if not isinstance(x, compat.string_types):
448+
return empty_row
449+
m = regex.search(x)
450+
if m:
451+
return [np.nan if item is None else item for item in m.groups()]
452+
else:
453+
return empty_row
454+
if regex.groups == 1:
455+
result = Series([f(val)[0] for val in arr], name=regex.groupindex.get(1))
455456
else:
456-
empty_row = Series(regex.groups * [None])
457-
458-
def f(x):
459-
if not isinstance(x, compat.string_types):
460-
return empty_row
461-
m = regex.search(x)
462-
if m:
463-
return Series(list(m.groups())) # may contain None
464-
else:
465-
return empty_row
466-
result = arr.apply(f)
467-
result.replace({None: np.nan}, inplace=True)
468-
if regex.groups > 1:
469-
result = DataFrame(result) # Don't rely on the wrapper; name columns.
470457
names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
471-
result.columns = [names.get(1 + i, i) for i in range(regex.groups)]
472-
else:
473-
result.name = regex.groupindex.get(0)
458+
columns = [names.get(1 + i, i) for i in range(regex.groups)]
459+
result = DataFrame([f(val) for val in arr], columns=columns)
474460
return result
475461

476462

463+
477464
def str_join(arr, sep):
478465
"""
479466
Join lists contained as elements in array, a la str.join

0 commit comments

Comments
 (0)