Merge pull request #5944 from unutbu/str-extract

jreback · jreback · commit 99910a39fba7 · 2014-01-15T14:17:33.000-08:00
ENH: Improve perf of str_extract
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -79,6 +79,7 @@ Improvements to existing features
     allow multiple axes to be used to operate on slabs of a ``Panel``
   - The ``ArrayFormatter``s for ``datetime`` and ``timedelta64`` now intelligently
     limit precision based on the values in the array (:issue:`3401`)
+  - Performance improvements to ``Series.str.extract`` (:issue:`5944`)
 
 .. _release.bug_fixes-0.13.1:
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -439,41 +439,28 @@ def str_extract(arr, pat, flags=0):
 
     """
     regex = re.compile(pat, flags=flags)
-
     # just to be safe, check this
     if regex.groups == 0:
         raise ValueError("This pattern contains no groups to capture.")
-    elif regex.groups == 1:
-        def f(x):
-            if not isinstance(x, compat.string_types):
-                return None
-            m = regex.search(x)
-            if m:
-                return m.groups()[0]  # may be None
-            else:
-                return None
+    empty_row = [np.nan]*regex.groups
+    def f(x):
+        if not isinstance(x, compat.string_types):
+            return empty_row
+        m = regex.search(x)
+        if m:
+            return [np.nan if item is None else item for item in m.groups()]
+        else:
+            return empty_row
+    if regex.groups == 1:
+        result = Series([f(val)[0] for val in arr], name=regex.groupindex.get(1))
     else:
-        empty_row = Series(regex.groups * [None])
-
-        def f(x):
-            if not isinstance(x, compat.string_types):
-                return empty_row
-            m = regex.search(x)
-            if m:
-                return Series(list(m.groups()))  # may contain None
-            else:
-                return empty_row
-    result = arr.apply(f)
-    result.replace({None: np.nan}, inplace=True)
-    if regex.groups > 1:
-        result = DataFrame(result)  # Don't rely on the wrapper; name columns.
         names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
-        result.columns = [names.get(1 + i, i) for i in range(regex.groups)]
-    else:
-        result.name = regex.groupindex.get(0)
+        columns = [names.get(1 + i, i) for i in range(regex.groups)]
+        result = DataFrame([f(val) for val in arr], columns=columns)
     return result
 
 
+
 def str_join(arr, sep):
     """
     Join lists contained as elements in array, a la str.join