From 86cccb038359ff06c4f5d9cf401e2093736a6df2 Mon Sep 17 00:00:00 2001 From: Constantine Glen Evans Date: Fri, 8 May 2015 22:11:58 -0700 Subject: [PATCH] PERF: increase performance of string split when expand=True --- doc/source/whatsnew/v0.16.2.txt | 1 + pandas/core/strings.py | 8 ++++++-- vb_suite/strings.py | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index 86332a26fd14c..49f143e158abf 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -47,6 +47,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved ``Series.resample`` performance with dtype=datetime64[ns] (:issue:`7754`) +- Increased performance of ``str.split`` when ``expand=True`` (:issue:`10081`) .. _whatsnew_0162.bug_fixes: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index f4ac0166cf44b..78ae4fba02033 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,7 +1,7 @@ import numpy as np from pandas.compat import zip -from pandas.core.common import isnull, _values_from_object, is_bool_dtype +from pandas.core.common import isnull, _values_from_object, is_bool_dtype, is_list_like import pandas.compat as compat from pandas.util.decorators import Appender, deprecate_kwarg import re @@ -1090,7 +1090,11 @@ def _wrap_result_expand(self, result, expand=False): else: index = self.series.index if expand: - cons_row = self.series._constructor + def cons_row(x): + if is_list_like(x): + return x + else: + return [ x ] cons = self.series._constructor_expanddim data = [cons_row(x) for x in result] return cons(data, index=index) diff --git a/vb_suite/strings.py b/vb_suite/strings.py index 96791cd52f1cf..f229e0ddedbae 100644 --- a/vb_suite/strings.py +++ b/vb_suite/strings.py @@ -35,6 +35,7 @@ def make_series(letters, strlen, size): strings_match = Benchmark("many.str.match(r'mat..this')", setup) strings_extract = 
Benchmark("many.str.extract(r'(\w*)matchthis(\w*)')", setup) strings_join_split = Benchmark("many.str.join(r'--').str.split('--')", setup) +strings_join_split_expand = Benchmark("many.str.join(r'--').str.split('--',expand=True)", setup) strings_len = Benchmark("many.str.len()", setup) strings_findall = Benchmark("many.str.findall(r'[A-Z]+')", setup) strings_pad = Benchmark("many.str.pad(100, side='both')", setup)