From 1788ffe696ce3b68a3f1c9438b7996eae71259ff Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 15 May 2021 18:06:24 +0100 Subject: [PATCH 1/3] [ArrowStringArray] PERF: single regex group with expand=False --- asv_bench/benchmarks/strings.py | 18 +++++++++++ pandas/core/strings/accessor.py | 48 ++++++++++++++++++++-------- pandas/tests/strings/test_extract.py | 24 +++++++------- 3 files changed, 63 insertions(+), 27 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 0f68d1043b49d..c7c52dc4de17f 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -248,6 +248,24 @@ def time_rsplit(self, dtype, expand): self.s.str.rsplit("--", expand=expand) +class Extract: + + params = (["str", "string", "arrow_string"], [True, False]) + param_names = ["dtype", "expand"] + + def setup(self, dtype, expand): + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + try: + self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) + except ImportError: + raise NotImplementedError + + def time_extract_single_group(self, dtype, expand): + with warnings.catch_warnings(record=True): + self.s.str.extract("(\\w*)A", expand=expand) + + class Dummies: params = ["str", "string", "arrow_string"] param_names = ["dtype"] diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 15fc2d9e6d3c5..75a80496a2e2d 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3033,16 +3033,26 @@ def cat_core(list_of_columns: List, sep: str): def _groups_or_na_fun(regex): """Used in both extract_noexpand and extract_frame""" - empty_row = [np.nan] * regex.groups - - def f(x): - if not isinstance(x, str): - return empty_row - m = regex.search(x) - if m: - return [np.nan if item is None else item for item in m.groups()] - else: - return empty_row + + if regex.groups == 1: + + def f(x): + if not isinstance(x, str): + return np.nan + m = regex.search(x) + return m.groups()[0] if m else np.nan + + else: + empty_row = [np.nan] * regex.groups + + def f(x): + if not isinstance(x, str): + return empty_row + m = regex.search(x) + if m: + return [np.nan if item is None else item for item in m.groups()] + else: + return empty_row return f @@ -3099,14 +3109,23 @@ def _str_extract_noexpand(arr, pat, flags=0): regex = re.compile(pat, flags=flags) groups_or_na = _groups_or_na_fun(regex) - result_dtype = _result_dtype(arr) + result_dtype = _result_dtype(arr._orig) if regex.groups == 1: - result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) + arr = arr._data.array + mask = isna(arr) + result = lib.map_infer_mask( + np.asarray(arr), groups_or_na, mask.view(np.uint8), convert=False + ) + # special case since extract on an object array converts None to np.nan + # will be handled by _str_map array method + if is_object_dtype(result_dtype): + result[mask] = np.nan name = _get_single_group_name(regex) # not dispatching, so we have to reconstruct here. result = pd_array(result, dtype=result_dtype) else: + arr = arr._orig name = None columns = _get_group_names(regex) if arr.size == 0: @@ -3136,6 +3155,7 @@ def _str_extract_frame(arr, pat, flags=0): """ from pandas import DataFrame + arr = arr._orig regex = re.compile(pat, flags=flags) groups_or_na = _groups_or_na_fun(regex) columns = _get_group_names(regex) @@ -3157,10 +3177,10 @@ def _str_extract_frame(arr, pat, flags=0): def str_extract(arr, pat, flags=0, expand=True): if expand: - result = _str_extract_frame(arr._orig, pat, flags=flags) + result = _str_extract_frame(arr, pat, flags=flags) return result.__finalize__(arr._orig, method="str_extract") else: - result, name = _str_extract_noexpand(arr._orig, pat, flags=flags) + result, name = _str_extract_noexpand(arr, pat, flags=flags) return arr._wrap_result(result, name=name, expand=expand) diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 2be00a689a206..16ec4a8c6831c 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -38,25 +38,23 @@ def test_extract_expand_kwarg(any_string_dtype): def test_extract_expand_False_mixed_object(): - mixed = Series( - [ - "aBAD_BAD", - np.nan, - "BAD_b_BAD", - True, - datetime.today(), - "foo", - None, - 1, - 2.0, - ] + ser = Series( + ["aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0] ) - result = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=False) + # two groups + result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=False) er = [np.nan, np.nan] # empty row expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) tm.assert_frame_equal(result, expected) + # single group + result = ser.str.extract(".*(BAD[_]+).*BAD", expand=False) + expected = Series( + ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan] + ) + tm.assert_series_equal(result, expected) + def test_extract_expand_index_raises(): # GH9980 From 79ff0d3e17f9fd7ac14a5c5628de5c59d736d010 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 16 May 2021 09:05:50 +0100 Subject: [PATCH 2/3] only the iter changes --- asv_bench/benchmarks/strings.py | 17 +++++++++++ pandas/core/strings/accessor.py | 52 +++++++++++---------------------- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index c7c52dc4de17f..969b7d08735a6 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -297,3 +297,20 @@ def setup(self): def time_vector_slice(self): # GH 2602 self.s.str[:5] + + +class Iter: + params = ["str", "string", "arrow_string"] + param_names = ["dtype"] + + def setup(self, dtype): + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + try: + self.s = Series(["abcdefg", np.nan] * 500000, dtype=dtype) + except ImportError: + raise NotImplementedError + + def time_iter(self, dtype): + for i in self.s: + pass diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 75a80496a2e2d..e058d1172aa8c 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3033,26 +3033,16 @@ def cat_core(list_of_columns: List, sep: str): def _groups_or_na_fun(regex): """Used in both extract_noexpand and extract_frame""" - - if regex.groups == 1: - - def f(x): - if not isinstance(x, str): - return np.nan - m = regex.search(x) - return m.groups()[0] if m else np.nan - - else: - empty_row = [np.nan] * regex.groups - - def f(x): - if not isinstance(x, str): - return empty_row - m = regex.search(x) - if m: - return [np.nan if item is None else item for item in m.groups()] - else: - return empty_row + empty_row = [np.nan] * regex.groups + + def f(x): + if not isinstance(x, str): + return empty_row + m = regex.search(x) + if m: + return [np.nan if item is None else item for item in m.groups()] + else: + return empty_row return f @@ -3109,23 +3099,16 @@ def _str_extract_noexpand(arr, pat, flags=0): regex = re.compile(pat, flags=flags) groups_or_na = _groups_or_na_fun(regex) - result_dtype = _result_dtype(arr._orig) + result_dtype = _result_dtype(arr) if regex.groups == 1: - arr = arr._data.array - mask = isna(arr) - result = lib.map_infer_mask( - np.asarray(arr), groups_or_na, mask.view(np.uint8), convert=False + result = np.array( + [groups_or_na(val)[0] for val in np.asarray(arr)], dtype=object ) - # special case since extract on an object array converts None to np.nan - # will be handled by _str_map array method - if is_object_dtype(result_dtype): - result[mask] = np.nan name = _get_single_group_name(regex) # not dispatching, so we have to reconstruct here. result = pd_array(result, dtype=result_dtype) else: - arr = arr._orig name = None columns = _get_group_names(regex) if arr.size == 0: @@ -3138,7 +3121,7 @@ def _str_extract_noexpand(arr, pat, flags=0): # error: Incompatible types in assignment (expression has type # "DataFrame", variable has type "ndarray") result = DataFrame( # type:ignore[assignment] - [groups_or_na(val) for val in arr], + [groups_or_na(val) for val in np.asarray(arr)], columns=columns, index=arr.index, dtype=result_dtype, @@ -3155,7 +3138,6 @@ def _str_extract_frame(arr, pat, flags=0): """ from pandas import DataFrame - arr = arr._orig regex = re.compile(pat, flags=flags) groups_or_na = _groups_or_na_fun(regex) columns = _get_group_names(regex) @@ -3168,7 +3150,7 @@ def _str_extract_frame(arr, pat, flags=0): except AttributeError: result_index = None return DataFrame( - [groups_or_na(val) for val in arr], + [groups_or_na(val) for val in np.asarray(arr)], columns=columns, index=result_index, dtype=result_dtype, @@ -3177,10 +3159,10 @@ def _str_extract_frame(arr, pat, flags=0): def str_extract(arr, pat, flags=0, expand=True): if expand: - result = _str_extract_frame(arr, pat, flags=flags) + result = _str_extract_frame(arr._orig, pat, flags=flags) return result.__finalize__(arr._orig, method="str_extract") else: - result, name = _str_extract_noexpand(arr, pat, flags=flags) + result, name = _str_extract_noexpand(arr._orig, pat, flags=flags) return arr._wrap_result(result, name=name, expand=expand) From a4e66a186306542ad28526bf5415f63749319dd7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 17 May 2021 18:55:05 +0100 Subject: [PATCH 3/3] add base class to benchmarks for setup --- asv_bench/benchmarks/strings.py | 86 +++++++++++---------------------- 1 file changed, 28 insertions(+), 58 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 969b7d08735a6..d7fb36b062388 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -11,6 +11,19 @@ from .pandas_vb_common import tm +class Dtypes: + params = ["str", "string", "arrow_string"] + param_names = ["dtype"] + + def setup(self, dtype): + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + try: + self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) + except ImportError: + raise NotImplementedError + + class Construction: params = ["str", "string"] @@ -49,18 +62,7 @@ def peakmem_cat_frame_construction(self, dtype): DataFrame(self.frame_cat_arr, dtype=dtype) -class Methods: - params = ["str", "string", "arrow_string"] - param_names = ["dtype"] - - def setup(self, dtype): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - try: - self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) - except ImportError: - raise NotImplementedError - +class Methods(Dtypes): def time_center(self, dtype): self.s.str.center(100) @@ -211,35 +213,26 @@ def time_cat(self, other_cols, sep, na_rep, na_frac): self.s.str.cat(others=self.others, sep=sep, na_rep=na_rep) -class Contains: +class Contains(Dtypes): - params = (["str", "string", "arrow_string"], [True, False]) + params = (Dtypes.params, [True, False]) param_names = ["dtype", "regex"] def setup(self, dtype, regex): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - try: - self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) - except ImportError: - raise NotImplementedError + super().setup(dtype) def time_contains(self, dtype, regex): self.s.str.contains("A", regex=regex) -class Split: +class Split(Dtypes): - params = (["str", "string", "arrow_string"], [True, False]) + params = (Dtypes.params, [True, False]) param_names = ["dtype", "expand"] def setup(self, dtype, expand): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - try: - self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("--") - except ImportError: - raise NotImplementedError + super().setup(dtype) + self.s = self.s.str.join("--") def time_split(self, dtype, expand): self.s.str.split("--", expand=expand) @@ -248,35 +241,23 @@ def time_rsplit(self, dtype, expand): self.s.str.rsplit("--", expand=expand) -class Extract: +class Extract(Dtypes): - params = (["str", "string", "arrow_string"], [True, False]) + params = (Dtypes.params, [True, False]) param_names = ["dtype", "expand"] def setup(self, dtype, expand): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - try: - self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) - except ImportError: - raise NotImplementedError + super().setup(dtype) def time_extract_single_group(self, dtype, expand): with warnings.catch_warnings(record=True): self.s.str.extract("(\\w*)A", expand=expand) -class Dummies: - params = ["str", "string", "arrow_string"] - param_names = ["dtype"] - +class Dummies(Dtypes): def setup(self, dtype): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - try: - self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("|") - except ImportError: - raise NotImplementedError + super().setup(dtype) + self.s = self.s.str.join("|") def time_get_dummies(self, dtype): self.s.str.get_dummies("|") @@ -299,18 +280,7 @@ def time_vector_slice(self): self.s.str[:5] -class Iter: - params = ["str", "string", "arrow_string"] - param_names = ["dtype"] - - def setup(self, dtype): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - try: - self.s = Series(["abcdefg", np.nan] * 500000, dtype=dtype) - except ImportError: - raise NotImplementedError - +class Iter(Dtypes): def time_iter(self, dtype): for i in self.s: pass