diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 0f68d1043b49d..d7fb36b062388 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -11,6 +11,19 @@ from .pandas_vb_common import tm +class Dtypes: + params = ["str", "string", "arrow_string"] + param_names = ["dtype"] + + def setup(self, dtype): + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + try: + self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) + except ImportError: + raise NotImplementedError + + class Construction: params = ["str", "string"] @@ -49,18 +62,7 @@ def peakmem_cat_frame_construction(self, dtype): DataFrame(self.frame_cat_arr, dtype=dtype) -class Methods: - params = ["str", "string", "arrow_string"] - param_names = ["dtype"] - - def setup(self, dtype): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - try: - self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) - except ImportError: - raise NotImplementedError - +class Methods(Dtypes): def time_center(self, dtype): self.s.str.center(100) @@ -211,35 +213,26 @@ def time_cat(self, other_cols, sep, na_rep, na_frac): self.s.str.cat(others=self.others, sep=sep, na_rep=na_rep) -class Contains: +class Contains(Dtypes): - params = (["str", "string", "arrow_string"], [True, False]) + params = (Dtypes.params, [True, False]) param_names = ["dtype", "regex"] def setup(self, dtype, regex): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - try: - self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) - except ImportError: - raise NotImplementedError + super().setup(dtype) def time_contains(self, dtype, regex): self.s.str.contains("A", regex=regex) -class Split: +class Split(Dtypes): - params = (["str", "string", "arrow_string"], [True, False]) + params = (Dtypes.params, [True, False]) param_names = ["dtype", "expand"] def setup(self, dtype, expand): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 - - try: - self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("--") - except ImportError: - raise NotImplementedError + super().setup(dtype) + self.s = self.s.str.join("--") def time_split(self, dtype, expand): self.s.str.split("--", expand=expand) @@ -248,17 +241,23 @@ def time_rsplit(self, dtype, expand): self.s.str.rsplit("--", expand=expand) -class Dummies: - params = ["str", "string", "arrow_string"] - param_names = ["dtype"] +class Extract(Dtypes): - def setup(self, dtype): - from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + params = (Dtypes.params, [True, False]) + param_names = ["dtype", "expand"] - try: - self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("|") - except ImportError: - raise NotImplementedError + def setup(self, dtype, expand): + super().setup(dtype) + + def time_extract_single_group(self, dtype, expand): + with warnings.catch_warnings(record=True): + self.s.str.extract("(\\w*)A", expand=expand) + + +class Dummies(Dtypes): + def setup(self, dtype): + super().setup(dtype) + self.s = self.s.str.join("|") def time_get_dummies(self, dtype): self.s.str.get_dummies("|") @@ -279,3 +278,9 @@ def setup(self): def time_vector_slice(self): # GH 2602 self.s.str[:5] + + +class Iter(Dtypes): + def time_iter(self, dtype): + for i in self.s: + pass diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 5606380908f38..25d36c617a894 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3101,7 +3101,7 @@ def _str_extract_noexpand(arr, pat, flags=0): groups_or_na = _groups_or_na_fun(regex) result_dtype = _result_dtype(arr) - result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) + result = np.array([groups_or_na(val)[0] for val in np.asarray(arr)], dtype=object) # not dispatching, so we have to reconstruct here. result = pd_array(result, dtype=result_dtype) return result @@ -3136,7 +3136,7 @@ def _str_extract_frame(arr, pat, flags=0): else: result_index = None return DataFrame( - [groups_or_na(val) for val in arr], + [groups_or_na(val) for val in np.asarray(arr)], columns=columns, index=result_index, dtype=result_dtype, diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py index 2be00a689a206..16ec4a8c6831c 100644 --- a/pandas/tests/strings/test_extract.py +++ b/pandas/tests/strings/test_extract.py @@ -38,25 +38,23 @@ def test_extract_expand_kwarg(any_string_dtype): def test_extract_expand_False_mixed_object(): - mixed = Series( - [ - "aBAD_BAD", - np.nan, - "BAD_b_BAD", - True, - datetime.today(), - "foo", - None, - 1, - 2.0, - ] + ser = Series( + ["aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0] ) - result = Series(mixed).str.extract(".*(BAD[_]+).*(BAD)", expand=False) + # two groups + result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=False) er = [np.nan, np.nan] # empty row expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er]) tm.assert_frame_equal(result, expected) + # single group + result = ser.str.extract(".*(BAD[_]+).*BAD", expand=False) + expected = Series( + ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan] + ) + tm.assert_series_equal(result, expected) + def test_extract_expand_index_raises(): # GH9980