From 2e2650332c6215a93abc9fdb37e2394c697e7d53 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 19 Apr 2021 14:21:38 +0100 Subject: [PATCH 1/4] [ArrowStringArray] Use `utf8_is_*` functions from Apache Arrow if available --- pandas/core/arrays/string_arrow.py | 64 ++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 52bdcd03d3b49..864dae999d54d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -39,6 +39,7 @@ from pandas.core import missing from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.boolean import BooleanDtype from pandas.core.indexers import ( check_array_indexer, validate_indices, @@ -752,3 +753,66 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None): # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) + + def _str_isalnum(self): + if hasattr(pc, "utf8_is_alnum"): + result = pc.utf8_is_alnum(self._data) + return BooleanDtype().__from_arrow__(result) + else: + return super()._str_isalnum() + + def _str_isalpha(self): + if hasattr(pc, "utf8_is_alpha"): + result = pc.utf8_is_alpha(self._data) + return BooleanDtype().__from_arrow__(result) + else: + return super()._str_isalpha() + + def _str_isdecimal(self): + if hasattr(pc, "utf8_is_decimal"): + result = pc.utf8_is_decimal(self._data) + return BooleanDtype().__from_arrow__(result) + else: + return super()._str_isdecimal() + + def _str_isdigit(self): + if hasattr(pc, "utf8_is_digit"): + result = pc.utf8_is_digit(self._data) + return BooleanDtype().__from_arrow__(result) + else: + return super()._str_isdigit() + + def _str_islower(self): + if hasattr(pc, "utf8_is_lower"): + result = pc.utf8_is_lower(self._data) + return BooleanDtype().__from_arrow__(result) + else: + return super()._str_islower() + + def _str_isnumeric(self): + if hasattr(pc, "utf8_is_numeric"): + result = pc.utf8_is_numeric(self._data) + return BooleanDtype().__from_arrow__(result) + else: + return super()._str_isnumeric() + + def _str_isspace(self): + if hasattr(pc, "utf8_is_space"): + result = pc.utf8_is_space(self._data) + return BooleanDtype().__from_arrow__(result) + else: + return super()._str_isspace() + + def _str_istitle(self): + if hasattr(pc, "utf8_is_title"): + result = pc.utf8_is_title(self._data) + return BooleanDtype().__from_arrow__(result) + else: + return super()._str_istitle() + + def _str_isupper(self): + if hasattr(pc, "utf8_is_upper"): + result = pc.utf8_is_upper(self._data) + return BooleanDtype().__from_arrow__(result) + else: + return super()._str_isupper() From 0987b0e73b69a21bf80a774b9e455693ceedad45 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 20 Apr 2021 09:11:00 +0200 Subject: [PATCH 2/4] PERF: optimize conversion from boolean Arrow array to masked BooleanArray --- pandas/core/arrays/boolean.py | 15 +++++++++++++-- pandas/tests/arrays/masked/test_arrow_compat.py | 11 +++++++++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 0a0bfccc0ea15..d59f74ffd256f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -122,8 +122,19 @@ def __from_arrow__( results = [] for arr in chunks: - # TODO should optimize this without going through object array - bool_arr = BooleanArray._from_sequence(np.array(arr)) + buflist = arr.buffers() + data = pyarrow.BooleanArray.from_buffers( + arr.type, len(arr), [None, buflist[1]], offset=arr.offset + ).to_numpy(zero_copy_only=False) + if arr.null_count != 0: + mask = pyarrow.BooleanArray.from_buffers( + arr.type, len(arr), [None, buflist[0]], offset=arr.offset + ).to_numpy(zero_copy_only=False) + mask = ~mask + else: + mask = np.zeros(len(arr), dtype=bool) + + bool_arr = BooleanArray(data, mask) results.append(bool_arr) return BooleanArray._concat_same_type(results) diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 8bb32dec2cc0e..ae5bec078734c 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -55,12 +55,19 @@ def test_arrow_from_arrow_uint(): @td.skip_if_no("pyarrow", min_version="0.16.0") -def test_arrow_sliced(): +def test_arrow_sliced(data): # https://github.com/pandas-dev/pandas/issues/38525 import pyarrow as pa - df = pd.DataFrame({"a": pd.array([0, None, 2, 3, None], dtype="Int64")}) + df = pd.DataFrame({"a": data}) table = pa.table(df) result = table.slice(2, None).to_pandas() expected = df.iloc[2:].reset_index(drop=True) tm.assert_frame_equal(result, expected) + + # no missing values + df2 = df.fillna(data[0]) + table = pa.table(df2) + result = table.slice(2, None).to_pandas() + expected = df2.iloc[2:].reset_index(drop=True) + tm.assert_frame_equal(result, expected) From 18428269616c521772b49c2378fc4627de40a9a4 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 22 Apr 2021 14:56:11 +0100 Subject: [PATCH 3/4] more testing --- pandas/core/strings/accessor.py | 3 +- pandas/tests/strings/test_string_array.py | 17 +---- pandas/tests/strings/test_strings.py | 83 ++++++++++++++++------- 3 files changed, 61 insertions(+), 42 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 0b5613e302175..85a58d3d99795 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3002,8 +3002,9 @@ def _result_dtype(arr): # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails # when the list of values is empty. from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype - if isinstance(arr.dtype, StringDtype): + if isinstance(arr.dtype, (StringDtype, ArrowStringDtype)): return arr.dtype.name else: return object diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 02ccb3a930557..f90d219159c7e 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -13,19 +13,11 @@ ) -def test_string_array(nullable_string_dtype, any_string_method, request): +def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = any_string_method if method_name == "decode": pytest.skip("decode requires bytes.") - if nullable_string_dtype == "arrow_string" and method_name in { - "extract", - "extractall", - }: - reason = "extract/extractall does not yet dispatch to array" - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) - data = ["a", "bb", np.nan, "ccc"] a = Series(data, dtype=object) b = Series(data, dtype=nullable_string_dtype) @@ -93,15 +85,10 @@ def test_string_array_boolean_array(nullable_string_dtype, method, expected): tm.assert_series_equal(result, expected) -def test_string_array_extract(nullable_string_dtype, request): +def test_string_array_extract(nullable_string_dtype): # https://github.com/pandas-dev/pandas/issues/30969 # Only expand=False & multiple groups was failing - if nullable_string_dtype == "arrow_string": - reason = "extract does not yet dispatch to array" - mark = pytest.mark.xfail(reason=reason) - request.node.add_marker(mark) - a = Series(["a1", "b2", "cc"], dtype=nullable_string_dtype) b = Series(["a1", "b2", "cc"], dtype="object") pat = r"(\w)(\d)" diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 06b22f00a38cf..2a52b3ba3f9e1 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -6,6 +6,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, @@ -17,6 +19,27 @@ import pandas._testing as tm +@pytest.fixture( + params=[ + "object", + "string", + pytest.param( + "arrow_string", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ] +) +def any_string_dtype(request): + """ + Parametrized fixture for string dtypes. + * 'object' + * 'string' + * 'arrow_string' + """ + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + return request.param + + def assert_series_or_index_equal(left, right): if isinstance(left, Series): tm.assert_series_equal(left, right) @@ -149,10 +172,15 @@ def test_repeat_with_null(nullable_string_dtype): tm.assert_series_equal(result, expected) -def test_empty_str_methods(): - empty_str = empty = Series(dtype=object) - empty_int = Series(dtype="int64") - empty_bool = Series(dtype=bool) +def test_empty_str_methods(any_string_dtype): + empty_str = empty = Series(dtype=any_string_dtype) + if any_string_dtype == "object": + empty_int = Series(dtype="int64") + empty_bool = Series(dtype=bool) + else: + empty_int = Series(dtype="Int64") + empty_bool = Series(dtype="boolean") + empty_object = Series(dtype=object) empty_bytes = Series(dtype=object) # GH7241 @@ -184,15 +212,15 @@ def test_empty_str_methods(): tm.assert_frame_equal(DataFrame(dtype=str), empty.str.get_dummies()) tm.assert_series_equal(empty_str, empty_str.str.join("")) tm.assert_series_equal(empty_int, empty.str.len()) - tm.assert_series_equal(empty_str, empty_str.str.findall("a")) + tm.assert_series_equal(empty_object, empty_str.str.findall("a")) tm.assert_series_equal(empty_int, empty.str.find("a")) tm.assert_series_equal(empty_int, empty.str.rfind("a")) tm.assert_series_equal(empty_str, empty.str.pad(42)) tm.assert_series_equal(empty_str, empty.str.center(42)) - tm.assert_series_equal(empty_str, empty.str.split("a")) - tm.assert_series_equal(empty_str, empty.str.rsplit("a")) - tm.assert_series_equal(empty_str, empty.str.partition("a", expand=False)) - tm.assert_series_equal(empty_str, empty.str.rpartition("a", expand=False)) + tm.assert_series_equal(empty_object, empty.str.split("a")) + tm.assert_series_equal(empty_object, empty.str.rsplit("a")) + tm.assert_series_equal(empty_object, empty.str.partition("a", expand=False)) + tm.assert_series_equal(empty_object, empty.str.rpartition("a", expand=False)) tm.assert_series_equal(empty_str, empty.str.slice(stop=1)) tm.assert_series_equal(empty_str, empty.str.slice(step=1)) tm.assert_series_equal(empty_str, empty.str.strip()) @@ -200,7 +228,7 @@ def test_empty_str_methods(): tm.assert_series_equal(empty_str, empty.str.rstrip()) tm.assert_series_equal(empty_str, empty.str.wrap(42)) tm.assert_series_equal(empty_str, empty.str.get(0)) - tm.assert_series_equal(empty_str, empty_bytes.str.decode("ascii")) + tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii")) tm.assert_series_equal(empty_bytes, empty.str.encode("ascii")) # ismethods should always return boolean (GH 29624) tm.assert_series_equal(empty_bool, empty.str.isalnum()) @@ -227,9 +255,9 @@ def test_empty_str_methods_to_frame(): tm.assert_frame_equal(empty_df, empty.str.rpartition("a")) -def test_ismethods(): +def test_ismethods(any_string_dtype): values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "] - str_s = Series(values) + str_s = Series(values, dtype=any_string_dtype) alnum_e = [True, True, True, True, True, False, True, True, False, False] alpha_e = [True, True, True, False, False, False, True, False, False, False] digit_e = [False, False, False, True, False, False, False, True, False, False] @@ -253,13 +281,14 @@ def test_ismethods(): upper_e = [True, False, False, False, True, False, True, False, False, False] title_e = [True, False, True, False, True, False, False, False, False, False] - tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e)) - tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e)) - tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e)) - tm.assert_series_equal(str_s.str.isspace(), Series(space_e)) - tm.assert_series_equal(str_s.str.islower(), Series(lower_e)) - tm.assert_series_equal(str_s.str.isupper(), Series(upper_e)) - tm.assert_series_equal(str_s.str.istitle(), Series(title_e)) + dtype = "bool" if any_string_dtype == "object" else "boolean" + tm.assert_series_equal(str_s.str.isalnum(), Series(alnum_e, dtype=dtype)) + tm.assert_series_equal(str_s.str.isalpha(), Series(alpha_e, dtype=dtype)) + tm.assert_series_equal(str_s.str.isdigit(), Series(digit_e, dtype=dtype)) + tm.assert_series_equal(str_s.str.isspace(), Series(space_e, dtype=dtype)) + tm.assert_series_equal(str_s.str.islower(), Series(lower_e, dtype=dtype)) + tm.assert_series_equal(str_s.str.isupper(), Series(upper_e, dtype=dtype)) + tm.assert_series_equal(str_s.str.istitle(), Series(title_e, dtype=dtype)) assert str_s.str.isalnum().tolist() == [v.isalnum() for v in values] assert str_s.str.isalpha().tolist() == [v.isalpha() for v in values] @@ -270,28 +299,30 @@ def test_ismethods(): assert str_s.str.istitle().tolist() == [v.istitle() for v in values] -def test_isnumeric(): +def test_isnumeric(any_string_dtype): # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER # 0x2605: ★ not number # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: 3 Em 3 values = ["A", "3", "¼", "★", "፸", "3", "four"] - s = Series(values) + s = Series(values, dtype=any_string_dtype) numeric_e = [False, True, True, False, True, True, False] decimal_e = [False, True, False, False, False, True, False] - tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) - tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) + dtype = "bool" if any_string_dtype == "object" else "boolean" + tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e, dtype=dtype)) + tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e, dtype=dtype)) unicodes = ["A", "3", "¼", "★", "፸", "3", "four"] assert s.str.isnumeric().tolist() == [v.isnumeric() for v in unicodes] assert s.str.isdecimal().tolist() == [v.isdecimal() for v in unicodes] values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] - s = Series(values) + s = Series(values, dtype=any_string_dtype) numeric_e = [False, np.nan, True, False, np.nan, True, False] decimal_e = [False, np.nan, False, False, np.nan, True, False] - tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e)) - tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e)) + dtype = "object" if any_string_dtype == "object" else "boolean" + tm.assert_series_equal(s.str.isnumeric(), Series(numeric_e, dtype=dtype)) + tm.assert_series_equal(s.str.isdecimal(), Series(decimal_e, dtype=dtype)) def test_get_dummies(): From 9e2c11bea3ec05604012f1b22668b6782437a747 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 22 Apr 2021 22:08:01 +0100 Subject: [PATCH 4/4] add benchmarks --- asv_bench/benchmarks/strings.py | 93 +++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 76257e1b40f1a..5d9b1c135d7ae 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -50,91 +50,126 @@ def peakmem_cat_frame_construction(self, dtype): class Methods: - def setup(self): - self.s = Series(tm.makeStringIndex(10 ** 5)) + params = ["str", "string", "arrow_string"] + param_names = ["dtype"] + + def setup(self, dtype): + from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401 + + try: + self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype) + except ImportError: + raise NotImplementedError - def time_center(self): + def time_center(self, dtype): self.s.str.center(100) - def time_count(self): + def time_count(self, dtype): self.s.str.count("A") - def time_endswith(self): + def time_endswith(self, dtype): self.s.str.endswith("A") - def time_extract(self): + def time_extract(self, dtype): with warnings.catch_warnings(record=True): self.s.str.extract("(\\w*)A(\\w*)") - def time_findall(self): + def time_findall(self, dtype): self.s.str.findall("[A-Z]+") - def time_find(self): + def time_find(self, dtype): self.s.str.find("[A-Z]+") - def time_rfind(self): + def time_rfind(self, dtype): self.s.str.rfind("[A-Z]+") - def time_get(self): + def time_get(self, dtype): self.s.str.get(0) - def time_len(self): + def time_len(self, dtype): self.s.str.len() - def time_join(self): + def time_join(self, dtype): self.s.str.join(" ") - def time_match(self): + def time_match(self, dtype): self.s.str.match("A") - def time_normalize(self): + def time_normalize(self, dtype): self.s.str.normalize("NFC") - def time_pad(self): + def time_pad(self, dtype): self.s.str.pad(100, side="both") - def time_partition(self): + def time_partition(self, dtype): self.s.str.partition("A") - def time_rpartition(self): + def time_rpartition(self, dtype): self.s.str.rpartition("A") - def time_replace(self): + def time_replace(self, dtype): self.s.str.replace("A", "\x01\x01") - def time_translate(self): + def time_translate(self, dtype): self.s.str.translate({"A": "\x01\x01"}) - def time_slice(self): + def time_slice(self, dtype): self.s.str.slice(5, 15, 2) - def time_startswith(self): + def time_startswith(self, dtype): self.s.str.startswith("A") - def time_strip(self): + def time_strip(self, dtype): self.s.str.strip("A") - def time_rstrip(self): + def time_rstrip(self, dtype): self.s.str.rstrip("A") - def time_lstrip(self): + def time_lstrip(self, dtype): self.s.str.lstrip("A") - def time_title(self): + def time_title(self, dtype): self.s.str.title() - def time_upper(self): + def time_upper(self, dtype): self.s.str.upper() - def time_lower(self): + def time_lower(self, dtype): self.s.str.lower() - def time_wrap(self): + def time_wrap(self, dtype): self.s.str.wrap(10) - def time_zfill(self): + def time_zfill(self, dtype): self.s.str.zfill(10) + def time_isalnum(self, dtype): + self.s.str.isalnum() + + def time_isalpha(self, dtype): + self.s.str.isalpha() + + def time_isdecimal(self, dtype): + self.s.str.isdecimal() + + def time_isdigit(self, dtype): + self.s.str.isdigit() + + def time_islower(self, dtype): + self.s.str.islower() + + def time_isnumeric(self, dtype): + self.s.str.isnumeric() + + def time_isspace(self, dtype): + self.s.str.isspace() + + def time_istitle(self, dtype): + self.s.str.istitle() + + def time_isupper(self, dtype): + self.s.str.isupper() + class Repeat: