From b24afc9ba03939403e48f9d97e0b6d2a04662799 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 2023 14:25:27 +0200 Subject: [PATCH 01/20] Start new string array --- pandas/core/arrays/string_arrow.py | 95 ++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4a70fcf6b5a93..474351fc6b1d7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -113,6 +113,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # error: Incompatible types in assignment (expression has type "StringDtype", # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] + _result_converter = lambda result: BooleanDtype().__from_arrow__(result) def __init__(self, values) -> None: super().__init__(values) @@ -313,7 +314,7 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -322,7 +323,7 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter_(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -331,7 +332,7 @@ def _str_endswith(self, pat: str, na=None): result = pc.ends_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -369,39 +370,39 @@ def _str_fullmatch( def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdigit(self): result = pc.utf8_is_digit(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_islower(self): result = pc.utf8_is_lower(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isnumeric(self): result = pc.utf8_is_numeric(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isspace(self): result = pc.utf8_is_space(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_istitle(self): result = pc.utf8_is_title(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_len(self): result = pc.utf8_length(self._pa_array) @@ -433,3 +434,73 @@ def _str_rstrip(self, to_strip=None): else: result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + + +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _result_converter = lambda result: result.to_numpy() + + def _str_len(self): + result = pc.utf8_length(self._pa_array) + return result.to_numpy() + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + """ + Map a callable over valid elements of the array. + + Parameters + ---------- + f : Callable + A function to call on each non-NA element. + na_value : Scalar, optional + The value to set for NA values. Might also be used for the + fill value if the callable `f` raises an exception. + This defaults to ``self._str_na_value`` which is ``np.nan`` + for object-dtype and Categorical and ``pd.NA`` for StringArray. + dtype : Dtype, optional + The dtype of the result array. + convert : bool, default True + Whether to call `maybe_convert_objects` on the resulting ndarray + """ + if dtype is None: + dtype = np.dtype("object") + if na_value is None: + na_value = self._str_na_value + + if not len(self): + return np.array([], dtype=dtype) + + arr = np.asarray(self, dtype=object) + mask = isna(arr) + map_convert = convert and not np.all(mask) + try: + result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) + except (TypeError, AttributeError) as err: + # Reraise the exception if callable `f` got wrong number of args. + # The user may want to be warned by this, instead of getting NaN + p_err = ( + r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " + r"(?(3)required )positional arguments?" + ) + + if len(err.args) >= 1 and re.search(p_err, err.args[0]): + # FIXME: this should be totally avoidable + raise err + + def g(x): + # This type of fallback behavior can be removed once + # we remove object-dtype .str accessor. + try: + return f(x) + except (TypeError, AttributeError): + return na_value + + return self._str_map(g, na_value=na_value, dtype=dtype) + if not isinstance(result, np.ndarray): + return result + if na_value is not np.nan: + np.putmask(result, mask, na_value) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result From b306c6f1550279687a9d1aaa2364e6546e5d905b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 2023 18:30:11 +0200 Subject: [PATCH 02/20] Add missing methods --- pandas/core/arrays/_arrow_string_mixins.py | 94 +++++++++++++++++++++ pandas/core/arrays/arrow/array.py | 95 +++------------------- pandas/core/arrays/string_.py | 21 +++-- pandas/core/arrays/string_arrow.py | 57 ++++++++++++- 4 files changed, 176 insertions(+), 91 deletions(-) create mode 100644 pandas/core/arrays/_arrow_string_mixins.py diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py new file mode 100644 index 0000000000000..62857285cadef --- /dev/null +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from typing import Literal + +from pandas.compat import pa_version_under7p0 + +if not pa_version_under7p0: + import pyarrow as pa + import pyarrow.compute as pc + + +class ArrowStringArrayMixin: + def _str_pad( + self, + width: int, + side: Literal["left", "right", "both"] = "left", + fillchar: str = " ", + ): + if side == "left": + pa_pad = pc.utf8_lpad + elif side == "right": + pa_pad = pc.utf8_rpad + elif side == "both": + pa_pad = pc.utf8_center + else: + raise ValueError( + f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" + ) + return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) + + def _str_get(self, i: int): + lengths = pc.utf8_length(self._pa_array) + if i >= 0: + out_of_bounds = pc.greater_equal(i, lengths) + start = i + stop = i + 1 + step = 1 + else: + out_of_bounds = pc.greater(-i, lengths) + start = i + stop = i - 1 + step = -1 + not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) + selected = pc.utf8_slice_codeunits( + self._pa_array, start=start, stop=stop, step=step + ) + null_value = pa.scalar(None, type=self._pa_array.type) + result = pc.if_else(not_out_of_bounds, selected, null_value) + return type(self)(result) + + def _str_partition(self, sep: str, expand: bool): + predicate = lambda val: val.partition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_rpartition(self, sep: str, expand: bool): + predicate = lambda val: val.rpartition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + + def _str_slice_replace( + self, start: int | None = None, stop: int | None = None, repl: str | None = None + ): + if repl is None: + repl = "" + if start is None: + start = 0 + return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) + + def _str_capitalize(self): + return type(self)(pc.utf8_capitalize(self._pa_array)) + + def _str_title(self): + return type(self)(pc.utf8_title(self._pa_array)) + + def _str_swapcase(self): + return type(self)(pc.utf8_swapcase(self._pa_array)) + + def _str_removesuffix(self, suffix: str): + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return type(self)(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 88695f11fba59..e83ce450218a5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -42,6 +42,7 @@ from pandas.core import roperator from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.base import ( ExtensionArray, ExtensionArraySupportsAnyAll, @@ -184,7 +185,10 @@ def to_pyarrow_type( class ArrowExtensionArray( - OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods + OpsMixin, + ExtensionArraySupportsAnyAll, + ArrowStringArrayMixin, + BaseStringArrayMethods, ): """ Pandas ExtensionArray backed by a PyArrow ChunkedArray. @@ -246,6 +250,12 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: ) self._dtype = ArrowDtype(self._pa_array.type) + def __dir__(self): + o = set(dir(type(self))) + o.update(self.__dict__) + o.update(set(dir(ArrowStringArrayMixin))) + return list(o) + @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): """ @@ -1987,24 +1997,6 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_pad( - self, - width: int, - side: Literal["left", "right", "both"] = "left", - fillchar: str = " ", - ): - if side == "left": - pa_pad = pc.utf8_lpad - elif side == "right": - pa_pad = pc.utf8_rpad - elif side == "both": - pa_pad = pc.utf8_center - else: - raise ValueError( - f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" - ) - return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True ): @@ -2089,26 +2081,6 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): ) return type(self)(result) - def _str_get(self, i: int): - lengths = pc.utf8_length(self._pa_array) - if i >= 0: - out_of_bounds = pc.greater_equal(i, lengths) - start = i - stop = i + 1 - step = 1 - else: - out_of_bounds = pc.greater(-i, lengths) - start = i - stop = i - 1 - step = -1 - not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) - selected = pc.utf8_slice_codeunits( - self._pa_array, start=start, stop=stop, step=step - ) - null_value = pa.scalar(None, type=self._pa_array.type) - result = pc.if_else(not_out_of_bounds, selected, null_value) - return type(self)(result) - def _str_join(self, sep: str): if pa.types.is_string(self._pa_array.type): result = self._apply_elementwise(list) @@ -2117,36 +2089,6 @@ def _str_join(self, sep: str): result = self._pa_array return type(self)(pc.binary_join(result, sep)) - def _str_partition(self, sep: str, expand: bool): - predicate = lambda val: val.partition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - - def _str_rpartition(self, sep: str, expand: bool): - predicate = lambda val: val.rpartition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - - def _str_slice_replace( - self, start: int | None = None, stop: int | None = None, repl: str | None = None - ): - if repl is None: - repl = "" - if start is None: - start = 0 - return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) - def _str_isalnum(self): return type(self)(pc.utf8_is_alnum(self._pa_array)) @@ -2171,18 +2113,9 @@ def _str_isspace(self): def _str_istitle(self): return type(self)(pc.utf8_is_title(self._pa_array)) - def _str_capitalize(self): - return type(self)(pc.utf8_capitalize(self._pa_array)) - - def _str_title(self): - return type(self)(pc.utf8_title(self._pa_array)) - def _str_isupper(self): return type(self)(pc.utf8_is_upper(self._pa_array)) - def _str_swapcase(self): - return type(self)(pc.utf8_swapcase(self._pa_array)) - def _str_len(self): return type(self)(pc.utf8_length(self._pa_array)) @@ -2223,12 +2156,6 @@ def _str_removeprefix(self, prefix: str): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_casefold(self): predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 25f1c2ec6ce4f..1e285f90e9fea 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -76,7 +76,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow"}, optional + storage : {"python", "pyarrow", "pyarrow_numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. Attributes @@ -108,11 +108,11 @@ def na_value(self) -> libmissing.NAType: def __init__(self, storage=None) -> None: if storage is None: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow"}: + if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." ) - if storage == "pyarrow" and pa_version_under7p0: + if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0: raise ImportError( "pyarrow>=7.0.0 is required for PyArrow backed StringArray." ) @@ -160,6 +160,8 @@ def construct_from_string(cls, string): return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") + elif string == "string[pyarrow_numpy]": + return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -176,12 +178,17 @@ def construct_array_type( # type: ignore[override] ------- type """ - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, + ) if self.storage == "python": return StringArray - else: + elif self.storage == "pyarrow": return ArrowStringArray + else: + return ArrowStringArrayNumpySemantics def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -193,6 +200,10 @@ def __from_arrow__( from pandas.core.arrays.string_arrow import ArrowStringArray return ArrowStringArray(array) + elif self.storage == "pyarrow_numpy": + from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + + return ArrowStringArrayNumpySemantics(array) else: import pyarrow diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 474351fc6b1d7..20dc8978b63df 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial import re from typing import ( TYPE_CHECKING, @@ -27,6 +28,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype @@ -114,10 +116,11 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] _result_converter = lambda result: BooleanDtype().__from_arrow__(result) + _storage = "pyarrow" def __init__(self, values) -> None: super().__init__(values) - self._dtype = StringDtype(storage="pyarrow") + self._dtype = StringDtype(storage=self._storage) if not pa.types.is_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) @@ -145,7 +148,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" + assert isinstance(dtype, StringDtype) and dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ) if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -438,6 +444,12 @@ def _str_rstrip(self, to_strip=None): class ArrowStringArrayNumpySemantics(ArrowStringArray): _result_converter = lambda result: result.to_numpy() + _storage = "pyarrow_numpy" + + def __getattribute__(self, item): + if item in ArrowStringArrayMixin.__dict__: + return partial(getattr(ArrowStringArrayMixin, item), self) + return super().__getattribute__(item) def _str_len(self): result = pc.utf8_length(self._pa_array) @@ -504,3 +516,44 @@ def g(x): if convert and result.dtype == object: result = lib.maybe_convert_objects(result) return result + + def _str_count(self, pat: str, flags: int = 0): + if flags: + return super()._str_count(pat, flags) + return pc.count_substring_regex(self._pa_array, pat).to_numpy() + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start != 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = pc.equal(result, -1) + offset_result = pc.add(result, end - start) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + return super()._str_find(sub, start, end) + return type(self)(result) + + def _str_split( + self, + pat: str | None = None, + n: int | None = -1, + expand: bool = False, + regex: bool | None = None, + ): + if n in {-1, 0}: + n = None + if regex: + split_func = pc.split_pattern_regex + else: + split_func = pc.split_pattern + return split_func(self._pa_array, pat, max_splits=n).to_numpy() + + def _str_rsplit(self, pat: str | None = None, n: int | None = -1): + if n in {-1, 0}: + n = None + return pc.split_pattern( + self._pa_array, pat, max_splits=n, reverse=True + ).to_numpy() From 2dbcfb0b91ab953ea933afecf9d2003d86873b31 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 2023 21:57:53 +0200 Subject: [PATCH 03/20] Implement Arrow String Array that is compatible with NumPy semantics --- pandas/conftest.py | 2 + pandas/core/arrays/_arrow_string_mixins.py | 25 +--- pandas/core/arrays/arrow/array.py | 26 +++- pandas/core/arrays/string_arrow.py | 163 ++++++++++----------- pandas/core/config_init.py | 2 +- pandas/core/strings/accessor.py | 4 +- pandas/tests/arrays/string_/test_string.py | 100 +++++++++---- pandas/tests/arrays/test_datetimelike.py | 7 +- pandas/tests/extension/base/methods.py | 3 + pandas/tests/extension/test_string.py | 8 +- pandas/tests/io/conftest.py | 16 ++ pandas/tests/strings/__init__.py | 1 + pandas/tests/strings/test_case_justify.py | 2 + pandas/tests/strings/test_find_replace.py | 63 ++++---- pandas/tests/strings/test_strings.py | 18 ++- 15 files changed, 261 insertions(+), 179 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index f756da82157b8..e0aae234144f5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1321,6 +1321,7 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1380,6 +1381,7 @@ def object_dtype(request): "object", "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), ] ) def any_string_dtype(request): diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 62857285cadef..29ea4c6dc221d 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -2,6 +2,8 @@ from typing import Literal +import numpy as np + from pandas.compat import pa_version_under7p0 if not pa_version_under7p0: @@ -48,27 +50,6 @@ def _str_get(self, i: int): result = pc.if_else(not_out_of_bounds, selected, null_value) return type(self)(result) - def _str_partition(self, sep: str, expand: bool): - predicate = lambda val: val.partition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - - def _str_rpartition(self, sep: str, expand: bool): - predicate = lambda val: val.rpartition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - def _str_slice_replace( self, start: int | None = None, stop: int | None = None, repl: str | None = None ): @@ -76,6 +57,8 @@ def _str_slice_replace( repl = "" if start is None: start = 0 + if stop is None: + stop = np.iinfo(np.int64).max return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) def _str_capitalize(self): diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e83ce450218a5..d4c68c6870681 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -512,7 +512,10 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + if self._dtype.name == "string" and self._dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ): pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype @@ -1997,6 +2000,27 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + + def _str_partition(self, sep: str, expand: bool): + predicate = lambda val: val.partition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_rpartition(self, sep: str, expand: bool): + predicate = lambda val: val.rpartition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + def _str_contains( self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True ): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 20dc8978b63df..099e49980d27e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -115,7 +115,9 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # error: Incompatible types in assignment (expression has type "StringDtype", # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] - _result_converter = lambda result: BooleanDtype().__from_arrow__(result) + _result_converter = lambda _, result, **kwargs: BooleanDtype().__from_arrow__( + result + ) _storage = "pyarrow" def __init__(self, values) -> None: @@ -320,7 +322,7 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._result_converter(result) + result = self._result_converter(result, na=na) if not isna(na): result[isna(result)] = bool(na) return result @@ -329,7 +331,7 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = self._result_converter_(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -443,84 +445,87 @@ def _str_rstrip(self, to_strip=None): class ArrowStringArrayNumpySemantics(ArrowStringArray): - _result_converter = lambda result: result.to_numpy() + # _result_converter = lambda _, result: result.to_numpy(na_value=np.nan) _storage = "pyarrow_numpy" + @staticmethod + def _result_converter(values, na=None): + if not isna(na): + values = values.fill_null(bool(na)) + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) + def __getattribute__(self, item): if item in ArrowStringArrayMixin.__dict__: return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) - def _str_len(self): - result = pc.utf8_length(self._pa_array) - return result.to_numpy() - def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True ): - """ - Map a callable over valid elements of the array. - - Parameters - ---------- - f : Callable - A function to call on each non-NA element. - na_value : Scalar, optional - The value to set for NA values. Might also be used for the - fill value if the callable `f` raises an exception. - This defaults to ``self._str_na_value`` which is ``np.nan`` - for object-dtype and Categorical and ``pd.NA`` for StringArray. - dtype : Dtype, optional - The dtype of the result array. - convert : bool, default True - Whether to call `maybe_convert_objects` on the resulting ndarray - """ if dtype is None: - dtype = np.dtype("object") + dtype = self.dtype if na_value is None: - na_value = self._str_na_value - - if not len(self): - return np.array([], dtype=dtype) - - arr = np.asarray(self, dtype=object) - mask = isna(arr) - map_convert = convert and not np.all(mask) - try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) - except (TypeError, AttributeError) as err: - # Reraise the exception if callable `f` got wrong number of args. - # The user may want to be warned by this, instead of getting NaN - p_err = ( - r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " - r"(?(3)required )positional arguments?" + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + if is_integer_dtype(dtype): + na_value = np.nan + else: + na_value = False + try: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), + ) + return result + + except ValueError: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + ) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value ) + result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + return type(self)(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) - if len(err.args) >= 1 and re.search(p_err, err.args[0]): - # FIXME: this should be totally avoidable - raise err - - def g(x): - # This type of fallback behavior can be removed once - # we remove object-dtype .str accessor. - try: - return f(x) - except (TypeError, AttributeError): - return na_value - - return self._str_map(g, na_value=na_value, dtype=dtype) - if not isinstance(result, np.ndarray): - return result - if na_value is not np.nan: - np.putmask(result, mask, na_value) - if convert and result.dtype == object: - result = lib.maybe_convert_objects(result) + def _convert_int_dtype(self, result): + if result.dtype == np.int32: + result = result.astype(np.int64) return result def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) - return pc.count_substring_regex(self._pa_array, pat).to_numpy() + result = pc.count_substring_regex(self._pa_array, pat).to_numpy() + return self._convert_int_dtype(result) + + def _str_len(self): + result = pc.utf8_length(self._pa_array).to_numpy() + return self._convert_int_dtype(result) def _str_find(self, sub: str, start: int = 0, end: int | None = None): if start != 0 and end is not None: @@ -534,26 +539,16 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): result = pc.find_substring(slices, sub) else: return super()._str_find(sub, start, end) - return type(self)(result) + return self._convert_int_dtype(result.to_numpy()) - def _str_split( - self, - pat: str | None = None, - n: int | None = -1, - expand: bool = False, - regex: bool | None = None, - ): - if n in {-1, 0}: - n = None - if regex: - split_func = pc.split_pattern_regex - else: - split_func = pc.split_pattern - return split_func(self._pa_array, pat, max_splits=n).to_numpy() - - def _str_rsplit(self, pat: str | None = None, n: int | None = -1): - if n in {-1, 0}: - n = None - return pc.split_pattern( - self._pa_array, pat, max_splits=n, reverse=True - ).to_numpy() + def _cmp_method(self, other, op): + result = super()._cmp_method(other, op) + return result.to_numpy(na_value=False) + + def value_counts(self, dropna: bool = True): + from pandas import Series + + result = super().value_counts(dropna) + return Series( + result._values.to_numpy(), index=result.index, name=result.name, copy=False + ) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 27e9bf8958ab0..745689ab1fcc8 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -500,7 +500,7 @@ def use_inf_as_na_cb(key) -> None: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), ) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index e59369db776da..6d9470eb730c5 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -145,7 +145,9 @@ def _map_and_wrap(name: str | None, docstring: str | None): @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): result = getattr(self._data.array, f"_str_{name}")() - return self._wrap_result(result) + return self._wrap_result( + result, returns_string=name not in ("isnumeric", "isdecimal") + ) wrapper.__doc__ = docstring return wrapper diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index cfd3314eb5944..b65fc52053414 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -9,7 +9,10 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays.string_arrow import ArrowStringArray +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, +) from pandas.util.version import Version @@ -33,7 +36,12 @@ def test_repr(dtype): expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" + if dtype.storage == "pyarrow": + arr_name = "ArrowStringArray" + elif dtype.storage == "pyarrow_numpy": + arr_name = "ArrowStringArrayNumpySemantics" + else: + arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected @@ -116,7 +124,7 @@ def test_add(dtype): def test_add_2d(dtype, request): - if dtype.storage == "pyarrow": + if dtype.storage in ("pyarrow", "pyarrow_numpy"): reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) request.node.add_marker(mark) @@ -145,7 +153,7 @@ def test_add_sequence(dtype): def test_mul(dtype, request): - if dtype.storage == "pyarrow": + if dtype.storage in ("pyarrow", "pyarrow_numpy"): reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason) request.node.add_marker(mark) @@ -195,19 +203,30 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == "pyarrow_numpy": + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = pd.array(expected, dtype="boolean").to_numpy(na_value=False) + tm.assert_numpy_array_equal(result, expected) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_pd_na(comparison_op, dtype): op_name = f"__{comparison_op.__name__}__" a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False], dtype=object) + tm.assert_numpy_array_equal(result, expected) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_not_string(comparison_op, dtype): @@ -223,12 +242,21 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): return result = getattr(a, op_name)(other) - expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ - op_name - ] - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array(expected_data, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected_data = { + "__eq__": [False, False, False], + "__ne__": [True, False, True], + }[op_name] + expected = np.array(expected_data, dtype=object) + tm.assert_numpy_array_equal(result, expected) + else: + expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ + op_name + ] + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array(expected_data, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_array(comparison_op, dtype): @@ -237,15 +265,25 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.full(len(a), fill_value=None, dtype="object") - expected[-1] = getattr(other[-1], op_name)(a[-1]) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False], dtype=object) + expected[-1] = getattr(other[-1], op_name)(a[-1]) + tm.assert_numpy_array_equal(result, expected) - result = getattr(a, op_name)(pd.NA) - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + result = getattr(a, op_name)(pd.NA) + expected = np.array([False, False, False], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.full(len(a), fill_value=None, dtype="object") + expected[-1] = getattr(other[-1], op_name)(a[-1]) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_constructor_raises(cls): @@ -297,7 +335,7 @@ def test_from_sequence_no_mutate(copy, cls, request): result = cls._from_sequence(nan_arr, copy=copy) - if cls is ArrowStringArray: + if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) @@ -370,7 +408,7 @@ def test_min_max(method, skipna, dtype, request): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", [pd.Series, pd.array]) def test_min_max_numpy(method, box, dtype, request): - if dtype.storage == "pyarrow" and box is pd.array: + if dtype.storage in ("pyarrow", "pyarrow_numpy") and box is pd.array: if box is pd.array: reason = "'<=' not supported between instances of 'str' and 'NoneType'" else: @@ -397,7 +435,7 @@ def test_fillna_args(dtype, request): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage == "pyarrow": + if dtype.storage in ("pyarrow", "pyarrow_numpy"): msg = "Invalid value '1' for dtype string" else: msg = "Cannot set non-string value '1' into a StringArray." @@ -455,6 +493,8 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2): def test_value_counts_na(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "int64[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = "int64" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -470,6 +510,8 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "double[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = np.float64 else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) @@ -506,7 +548,7 @@ def test_use_inf_as_na(values, expected, dtype): def test_memory_usage(dtype): # GH 33963 - if dtype.storage == "pyarrow": + if dtype.storage in ("pyarrow", "pyarrow_numpy"): pytest.skip(f"not applicable for {dtype.storage}") series = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 9eee2e0bea687..20530e37116f2 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -324,7 +324,12 @@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage): ): arr.searchsorted("foo") - arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray" + if string_storage == "python": + arr_type = "StringArray" + elif string_storage == "pyarrow": + arr_type = "ArrowStringArray" + else: + arr_type = "ArrowStringArrayNumpySemantics" with pd.option_context("string_storage", string_storage): with pytest.raises( diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 2dd62a4ca7538..16059155a7a8f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -70,6 +70,9 @@ def test_value_counts_with_normalize(self, data): ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") + elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": + # TODO: avoid special-casing + expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 6597ff84e3ca4..6088cd211d829 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -104,7 +104,7 @@ def test_is_not_string_type(self, dtype): class TestInterface(base.BaseInterfaceTests): def test_view(self, data, request): - if data.dtype.storage == "pyarrow": + if data.dtype.storage in ("pyarrow", "pyarrow_numpy"): pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_view(data) @@ -117,7 +117,7 @@ def test_from_dtype(self, data): class TestReshaping(base.BaseReshapingTests): def test_transpose(self, data, request): - if data.dtype.storage == "pyarrow": + if data.dtype.storage in ("pyarrow", "pyarrow_numpy"): pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_transpose(data) @@ -128,7 +128,7 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): def test_setitem_preserves_views(self, data, request): - if data.dtype.storage == "pyarrow": + if data.dtype.storage in ("pyarrow", "pyarrow_numpy"): pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_setitem_preserves_views(data) @@ -184,6 +184,8 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # attribute "storage" if dtype.storage == "pyarrow": # type: ignore[union-attr] cast_to = "boolean[pyarrow]" + elif dtype.storage == "pyarrow_numpy": + cast_to = np.bool_ else: cast_to = "boolean" return pointwise_result.astype(cast_to) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 170e2f61e7d4a..701bfe3767db4 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -234,3 +234,19 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param[0] + + +@pytest.fixture( + params=[ + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + ] +) +def string_storage(request): + """ + Parametrized fixture for pd.options.mode.string_storage. + + * 'python' + * 'pyarrow' + """ + return request.param diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index e69de29bb2d1d..326ae24410502 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -0,0 +1 @@ +object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index ced941187f548..3de97cccd1d72 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -278,6 +278,8 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): + if any_string_dtype == "string[pyarrow_numpy]": + pytest.skip("Arrow logic is different") s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index c3cc8b3643ed2..f62299f53aebe 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -11,6 +11,7 @@ Series, _testing as tm, ) +from pandas.tests.strings import object_pyarrow_numpy # -------------------------------------------------------------------------------------- # str.contains @@ -25,7 +26,7 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( np.array([False, np.nan, True, True, False], dtype=np.object_), dtype=expected_dtype, @@ -44,7 +45,7 @@ def test_contains(any_string_dtype): dtype=any_string_dtype, ) result = values.str.contains(pat) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -71,14 +72,14 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -163,7 +164,7 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( [False, False, False, True, True, False, np.nan, False, False, True], dtype=expected_dtype, @@ -204,7 +205,7 @@ def test_contains_nan(any_string_dtype): s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -215,12 +216,14 @@ def test_contains_nan(any_string_dtype): result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype == "string[pyarrow_numpy]": + expected = Series([True, True, True], dtype=np.bool_) else: expected = Series([True, True, True], dtype="boolean") tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -379,7 +382,7 @@ def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -402,7 +405,7 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) @@ -423,7 +426,7 @@ def test_replace_callable_raises(any_string_dtype, repl): ) with pytest.raises(TypeError, match=msg): with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): values.str.replace("a", repl, regex=True) @@ -434,7 +437,7 @@ def test_replace_callable_named_groups(any_string_dtype): pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) @@ -448,14 +451,14 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) @@ -477,7 +480,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype): expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -507,7 +510,7 @@ def test_replace_compiled_regex_callable(any_string_dtype): repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) @@ -558,7 +561,7 @@ def test_replace_moar(any_string_dtype): tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace("A", "YYY", case=False) expected = Series( @@ -579,7 +582,7 @@ def test_replace_moar(any_string_dtype): tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( @@ -605,14 +608,14 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) @@ -648,7 +651,7 @@ def test_replace_regex_single_character(regex, any_string_dtype): def test_match(any_string_dtype): # New match behavior introduced in 0.13 - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") @@ -703,12 +706,12 @@ def test_match_na_kwarg(any_string_dtype): s = Series(["a", "b", np.nan], dtype=any_string_dtype) result = s.str.match("a", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -716,7 +719,7 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -732,7 +735,7 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -742,14 +745,14 @@ def test_fullmatch_na_kwarg(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) @@ -762,7 +765,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -825,7 +828,7 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) @@ -877,7 +880,7 @@ def test_find_nan(any_string_dtype): ser = Series( ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype) @@ -944,7 +947,7 @@ def test_flags_kwarg(any_string_dtype): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - using_pyarrow = any_string_dtype == "string[pyarrow]" + using_pyarrow = "string[pyarrow" in any_string_dtype result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1e573bdfe8fb5..bdae496cdc121 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -14,6 +14,7 @@ ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods +from pandas.tests.strings import object_pyarrow_numpy @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) @@ -40,7 +41,7 @@ def test_iter_raises(): def test_count(any_string_dtype): ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype) result = ser.str.count("f[o]+") - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" expected = Series([1, 2, np.nan, 4], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -91,7 +92,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) - if any_string_dtype == "object": + if any_string_dtype in object_pyarrow_numpy: empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) else: @@ -205,7 +206,7 @@ def test_ismethods(method, expected, any_string_dtype): ser = Series( ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype ) - expected_dtype = "bool" if any_string_dtype == "object" else "boolean" + expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -230,7 +231,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): ser = Series( ["A", "3", "¼", "★", "፸", "3", "four"], dtype=any_string_dtype # noqa: RUF001 ) - expected_dtype = "bool" if any_string_dtype == "object" else "boolean" + expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -250,7 +251,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -280,7 +281,7 @@ def test_len(any_string_dtype): dtype=any_string_dtype, ) result = ser.str.len() - expected_dtype = "float64" if any_string_dtype == "object" else "Int64" + expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64" expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -309,7 +310,8 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype == "object" else "Int64" + + expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" expected = index_or_series(expected, dtype=expected_dtype) result = getattr(obj.str, method)(sub, start, end) @@ -350,7 +352,7 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): ) def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype) - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = getattr(ser.str, method)("b") expected = Series(exp + [np.nan], dtype=expected_dtype) From d9e61e542d9252928d8119f65894381f749de0c6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 2023 22:00:05 +0200 Subject: [PATCH 04/20] Move methods --- pandas/core/arrays/arrow/array.py | 42 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d4c68c6870681..b2b1b92e569e3 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2000,27 +2000,6 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - - def _str_partition(self, sep: str, expand: bool): - predicate = lambda val: val.partition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - - def _str_rpartition(self, sep: str, expand: bool): - predicate = lambda val: val.rpartition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True ): @@ -2113,6 +2092,27 @@ def _str_join(self, sep: str): result = self._pa_array return type(self)(pc.binary_join(result, sep)) + def _str_partition(self, sep: str, expand: bool): + predicate = lambda val: val.partition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_rpartition(self, sep: str, expand: bool): + predicate = lambda val: val.rpartition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_isalnum(self): return type(self)(pc.utf8_is_alnum(self._pa_array)) From 3188c25cc74859694e7991e54f9a4f1333e2d71d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 2023 22:58:17 +0200 Subject: [PATCH 05/20] Refactor --- pandas/core/arrays/string_arrow.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 099e49980d27e..aa9deae0b1964 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -115,9 +115,6 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # error: Incompatible types in assignment (expression has type "StringDtype", # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] - _result_converter = lambda _, result, **kwargs: BooleanDtype().__from_arrow__( - result - ) _storage = "pyarrow" def __init__(self, values) -> None: @@ -187,6 +184,10 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) + @staticmethod + def _result_converter(values, **kwargs): + return BooleanDtype().__from_arrow__(values) + def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" if is_scalar(value): From cd19bfb43d077e379b12b80d0c30e51724b6f5ec Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 14 Aug 2023 12:16:31 +0200 Subject: [PATCH 06/20] Refactor --- pandas/tests/strings/test_strings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index bdae496cdc121..4315835b70a40 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -310,7 +310,6 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" expected = index_or_series(expected, dtype=expected_dtype) From c73c6b0e964cf1694feefe2a2c390f7e3dd5f0b8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 14 Aug 2023 12:19:19 +0200 Subject: [PATCH 07/20] Remove --- pandas/core/arrays/arrow/array.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1ff3b31389bb2..dea502c6dc090 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -250,12 +250,6 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: ) self._dtype = ArrowDtype(self._pa_array.type) - def __dir__(self): - o = set(dir(type(self))) - o.update(self.__dict__) - o.update(set(dir(ArrowStringArrayMixin))) - return list(o) - @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): """ From 6b2630902c93a0ea49a883fc3bbd76d5360db364 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 14 Aug 2023 13:46:47 +0200 Subject: [PATCH 08/20] Fix --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index aa9deae0b1964..bc9add038301d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -456,7 +456,7 @@ def _result_converter(values, na=None): return ArrowExtensionArray(values).to_numpy(na_value=np.nan) def __getattribute__(self, item): - if item in ArrowStringArrayMixin.__dict__: + if item in ArrowStringArrayMixin.__dict__ and item != "_pa_array": return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) From da6d67c7447a6c51262c6c4a556684ca41fc08c8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 14 Aug 2023 17:21:06 +0200 Subject: [PATCH 09/20] Update --- pandas/tests/arrays/string_/test_string.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b65fc52053414..3d66d45151e05 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under12p0 + from pandas.core.dtypes.common import is_dtype_equal import pandas as pd @@ -13,7 +15,6 @@ ArrowStringArray, ArrowStringArrayNumpySemantics, ) -from pandas.util.version import Version @pytest.fixture @@ -450,7 +451,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype.storage == "pyarrow" and Version(pa.__version__) <= Version("11.0.0"): + if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) assert arr.equals(expected) From 6cf263912ff682ac688a945fc27db454a6851f5a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 16 Aug 2023 22:40:39 +0200 Subject: [PATCH 10/20] Fix --- pandas/tests/strings/test_find_replace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index e2d4619290a93..4cbfe97ad2ab4 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -19,7 +19,7 @@ def using_pyarrow(dtype): - return dtype in ("string[pyarrow]",) + return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") def test_contains(any_string_dtype): From 0c260fb903b62c6049c87d7bd90705da3c2c514d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 10:51:31 +0200 Subject: [PATCH 11/20] Update pandas/core/arrays/string_arrow.py Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 2f9b2651103d9..d0baae368fc1c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -446,7 +446,6 @@ def _str_rstrip(self, to_strip=None): class ArrowStringArrayNumpySemantics(ArrowStringArray): - # _result_converter = lambda _, result: result.to_numpy(na_value=np.nan) _storage = "pyarrow_numpy" @staticmethod From 8df070ad7b938b35dee0c7e73c29107b0c018b42 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 10:53:57 +0200 Subject: [PATCH 12/20] Add comment --- pandas/core/arrays/string_arrow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d0baae368fc1c..b571c06d5b278 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -455,6 +455,8 @@ def _result_converter(values, na=None): return ArrowExtensionArray(values).to_numpy(na_value=np.nan) def __getattribute__(self, item): + # ArrowStringArray and we both inherit from ArrowExtensionArray, which + # creates inheritance problems (Diamnond inheritance) if item in ArrowStringArrayMixin.__dict__ and item != "_pa_array": return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) From 1e732c9760f1f62b1edf22ecf6aa31493e6a48b2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 11:01:52 +0200 Subject: [PATCH 13/20] Use bool instead of object --- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/arrays/string_/test_string.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b571c06d5b278..0bce3833908e9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -545,7 +545,7 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): def _cmp_method(self, other, op): result = super()._cmp_method(other, op) - return result.to_numpy(na_value=False) + return result.to_numpy(na_value=False).astype(np.bool_) def value_counts(self, dropna: bool = True): from pandas import Series diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3d66d45151e05..f53f210f20172 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -206,7 +206,11 @@ def test_comparison_methods_scalar(comparison_op, dtype): result = getattr(a, op_name)(other) if dtype.storage == "pyarrow_numpy": expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) - expected = pd.array(expected, dtype="boolean").to_numpy(na_value=False) + expected = ( + pd.array(expected, dtype="boolean") + .to_numpy(na_value=False) + .astype(np.bool_) + ) tm.assert_numpy_array_equal(result, expected) else: expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" @@ -221,7 +225,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): result = getattr(a, op_name)(pd.NA) if dtype.storage == "pyarrow_numpy": - expected = np.array([False, False, False], dtype=object) + expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) else: expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" @@ -249,7 +253,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): "__eq__": [False, False, False], "__ne__": [True, False, True], }[op_name] - expected = np.array(expected_data, dtype=object) + expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) else: expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ @@ -267,12 +271,12 @@ def test_comparison_methods_array(comparison_op, dtype): other = [None, None, "c"] result = getattr(a, op_name)(other) if dtype.storage == "pyarrow_numpy": - expected = np.array([False, False, False], dtype=object) + expected = np.array([False, False, False]) expected[-1] = getattr(other[-1], op_name)(a[-1]) tm.assert_numpy_array_equal(result, expected) result = getattr(a, op_name)(pd.NA) - expected = np.array([False, False, False], dtype=object) + expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) else: From 3a913e1ecb17799e9bc8f49860ada607722db2b2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 11:05:53 +0200 Subject: [PATCH 14/20] Update docstring --- pandas/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/conftest.py b/pandas/conftest.py index d0e0fca40de57..99b7bcf715f7e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1330,6 +1330,7 @@ def string_storage(request): * 'python' * 'pyarrow' + * 'pyarrow_numpy' """ return request.param From ec56cefad6cf5d51a63271a0bec1f4d67c278b86 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 12:08:28 +0200 Subject: [PATCH 15/20] Fix --- pandas/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 99b7bcf715f7e..10826f50d1fe1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -2003,4 +2003,4 @@ def warsaw(request) -> str: @pytest.fixture() def arrow_string_storage(): - return ("pyarrow",) + return ("pyarrow", "pyarrow_numpy") From 31a59c4c95bd9b5e3a91be7879c683769e7779d0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 13:20:21 +0200 Subject: [PATCH 16/20] Fix typing --- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/extension/test_string.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 0bce3833908e9..03a3911cc6214 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -484,7 +484,7 @@ def _str_map( mask.view("uint8"), convert=False, na_value=na_value, - dtype=np.dtype(dtype), + dtype=np.dtype(dtype), # type: ignore[arg-type] ) return result diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 4809046f7214c..fd8e06ce148ed 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -176,8 +176,8 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # attribute "storage" if dtype.storage == "pyarrow": # type: ignore[union-attr] cast_to = "boolean[pyarrow]" - elif dtype.storage == "pyarrow_numpy": - cast_to = np.bool_ + elif dtype.storage == "pyarrow_numpy": # type: ignore[union-attr] + cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" return pointwise_result.astype(cast_to) From ed779679566f89598e570fa02d3adcb55ecebbd5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 16:04:11 +0200 Subject: [PATCH 17/20] Update pandas/core/arrays/string_arrow.py Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 03a3911cc6214..6259c4630ab19 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -545,7 +545,7 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): def _cmp_method(self, other, op): result = super()._cmp_method(other, op) - return result.to_numpy(na_value=False).astype(np.bool_) + return result.to_numpy(np.bool_, na_value=False) def value_counts(self, dropna: bool = True): from pandas import Series From d05c51d1ac4c5648ca3faaba644bc89375f6c12d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 22 Aug 2023 17:37:56 +0200 Subject: [PATCH 18/20] Update test_case_justify.py --- pandas/tests/strings/test_case_justify.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index 3de97cccd1d72..d3cf479d9241c 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -279,7 +279,10 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): if any_string_dtype == "string[pyarrow_numpy]": - pytest.skip("Arrow logic is different") + pytest.skip( + "Arrow logic is different, " + "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", + ) s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") From aad3d2e6535843a41645edbd52c7d0f2503b63bb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 22 Aug 2023 17:41:31 +0200 Subject: [PATCH 19/20] Update --- pandas/core/arrays/string_arrow.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6259c4630ab19..1df0be12a8127 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -184,8 +184,8 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - @staticmethod - def _result_converter(values, **kwargs): + @classmethod + def _result_converter(cls, values, na=None): return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -448,15 +448,15 @@ def _str_rstrip(self, to_strip=None): class ArrowStringArrayNumpySemantics(ArrowStringArray): _storage = "pyarrow_numpy" - @staticmethod - def _result_converter(values, na=None): + @classmethod + def _result_converter(cls, values, na=None): if not isna(na): values = values.fill_null(bool(na)) return ArrowExtensionArray(values).to_numpy(na_value=np.nan) def __getattribute__(self, item): # ArrowStringArray and we both inherit from ArrowExtensionArray, which - # creates inheritance problems (Diamnond inheritance) + # creates inheritance problems (Diamond inheritance) if item in ArrowStringArrayMixin.__dict__ and item != "_pa_array": return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) From 5383eebdbf3fd1940459172839a66402a9ae9cba Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 22 Aug 2023 20:59:02 +0200 Subject: [PATCH 20/20] Update pandas/tests/strings/test_case_justify.py Co-authored-by: Joris Van den Bossche --- pandas/tests/strings/test_case_justify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index d3cf479d9241c..1dee25e631648 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -282,7 +282,7 @@ def test_center_ljust_rjust_fillchar(any_string_dtype): pytest.skip( "Arrow logic is different, " "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", - ) + ) s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X")