From b24afc9ba03939403e48f9d97e0b6d2a04662799 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 2023 14:25:27 +0200 Subject: [PATCH 01/23] Start new string array --- pandas/core/arrays/string_arrow.py | 95 ++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4a70fcf6b5a93..474351fc6b1d7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -113,6 +113,7 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # error: Incompatible types in assignment (expression has type "StringDtype", # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] + _result_converter = lambda result: BooleanDtype().__from_arrow__(result) def __init__(self, values) -> None: super().__init__(values) @@ -313,7 +314,7 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -322,7 +323,7 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter_(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -331,7 +332,7 @@ def _str_endswith(self, pat: str, na=None): result = pc.ends_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = BooleanDtype().__from_arrow__(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -369,39 +370,39 @@ def _str_fullmatch( def _str_isalnum(self): result 
= pc.utf8_is_alnum(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isdigit(self): result = pc.utf8_is_digit(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_islower(self): result = pc.utf8_is_lower(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isnumeric(self): result = pc.utf8_is_numeric(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isspace(self): result = pc.utf8_is_space(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_istitle(self): result = pc.utf8_is_title(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) - return BooleanDtype().__from_arrow__(result) + return self._result_converter(result) def _str_len(self): result = pc.utf8_length(self._pa_array) @@ -433,3 +434,73 @@ def _str_rstrip(self, to_strip=None): else: result = pc.utf8_rtrim(self._pa_array, characters=to_strip) return type(self)(result) + + +class ArrowStringArrayNumpySemantics(ArrowStringArray): + _result_converter = lambda result: result.to_numpy() + + def _str_len(self): + result = pc.utf8_length(self._pa_array) + return result.to_numpy() + + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): + """ + Map a callable over valid elements of the array. 
+ + Parameters + ---------- + f : Callable + A function to call on each non-NA element. + na_value : Scalar, optional + The value to set for NA values. Might also be used for the + fill value if the callable `f` raises an exception. + This defaults to ``self._str_na_value`` which is ``np.nan`` + for object-dtype and Categorical and ``pd.NA`` for StringArray. + dtype : Dtype, optional + The dtype of the result array. + convert : bool, default True + Whether to call `maybe_convert_objects` on the resulting ndarray + """ + if dtype is None: + dtype = np.dtype("object") + if na_value is None: + na_value = self._str_na_value + + if not len(self): + return np.array([], dtype=dtype) + + arr = np.asarray(self, dtype=object) + mask = isna(arr) + map_convert = convert and not np.all(mask) + try: + result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) + except (TypeError, AttributeError) as err: + # Reraise the exception if callable `f` got wrong number of args. + # The user may want to be warned by this, instead of getting NaN + p_err = ( + r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " + r"(?(3)required )positional arguments?" + ) + + if len(err.args) >= 1 and re.search(p_err, err.args[0]): + # FIXME: this should be totally avoidable + raise err + + def g(x): + # This type of fallback behavior can be removed once + # we remove object-dtype .str accessor. 
+ try: + return f(x) + except (TypeError, AttributeError): + return na_value + + return self._str_map(g, na_value=na_value, dtype=dtype) + if not isinstance(result, np.ndarray): + return result + if na_value is not np.nan: + np.putmask(result, mask, na_value) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result From b306c6f1550279687a9d1aaa2364e6546e5d905b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 2023 18:30:11 +0200 Subject: [PATCH 02/23] Add missing methods --- pandas/core/arrays/_arrow_string_mixins.py | 94 +++++++++++++++++++++ pandas/core/arrays/arrow/array.py | 95 +++------------------- pandas/core/arrays/string_.py | 21 +++-- pandas/core/arrays/string_arrow.py | 57 ++++++++++++- 4 files changed, 176 insertions(+), 91 deletions(-) create mode 100644 pandas/core/arrays/_arrow_string_mixins.py diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py new file mode 100644 index 0000000000000..62857285cadef --- /dev/null +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from typing import Literal + +from pandas.compat import pa_version_under7p0 + +if not pa_version_under7p0: + import pyarrow as pa + import pyarrow.compute as pc + + +class ArrowStringArrayMixin: + def _str_pad( + self, + width: int, + side: Literal["left", "right", "both"] = "left", + fillchar: str = " ", + ): + if side == "left": + pa_pad = pc.utf8_lpad + elif side == "right": + pa_pad = pc.utf8_rpad + elif side == "both": + pa_pad = pc.utf8_center + else: + raise ValueError( + f"Invalid side: {side}. 
Side must be one of 'left', 'right', 'both'" + ) + return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) + + def _str_get(self, i: int): + lengths = pc.utf8_length(self._pa_array) + if i >= 0: + out_of_bounds = pc.greater_equal(i, lengths) + start = i + stop = i + 1 + step = 1 + else: + out_of_bounds = pc.greater(-i, lengths) + start = i + stop = i - 1 + step = -1 + not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) + selected = pc.utf8_slice_codeunits( + self._pa_array, start=start, stop=stop, step=step + ) + null_value = pa.scalar(None, type=self._pa_array.type) + result = pc.if_else(not_out_of_bounds, selected, null_value) + return type(self)(result) + + def _str_partition(self, sep: str, expand: bool): + predicate = lambda val: val.partition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_rpartition(self, sep: str, expand: bool): + predicate = lambda val: val.rpartition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + + def _str_slice_replace( + self, start: int | None = None, stop: int | None = None, repl: str | None = None + ): + if repl is None: + repl = "" + if start is None: + start = 0 + return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) + + def _str_capitalize(self): + return type(self)(pc.utf8_capitalize(self._pa_array)) + + def _str_title(self): + return type(self)(pc.utf8_title(self._pa_array)) + + def _str_swapcase(self): + return type(self)(pc.utf8_swapcase(self._pa_array)) + + def _str_removesuffix(self, suffix: str): + ends_with = pc.ends_with(self._pa_array, pattern=suffix) + removed = 
pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) + result = pc.if_else(ends_with, removed, self._pa_array) + return type(self)(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 88695f11fba59..e83ce450218a5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -42,6 +42,7 @@ from pandas.core import roperator from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.base import ( ExtensionArray, ExtensionArraySupportsAnyAll, @@ -184,7 +185,10 @@ def to_pyarrow_type( class ArrowExtensionArray( - OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods + OpsMixin, + ExtensionArraySupportsAnyAll, + ArrowStringArrayMixin, + BaseStringArrayMethods, ): """ Pandas ExtensionArray backed by a PyArrow ChunkedArray. @@ -246,6 +250,12 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: ) self._dtype = ArrowDtype(self._pa_array.type) + def __dir__(self): + o = set(dir(type(self))) + o.update(self.__dict__) + o.update(set(dir(ArrowStringArrayMixin))) + return list(o) + @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): """ @@ -1987,24 +1997,6 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_pad( - self, - width: int, - side: Literal["left", "right", "both"] = "left", - fillchar: str = " ", - ): - if side == "left": - pa_pad = pc.utf8_lpad - elif side == "right": - pa_pad = pc.utf8_rpad - elif side == "both": - pa_pad = pc.utf8_center - else: - raise ValueError( - f"Invalid side: {side}. 
Side must be one of 'left', 'right', 'both'" - ) - return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar)) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True ): @@ -2089,26 +2081,6 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): ) return type(self)(result) - def _str_get(self, i: int): - lengths = pc.utf8_length(self._pa_array) - if i >= 0: - out_of_bounds = pc.greater_equal(i, lengths) - start = i - stop = i + 1 - step = 1 - else: - out_of_bounds = pc.greater(-i, lengths) - start = i - stop = i - 1 - step = -1 - not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) - selected = pc.utf8_slice_codeunits( - self._pa_array, start=start, stop=stop, step=step - ) - null_value = pa.scalar(None, type=self._pa_array.type) - result = pc.if_else(not_out_of_bounds, selected, null_value) - return type(self)(result) - def _str_join(self, sep: str): if pa.types.is_string(self._pa_array.type): result = self._apply_elementwise(list) @@ -2117,36 +2089,6 @@ def _str_join(self, sep: str): result = self._pa_array return type(self)(pc.binary_join(result, sep)) - def _str_partition(self, sep: str, expand: bool): - predicate = lambda val: val.partition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - - def _str_rpartition(self, sep: str, expand: bool): - predicate = lambda val: val.rpartition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - - def _str_slice_replace( - self, start: int | None = None, stop: int | None = None, repl: str | None = None - ): - if repl is None: - repl = "" - if start is None: - start = 0 - return 
type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) - def _str_isalnum(self): return type(self)(pc.utf8_is_alnum(self._pa_array)) @@ -2171,18 +2113,9 @@ def _str_isspace(self): def _str_istitle(self): return type(self)(pc.utf8_is_title(self._pa_array)) - def _str_capitalize(self): - return type(self)(pc.utf8_capitalize(self._pa_array)) - - def _str_title(self): - return type(self)(pc.utf8_title(self._pa_array)) - def _str_isupper(self): return type(self)(pc.utf8_is_upper(self._pa_array)) - def _str_swapcase(self): - return type(self)(pc.utf8_swapcase(self._pa_array)) - def _str_len(self): return type(self)(pc.utf8_length(self._pa_array)) @@ -2223,12 +2156,6 @@ def _str_removeprefix(self, prefix: str): result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_casefold(self): predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 25f1c2ec6ce4f..1e285f90e9fea 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -76,7 +76,7 @@ class StringDtype(StorageExtensionDtype): Parameters ---------- - storage : {"python", "pyarrow"}, optional + storage : {"python", "pyarrow", "pyarrow_numpy"}, optional If not given, the value of ``pd.options.mode.string_storage``. Attributes @@ -108,11 +108,11 @@ def na_value(self) -> libmissing.NAType: def __init__(self, storage=None) -> None: if storage is None: storage = get_option("mode.string_storage") - if storage not in {"python", "pyarrow"}: + if storage not in {"python", "pyarrow", "pyarrow_numpy"}: raise ValueError( f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." 
) - if storage == "pyarrow" and pa_version_under7p0: + if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under7p0: raise ImportError( "pyarrow>=7.0.0 is required for PyArrow backed StringArray." ) @@ -160,6 +160,8 @@ def construct_from_string(cls, string): return cls(storage="python") elif string == "string[pyarrow]": return cls(storage="pyarrow") + elif string == "string[pyarrow_numpy]": + return cls(storage="pyarrow_numpy") else: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") @@ -176,12 +178,17 @@ def construct_array_type( # type: ignore[override] ------- type """ - from pandas.core.arrays.string_arrow import ArrowStringArray + from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + ArrowStringArrayNumpySemantics, + ) if self.storage == "python": return StringArray - else: + elif self.storage == "pyarrow": return ArrowStringArray + else: + return ArrowStringArrayNumpySemantics def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -193,6 +200,10 @@ def __from_arrow__( from pandas.core.arrays.string_arrow import ArrowStringArray return ArrowStringArray(array) + elif self.storage == "pyarrow_numpy": + from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics + + return ArrowStringArrayNumpySemantics(array) else: import pyarrow diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 474351fc6b1d7..20dc8978b63df 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial import re from typing import ( TYPE_CHECKING, @@ -27,6 +28,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype @@ -114,10 +116,11 @@ class 
ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] _result_converter = lambda result: BooleanDtype().__from_arrow__(result) + _storage = "pyarrow" def __init__(self, values) -> None: super().__init__(values) - self._dtype = StringDtype(storage="pyarrow") + self._dtype = StringDtype(storage=self._storage) if not pa.types.is_string(self._pa_array.type) and not ( pa.types.is_dictionary(self._pa_array.type) @@ -145,7 +148,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False) if dtype and not (isinstance(dtype, str) and dtype == "string"): dtype = pandas_dtype(dtype) - assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow" + assert isinstance(dtype, StringDtype) and dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ) if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -438,6 +444,12 @@ def _str_rstrip(self, to_strip=None): class ArrowStringArrayNumpySemantics(ArrowStringArray): _result_converter = lambda result: result.to_numpy() + _storage = "pyarrow_numpy" + + def __getattribute__(self, item): + if item in ArrowStringArrayMixin.__dict__: + return partial(getattr(ArrowStringArrayMixin, item), self) + return super().__getattribute__(item) def _str_len(self): result = pc.utf8_length(self._pa_array) @@ -504,3 +516,44 @@ def g(x): if convert and result.dtype == object: result = lib.maybe_convert_objects(result) return result + + def _str_count(self, pat: str, flags: int = 0): + if flags: + return super()._str_count(pat, flags) + return pc.count_substring_regex(self._pa_array, pat).to_numpy() + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if start != 0 and end is not None: + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + not_found = 
pc.equal(result, -1) + offset_result = pc.add(result, end - start) + result = pc.if_else(not_found, result, offset_result) + elif start == 0 and end is None: + slices = self._pa_array + result = pc.find_substring(slices, sub) + else: + return super()._str_find(sub, start, end) + return type(self)(result) + + def _str_split( + self, + pat: str | None = None, + n: int | None = -1, + expand: bool = False, + regex: bool | None = None, + ): + if n in {-1, 0}: + n = None + if regex: + split_func = pc.split_pattern_regex + else: + split_func = pc.split_pattern + return split_func(self._pa_array, pat, max_splits=n).to_numpy() + + def _str_rsplit(self, pat: str | None = None, n: int | None = -1): + if n in {-1, 0}: + n = None + return pc.split_pattern( + self._pa_array, pat, max_splits=n, reverse=True + ).to_numpy() From 2dbcfb0b91ab953ea933afecf9d2003d86873b31 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 2023 21:57:53 +0200 Subject: [PATCH 03/23] Implement Arrow String Array that is compatible with NumPy semantics --- pandas/conftest.py | 2 + pandas/core/arrays/_arrow_string_mixins.py | 25 +--- pandas/core/arrays/arrow/array.py | 26 +++- pandas/core/arrays/string_arrow.py | 163 ++++++++++----------- pandas/core/config_init.py | 2 +- pandas/core/strings/accessor.py | 4 +- pandas/tests/arrays/string_/test_string.py | 100 +++++++++---- pandas/tests/arrays/test_datetimelike.py | 7 +- pandas/tests/extension/base/methods.py | 3 + pandas/tests/extension/test_string.py | 8 +- pandas/tests/io/conftest.py | 16 ++ pandas/tests/strings/__init__.py | 1 + pandas/tests/strings/test_case_justify.py | 2 + pandas/tests/strings/test_find_replace.py | 63 ++++---- pandas/tests/strings/test_strings.py | 18 ++- 15 files changed, 261 insertions(+), 179 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index f756da82157b8..e0aae234144f5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1321,6 +1321,7 @@ def nullable_string_dtype(request): params=[ 
"python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1380,6 +1381,7 @@ def object_dtype(request): "object", "string[python]", pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), ] ) def any_string_dtype(request): diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 62857285cadef..29ea4c6dc221d 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -2,6 +2,8 @@ from typing import Literal +import numpy as np + from pandas.compat import pa_version_under7p0 if not pa_version_under7p0: @@ -48,27 +50,6 @@ def _str_get(self, i: int): result = pc.if_else(not_out_of_bounds, selected, null_value) return type(self)(result) - def _str_partition(self, sep: str, expand: bool): - predicate = lambda val: val.partition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - - def _str_rpartition(self, sep: str, expand: bool): - predicate = lambda val: val.rpartition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - def _str_slice_replace( self, start: int | None = None, stop: int | None = None, repl: str | None = None ): @@ -76,6 +57,8 @@ def _str_slice_replace( repl = "" if start is None: start = 0 + if stop is None: + stop = np.iinfo(np.int64).max return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) def _str_capitalize(self): diff --git a/pandas/core/arrays/arrow/array.py 
b/pandas/core/arrays/arrow/array.py index e83ce450218a5..d4c68c6870681 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -512,7 +512,10 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(item, np.ndarray): if not len(item): # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + if self._dtype.name == "string" and self._dtype.storage in ( + "pyarrow", + "pyarrow_numpy", + ): pa_dtype = pa.string() else: pa_dtype = self._dtype.pyarrow_dtype @@ -1997,6 +2000,27 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + + def _str_partition(self, sep: str, expand: bool): + predicate = lambda val: val.partition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_rpartition(self, sep: str, expand: bool): + predicate = lambda val: val.rpartition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + def _str_contains( self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True ): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 20dc8978b63df..099e49980d27e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -115,7 +115,9 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # error: Incompatible types in assignment (expression has type "StringDtype", # base class "ArrowExtensionArray" defined the type as "ArrowDtype") 
_dtype: StringDtype # type: ignore[assignment] - _result_converter = lambda result: BooleanDtype().__from_arrow__(result) + _result_converter = lambda _, result, **kwargs: BooleanDtype().__from_arrow__( + result + ) _storage = "pyarrow" def __init__(self, values) -> None: @@ -320,7 +322,7 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._result_converter(result) + result = self._result_converter(result, na=na) if not isna(na): result[isna(result)] = bool(na) return result @@ -329,7 +331,7 @@ def _str_startswith(self, pat: str, na=None): result = pc.starts_with(self._pa_array, pattern=pat) if not isna(na): result = result.fill_null(na) - result = self._result_converter_(result) + result = self._result_converter(result) if not isna(na): result[isna(result)] = bool(na) return result @@ -443,84 +445,87 @@ def _str_rstrip(self, to_strip=None): class ArrowStringArrayNumpySemantics(ArrowStringArray): - _result_converter = lambda result: result.to_numpy() + # _result_converter = lambda _, result: result.to_numpy(na_value=np.nan) _storage = "pyarrow_numpy" + @staticmethod + def _result_converter(values, na=None): + if not isna(na): + values = values.fill_null(bool(na)) + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) + def __getattribute__(self, item): if item in ArrowStringArrayMixin.__dict__: return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) - def _str_len(self): - result = pc.utf8_length(self._pa_array) - return result.to_numpy() - def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True ): - """ - Map a callable over valid elements of the array. - - Parameters - ---------- - f : Callable - A function to call on each non-NA element. - na_value : Scalar, optional - The value to set for NA values. 
Might also be used for the - fill value if the callable `f` raises an exception. - This defaults to ``self._str_na_value`` which is ``np.nan`` - for object-dtype and Categorical and ``pd.NA`` for StringArray. - dtype : Dtype, optional - The dtype of the result array. - convert : bool, default True - Whether to call `maybe_convert_objects` on the resulting ndarray - """ if dtype is None: - dtype = np.dtype("object") + dtype = self.dtype if na_value is None: - na_value = self._str_na_value - - if not len(self): - return np.array([], dtype=dtype) - - arr = np.asarray(self, dtype=object) - mask = isna(arr) - map_convert = convert and not np.all(mask) - try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) - except (TypeError, AttributeError) as err: - # Reraise the exception if callable `f` got wrong number of args. - # The user may want to be warned by this, instead of getting NaN - p_err = ( - r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " - r"(?(3)required )positional arguments?" + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + if is_integer_dtype(dtype): + na_value = np.nan + else: + na_value = False + try: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), + ) + return result + + except ValueError: + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + ) + if convert and result.dtype == object: + result = lib.maybe_convert_objects(result) + return result + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value ) + result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) + return type(self)(result) + else: + # This is when the result type is object. 
We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) - if len(err.args) >= 1 and re.search(p_err, err.args[0]): - # FIXME: this should be totally avoidable - raise err - - def g(x): - # This type of fallback behavior can be removed once - # we remove object-dtype .str accessor. - try: - return f(x) - except (TypeError, AttributeError): - return na_value - - return self._str_map(g, na_value=na_value, dtype=dtype) - if not isinstance(result, np.ndarray): - return result - if na_value is not np.nan: - np.putmask(result, mask, na_value) - if convert and result.dtype == object: - result = lib.maybe_convert_objects(result) + def _convert_int_dtype(self, result): + if result.dtype == np.int32: + result = result.astype(np.int64) return result def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) - return pc.count_substring_regex(self._pa_array, pat).to_numpy() + result = pc.count_substring_regex(self._pa_array, pat).to_numpy() + return self._convert_int_dtype(result) + + def _str_len(self): + result = pc.utf8_length(self._pa_array).to_numpy() + return self._convert_int_dtype(result) def _str_find(self, sub: str, start: int = 0, end: int | None = None): if start != 0 and end is not None: @@ -534,26 +539,16 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): result = pc.find_substring(slices, sub) else: return super()._str_find(sub, start, end) - return type(self)(result) + return self._convert_int_dtype(result.to_numpy()) - def _str_split( - self, - pat: str | None = None, - n: int | None = -1, - expand: bool = False, - regex: bool | None = None, - ): - if n in {-1, 0}: - n = None - if regex: - split_func = pc.split_pattern_regex - else: - split_func = pc.split_pattern - return split_func(self._pa_array, pat, 
max_splits=n).to_numpy() - - def _str_rsplit(self, pat: str | None = None, n: int | None = -1): - if n in {-1, 0}: - n = None - return pc.split_pattern( - self._pa_array, pat, max_splits=n, reverse=True - ).to_numpy() + def _cmp_method(self, other, op): + result = super()._cmp_method(other, op) + return result.to_numpy(na_value=False) + + def value_counts(self, dropna: bool = True): + from pandas import Series + + result = super().value_counts(dropna) + return Series( + result._values.to_numpy(), index=result.index, name=result.name, copy=False + ) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 27e9bf8958ab0..745689ab1fcc8 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -500,7 +500,7 @@ def use_inf_as_na_cb(key) -> None: "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), ) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index e59369db776da..6d9470eb730c5 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -145,7 +145,9 @@ def _map_and_wrap(name: str | None, docstring: str | None): @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): result = getattr(self._data.array, f"_str_{name}")() - return self._wrap_result(result) + return self._wrap_result( + result, returns_string=name not in ("isnumeric", "isdecimal") + ) wrapper.__doc__ = docstring return wrapper diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index cfd3314eb5944..b65fc52053414 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -9,7 +9,10 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays.string_arrow import ArrowStringArray +from pandas.core.arrays.string_arrow import ( + ArrowStringArray, + 
ArrowStringArrayNumpySemantics, +) from pandas.util.version import Version @@ -33,7 +36,12 @@ def test_repr(dtype): expected = "0 a\n1 \n2 b\nName: A, dtype: string" assert repr(df.A) == expected - arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray" + if dtype.storage == "pyarrow": + arr_name = "ArrowStringArray" + elif dtype.storage == "pyarrow_numpy": + arr_name = "ArrowStringArrayNumpySemantics" + else: + arr_name = "StringArray" expected = f"<{arr_name}>\n['a', , 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected @@ -116,7 +124,7 @@ def test_add(dtype): def test_add_2d(dtype, request): - if dtype.storage == "pyarrow": + if dtype.storage in ("pyarrow", "pyarrow_numpy"): reason = "Failed: DID NOT RAISE " mark = pytest.mark.xfail(raises=None, reason=reason) request.node.add_marker(mark) @@ -145,7 +153,7 @@ def test_add_sequence(dtype): def test_mul(dtype, request): - if dtype.storage == "pyarrow": + if dtype.storage in ("pyarrow", "pyarrow_numpy"): reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'" mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason) request.node.add_marker(mark) @@ -195,19 +203,30 @@ def test_comparison_methods_scalar(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = "a" result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == "pyarrow_numpy": + expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = pd.array(expected, dtype="boolean").to_numpy(na_value=False) + tm.assert_numpy_array_equal(result, expected) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.array([getattr(item, 
op_name)(other) for item in a], dtype=object) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_pd_na(comparison_op, dtype): op_name = f"__{comparison_op.__name__}__" a = pd.array(["a", None, "c"], dtype=dtype) result = getattr(a, op_name)(pd.NA) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False], dtype=object) + tm.assert_numpy_array_equal(result, expected) + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + tm.assert_extension_array_equal(result, expected) def test_comparison_methods_scalar_not_string(comparison_op, dtype): @@ -223,12 +242,21 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): return result = getattr(a, op_name)(other) - expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ - op_name - ] - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = pd.array(expected_data, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + + if dtype.storage == "pyarrow_numpy": + expected_data = { + "__eq__": [False, False, False], + "__ne__": [True, False, True], + }[op_name] + expected = np.array(expected_data, dtype=object) + tm.assert_numpy_array_equal(result, expected) + else: + expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ + op_name + ] + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = pd.array(expected_data, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def 
test_comparison_methods_array(comparison_op, dtype): @@ -237,15 +265,25 @@ def test_comparison_methods_array(comparison_op, dtype): a = pd.array(["a", None, "c"], dtype=dtype) other = [None, None, "c"] result = getattr(a, op_name)(other) - expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" - expected = np.full(len(a), fill_value=None, dtype="object") - expected[-1] = getattr(other[-1], op_name)(a[-1]) - expected = pd.array(expected, dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + if dtype.storage == "pyarrow_numpy": + expected = np.array([False, False, False], dtype=object) + expected[-1] = getattr(other[-1], op_name)(a[-1]) + tm.assert_numpy_array_equal(result, expected) - result = getattr(a, op_name)(pd.NA) - expected = pd.array([None, None, None], dtype=expected_dtype) - tm.assert_extension_array_equal(result, expected) + result = getattr(a, op_name)(pd.NA) + expected = np.array([False, False, False], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + else: + expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" + expected = np.full(len(a), fill_value=None, dtype="object") + expected[-1] = getattr(other[-1], op_name)(a[-1]) + expected = pd.array(expected, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) + + result = getattr(a, op_name)(pd.NA) + expected = pd.array([None, None, None], dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected) def test_constructor_raises(cls): @@ -297,7 +335,7 @@ def test_from_sequence_no_mutate(copy, cls, request): result = cls._from_sequence(nan_arr, copy=copy) - if cls is ArrowStringArray: + if cls in (ArrowStringArray, ArrowStringArrayNumpySemantics): import pyarrow as pa expected = cls(pa.array(na_arr, type=pa.string(), from_pandas=True)) @@ -370,7 +408,7 @@ def test_min_max(method, skipna, dtype, request): @pytest.mark.parametrize("method", ["min", "max"]) @pytest.mark.parametrize("box", 
[pd.Series, pd.array]) def test_min_max_numpy(method, box, dtype, request): - if dtype.storage == "pyarrow" and box is pd.array: + if dtype.storage in ("pyarrow", "pyarrow_numpy") and box is pd.array: if box is pd.array: reason = "'<=' not supported between instances of 'str' and 'NoneType'" else: @@ -397,7 +435,7 @@ def test_fillna_args(dtype, request): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage == "pyarrow": + if dtype.storage in ("pyarrow", "pyarrow_numpy"): msg = "Invalid value '1' for dtype string" else: msg = "Cannot set non-string value '1' into a StringArray." @@ -455,6 +493,8 @@ def test_arrow_load_from_zero_chunks(dtype, string_storage2): def test_value_counts_na(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "int64[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = "int64" else: exp_dtype = "Int64" arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) @@ -470,6 +510,8 @@ def test_value_counts_na(dtype): def test_value_counts_with_normalize(dtype): if getattr(dtype, "storage", "") == "pyarrow": exp_dtype = "double[pyarrow]" + elif getattr(dtype, "storage", "") == "pyarrow_numpy": + exp_dtype = np.float64 else: exp_dtype = "Float64" ser = pd.Series(["a", "b", "a", pd.NA], dtype=dtype) @@ -506,7 +548,7 @@ def test_use_inf_as_na(values, expected, dtype): def test_memory_usage(dtype): # GH 33963 - if dtype.storage == "pyarrow": + if dtype.storage in ("pyarrow", "pyarrow_numpy"): pytest.skip(f"not applicable for {dtype.storage}") series = pd.Series(["a", "b", "c"], dtype=dtype) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 9eee2e0bea687..20530e37116f2 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -324,7 +324,12 @@ def test_searchsorted_castable_strings(self, arr1d, box, string_storage): ): arr.searchsorted("foo") - arr_type = 
"StringArray" if string_storage == "python" else "ArrowStringArray" + if string_storage == "python": + arr_type = "StringArray" + elif string_storage == "pyarrow": + arr_type = "ArrowStringArray" + else: + arr_type = "ArrowStringArrayNumpySemantics" with pd.option_context("string_storage", string_storage): with pytest.raises( diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 2dd62a4ca7538..16059155a7a8f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -70,6 +70,9 @@ def test_value_counts_with_normalize(self, data): ): # TODO: avoid special-casing expected = expected.astype("double[pyarrow]") + elif getattr(data.dtype, "storage", "") == "pyarrow_numpy": + # TODO: avoid special-casing + expected = expected.astype("float64") elif na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 6597ff84e3ca4..6088cd211d829 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -104,7 +104,7 @@ def test_is_not_string_type(self, dtype): class TestInterface(base.BaseInterfaceTests): def test_view(self, data, request): - if data.dtype.storage == "pyarrow": + if data.dtype.storage in ("pyarrow", "pyarrow_numpy"): pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_view(data) @@ -117,7 +117,7 @@ def test_from_dtype(self, data): class TestReshaping(base.BaseReshapingTests): def test_transpose(self, data, request): - if data.dtype.storage == "pyarrow": + if data.dtype.storage in ("pyarrow", "pyarrow_numpy"): pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_transpose(data) @@ -128,7 +128,7 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): def test_setitem_preserves_views(self, data, 
request): - if data.dtype.storage == "pyarrow": + if data.dtype.storage in ("pyarrow", "pyarrow_numpy"): pytest.skip(reason="2D support not implemented for ArrowStringArray") super().test_setitem_preserves_views(data) @@ -184,6 +184,8 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # attribute "storage" if dtype.storage == "pyarrow": # type: ignore[union-attr] cast_to = "boolean[pyarrow]" + elif dtype.storage == "pyarrow_numpy": + cast_to = np.bool_ else: cast_to = "boolean" return pointwise_result.astype(cast_to) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 170e2f61e7d4a..701bfe3767db4 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -234,3 +234,19 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param[0] + + +@pytest.fixture( + params=[ + "python", + pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), + ] +) +def string_storage(request): + """ + Parametrized fixture for pd.options.mode.string_storage. 
+ + * 'python' + * 'pyarrow' + """ + return request.param diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index e69de29bb2d1d..326ae24410502 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -0,0 +1 @@ +object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index ced941187f548..3de97cccd1d72 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -278,6 +278,8 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): + if any_string_dtype == "string[pyarrow_numpy]": + pytest.skip("Arrow logic is different") s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index c3cc8b3643ed2..f62299f53aebe 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -11,6 +11,7 @@ Series, _testing as tm, ) +from pandas.tests.strings import object_pyarrow_numpy # -------------------------------------------------------------------------------------- # str.contains @@ -25,7 +26,7 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( np.array([False, np.nan, True, True, False], dtype=np.object_), dtype=expected_dtype, @@ -44,7 +45,7 @@ def test_contains(any_string_dtype): dtype=any_string_dtype, ) result = values.str.contains(pat) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = 
Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -71,14 +72,14 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(np.array([False, False, True, True]), dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -163,7 +164,7 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series( [False, False, False, True, True, False, np.nan, False, False, True], dtype=expected_dtype, @@ -204,7 +205,7 @@ def test_contains_nan(any_string_dtype): s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype) result = s.str.contains("foo", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -215,12 +216,14 @@ def test_contains_nan(any_string_dtype): result = s.str.contains("foo", na="foo") if any_string_dtype == "object": expected = Series(["foo", "foo", "foo"], dtype=np.object_) + elif any_string_dtype == "string[pyarrow_numpy]": + expected = Series([True, True, True], dtype=np.bool_) else: expected = Series([True, True, True], 
dtype="boolean") tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -379,7 +382,7 @@ def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -402,7 +405,7 @@ def test_replace_callable(any_string_dtype): # test with callable repl = lambda m: m.group(0).swapcase() with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) @@ -423,7 +426,7 @@ def test_replace_callable_raises(any_string_dtype, repl): ) with pytest.raises(TypeError, match=msg): with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): values.str.replace("a", repl, regex=True) @@ -434,7 +437,7 @@ def test_replace_callable_named_groups(any_string_dtype): pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) 
@@ -448,14 +451,14 @@ def test_replace_compiled_regex(any_string_dtype): # test with compiled regex pat = re.compile(r"BAD_*") with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) @@ -477,7 +480,7 @@ def test_replace_compiled_regex_unicode(any_string_dtype): expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -507,7 +510,7 @@ def test_replace_compiled_regex_callable(any_string_dtype): repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) @@ -558,7 +561,7 @@ def test_replace_moar(any_string_dtype): tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace("A", "YYY", case=False) expected = Series( @@ -579,7 +582,7 @@ def test_replace_moar(any_string_dtype): 
tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( @@ -605,14 +608,14 @@ def test_replace_not_case_sensitive_not_regex(any_string_dtype): ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) @@ -648,7 +651,7 @@ def test_replace_regex_single_character(regex, any_string_dtype): def test_match(any_string_dtype): # New match behavior introduced in 0.13 - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") @@ -703,12 +706,12 @@ def test_match_na_kwarg(any_string_dtype): s = Series(["a", "b", np.nan], dtype=any_string_dtype) result = s.str.match("a", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = "object" if 
any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -716,7 +719,7 @@ def test_match_na_kwarg(any_string_dtype): def test_match_case_kwarg(any_string_dtype): values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match("ab", case=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -732,7 +735,7 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = "object" if any_string_dtype == "object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -742,14 +745,14 @@ def test_fullmatch_na_kwarg(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - expected_dtype = np.bool_ if any_string_dtype == "object" else "boolean" + expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series([True, False, False, False], 
dtype=expected_dtype) @@ -762,7 +765,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) with tm.maybe_produces_warning( - PerformanceWarning, any_string_dtype == "string[pyarrow]" + PerformanceWarning, "string[pyarrow" in any_string_dtype ): result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -825,7 +828,7 @@ def test_find(any_string_dtype): ser = Series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype) @@ -877,7 +880,7 @@ def test_find_nan(any_string_dtype): ser = Series( ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype ) - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = ser.str.find("EF") expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype) @@ -944,7 +947,7 @@ def test_flags_kwarg(any_string_dtype): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - using_pyarrow = any_string_dtype == "string[pyarrow]" + using_pyarrow = "string[pyarrow" in any_string_dtype result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1e573bdfe8fb5..bdae496cdc121 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -14,6 +14,7 @@ ) import pandas._testing as tm from pandas.core.strings.accessor import StringMethods +from pandas.tests.strings import object_pyarrow_numpy @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])]) @@ -40,7 +41,7 @@ def 
test_iter_raises(): def test_count(any_string_dtype): ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype) result = ser.str.count("f[o]+") - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" expected = Series([1, 2, np.nan, 4], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -91,7 +92,7 @@ def test_repeat_with_null(any_string_dtype, arg, repeat): def test_empty_str_methods(any_string_dtype): empty_str = empty = Series(dtype=any_string_dtype) - if any_string_dtype == "object": + if any_string_dtype in object_pyarrow_numpy: empty_int = Series(dtype="int64") empty_bool = Series(dtype=bool) else: @@ -205,7 +206,7 @@ def test_ismethods(method, expected, any_string_dtype): ser = Series( ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype ) - expected_dtype = "bool" if any_string_dtype == "object" else "boolean" + expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -230,7 +231,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): ser = Series( ["A", "3", "¼", "★", "፸", "3", "four"], dtype=any_string_dtype # noqa: RUF001 ) - expected_dtype = "bool" if any_string_dtype == "object" else "boolean" + expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -250,7 +251,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = "object" if any_string_dtype == 
"object" else "boolean" + expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean" expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) @@ -280,7 +281,7 @@ def test_len(any_string_dtype): dtype=any_string_dtype, ) result = ser.str.len() - expected_dtype = "float64" if any_string_dtype == "object" else "Int64" + expected_dtype = "float64" if any_string_dtype in object_pyarrow_numpy else "Int64" expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -309,7 +310,8 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype == "object" else "Int64" + + expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" expected = index_or_series(expected, dtype=expected_dtype) result = getattr(obj.str, method)(sub, start, end) @@ -350,7 +352,7 @@ def test_index_wrong_type_raises(index_or_series, any_string_dtype, method): ) def test_index_missing(any_string_dtype, method, exp): ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype) - expected_dtype = np.float64 if any_string_dtype == "object" else "Int64" + expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64" result = getattr(ser.str, method)("b") expected = Series(exp + [np.nan], dtype=expected_dtype) From d9e61e542d9252928d8119f65894381f749de0c6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 2023 22:00:05 +0200 Subject: [PATCH 04/23] Move methods --- pandas/core/arrays/arrow/array.py | 42 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d4c68c6870681..b2b1b92e569e3 100644 --- 
a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2000,27 +2000,6 @@ def _str_count(self, pat: str, flags: int = 0): raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ): - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - - def _str_partition(self, sep: str, expand: bool): - predicate = lambda val: val.partition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - - def _str_rpartition(self, sep: str, expand: bool): - predicate = lambda val: val.rpartition(sep) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - def _str_contains( self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True ): @@ -2113,6 +2092,27 @@ def _str_join(self, sep: str): result = self._pa_array return type(self)(pc.binary_join(result, sep)) + def _str_partition(self, sep: str, expand: bool): + predicate = lambda val: val.partition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_rpartition(self, sep: str, expand: bool): + predicate = lambda val: val.rpartition(sep) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ): + if start is None: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_isalnum(self): return type(self)(pc.utf8_is_alnum(self._pa_array)) From 3188c25cc74859694e7991e54f9a4f1333e2d71d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 13 Aug 
2023 22:58:17 +0200 Subject: [PATCH 05/23] Refactor --- pandas/core/arrays/string_arrow.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 099e49980d27e..aa9deae0b1964 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -115,9 +115,6 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr # error: Incompatible types in assignment (expression has type "StringDtype", # base class "ArrowExtensionArray" defined the type as "ArrowDtype") _dtype: StringDtype # type: ignore[assignment] - _result_converter = lambda _, result, **kwargs: BooleanDtype().__from_arrow__( - result - ) _storage = "pyarrow" def __init__(self, values) -> None: @@ -187,6 +184,10 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) + @staticmethod + def _result_converter(values, **kwargs): + return BooleanDtype().__from_arrow__(values) + def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" if is_scalar(value): From cd19bfb43d077e379b12b80d0c30e51724b6f5ec Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 14 Aug 2023 12:16:31 +0200 Subject: [PATCH 06/23] Refactor --- pandas/tests/strings/test_strings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index bdae496cdc121..4315835b70a40 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -310,7 +310,6 @@ def test_index(method, sub, start, end, index_or_series, any_string_dtype, expec obj = index_or_series( ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype ) - expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64" expected = index_or_series(expected, dtype=expected_dtype) From 
c73c6b0e964cf1694feefe2a2c390f7e3dd5f0b8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 14 Aug 2023 12:19:19 +0200 Subject: [PATCH 07/23] Remove --- pandas/core/arrays/arrow/array.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1ff3b31389bb2..dea502c6dc090 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -250,12 +250,6 @@ def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: ) self._dtype = ArrowDtype(self._pa_array.type) - def __dir__(self): - o = set(dir(type(self))) - o.update(self.__dict__) - o.update(set(dir(ArrowStringArrayMixin))) - return list(o) - @classmethod def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): """ From 6b2630902c93a0ea49a883fc3bbd76d5360db364 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 14 Aug 2023 13:46:47 +0200 Subject: [PATCH 08/23] Fix --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index aa9deae0b1964..bc9add038301d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -456,7 +456,7 @@ def _result_converter(values, na=None): return ArrowExtensionArray(values).to_numpy(na_value=np.nan) def __getattribute__(self, item): - if item in ArrowStringArrayMixin.__dict__: + if item in ArrowStringArrayMixin.__dict__ and item != "_pa_array": return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) From da6d67c7447a6c51262c6c4a556684ca41fc08c8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 14 Aug 2023 17:21:06 +0200 Subject: [PATCH 09/23] Update --- pandas/tests/arrays/string_/test_string.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b65fc52053414..3d66d45151e05 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under12p0 + from pandas.core.dtypes.common import is_dtype_equal import pandas as pd @@ -13,7 +15,6 @@ ArrowStringArray, ArrowStringArrayNumpySemantics, ) -from pandas.util.version import Version @pytest.fixture @@ -450,7 +451,7 @@ def test_arrow_array(dtype): data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype.storage == "pyarrow" and Version(pa.__version__) <= Version("11.0.0"): + if dtype.storage in ("pyarrow", "pyarrow_numpy") and pa_version_under12p0: expected = pa.chunked_array(expected) assert arr.equals(expected) From d862ecaf3aa05082d36512f994d99106cc7c10b1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 16 Aug 2023 18:53:57 +0200 Subject: [PATCH 10/23] Na return value --- pandas/core/arrays/string_.py | 5 ++- pandas/tests/strings/__init__.py | 14 +++++++ pandas/tests/strings/test_find_replace.py | 9 ++-- pandas/tests/strings/test_split_partition.py | 44 +++++++------------- 4 files changed, 38 insertions(+), 34 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 1e285f90e9fea..4c28360c732a3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -101,7 +101,10 @@ class StringDtype(StorageExtensionDtype): #: StringDtype().na_value uses pandas.NA @property def na_value(self) -> libmissing.NAType: - return libmissing.NA + if self.storage == "pyarrow_numpy": + return np.nan + else: + return libmissing.NA _metadata = ("storage",) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index 326ae24410502..01b49b5e5b633 100644 --- 
a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -1 +1,15 @@ +import numpy as np + +import pandas as pd + object_pyarrow_numpy = ("object", "string[pyarrow_numpy]") + + +def _convert_na_value(ser, expected): + if ser.dtype != object: + if ser.dtype.storage == "pyarrow_numpy": + expected = expected.fillna(np.nan) + else: + # GH#18463 + expected = expected.fillna(pd.NA) + return expected diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index f62299f53aebe..2320ab4ed8b02 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -11,7 +11,10 @@ Series, _testing as tm, ) -from pandas.tests.strings import object_pyarrow_numpy +from pandas.tests.strings import ( + _convert_na_value, + object_pyarrow_numpy, +) # -------------------------------------------------------------------------------------- # str.contains @@ -780,9 +783,7 @@ def test_findall(any_string_dtype): ser = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"], dtype=any_string_dtype) result = ser.str.findall("BAD[_]*") expected = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]]) - if ser.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(ser, expected) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 0298694ccaf71..0a7d409773dd6 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -12,6 +12,10 @@ Series, _testing as tm, ) +from pandas.tests.strings import ( + _convert_na_value, + object_pyarrow_numpy, +) @pytest.mark.parametrize("method", ["split", "rsplit"]) @@ -20,9 +24,7 @@ def test_split(any_string_dtype, method): result = getattr(values.str, method)("_") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = 
exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -32,9 +34,7 @@ def test_split_more_than_one_char(any_string_dtype, method): values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype) result = getattr(values.str, method)("__") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) result = getattr(values.str, method)("__", expand=False) @@ -46,9 +46,7 @@ def test_split_more_regex_split(any_string_dtype): values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.split("[,_]") exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -118,8 +116,8 @@ def test_split_object_mixed(expand, method): def test_split_n(any_string_dtype, method, n): s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype) expected = Series([["a", "b"], pd.NA, ["b", "c"]]) - result = getattr(s.str, method)(" ", n=n) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -128,9 +126,7 @@ def test_rsplit(any_string_dtype): values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype) result = values.str.rsplit("[,_]") exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -139,9 +135,7 @@ def test_rsplit_max_number(any_string_dtype): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype) result = values.str.rsplit("_", n=1) exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]]) - if values.dtype != object: - # GH#18463 - exp = exp.fillna(pd.NA) + 
exp = _convert_na_value(values, exp) tm.assert_series_equal(result, exp) @@ -390,7 +384,7 @@ def test_split_nan_expand(any_string_dtype): # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - if any_string_dtype == "object": + if any_string_dtype in object_pyarrow_numpy: assert all(np.isnan(x) for x in result.iloc[1]) else: assert all(x is pd.NA for x in result.iloc[1]) @@ -455,9 +449,7 @@ def test_partition_series_more_than_one_char(method, exp, any_string_dtype): s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype) result = getattr(s.str, method)("__", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -480,9 +472,7 @@ def test_partition_series_none(any_string_dtype, method, exp): s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype) result = getattr(s.str, method)(expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -505,9 +495,7 @@ def test_partition_series_not_split(any_string_dtype, method, exp): s = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype) result = getattr(s.str, method)("_", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -531,9 +519,7 @@ def test_partition_series_unicode(any_string_dtype, method, exp): result = getattr(s.str, method)("_", expand=False) expected = Series(exp) - if s.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) From 
6cf263912ff682ac688a945fc27db454a6851f5a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 16 Aug 2023 22:40:39 +0200 Subject: [PATCH 11/23] Fix --- pandas/tests/strings/test_find_replace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index e2d4619290a93..4cbfe97ad2ab4 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -19,7 +19,7 @@ def using_pyarrow(dtype): - return dtype in ("string[pyarrow]",) + return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") def test_contains(any_string_dtype): From 48bd626c8bcc9df0c13dca08861c0b21be0bfcca Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 16 Aug 2023 23:10:27 +0200 Subject: [PATCH 12/23] Update --- pandas/tests/arrays/string_/test_string.py | 33 ++++++++++++++++------ 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3d66d45151e05..d92f1c048a369 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -17,6 +17,13 @@ ) +def na_val(dtype): + if dtype.storage == "pyarrow_numpy": + return np.nan + else: + return pd.NA + + @pytest.fixture def dtype(string_storage): """Fixture giving StringDtype from parametrized 'string_storage'""" @@ -31,26 +38,34 @@ def cls(dtype): def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - expected = " A\n0 a\n1 <NA>\n2 b" + if dtype.storage == "pyarrow_numpy": + expected = " A\n0 a\n1 NaN\n2 b" + else: + expected = " A\n0 a\n1 <NA>\n2 b" assert repr(df) == expected - expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" + if dtype.storage == "pyarrow_numpy": + expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + else: + expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" assert repr(df.A) == expected if dtype.storage == "pyarrow": 
arr_name = "ArrowStringArray" + expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" elif dtype.storage == "pyarrow_numpy": arr_name = "ArrowStringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: arr_name = "StringArray" - expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected def test_none_to_nan(cls): a = cls._from_sequence(["a", None, "b"]) assert a[1] is not None - assert a[1] is pd.NA + assert a[1] is na_val(a.dtype) def test_setitem_validates(cls): @@ -206,7 +221,7 @@ def test_comparison_methods_scalar(comparison_op, dtype): result = getattr(a, op_name)(other) if dtype.storage == "pyarrow_numpy": expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) - expected = pd.array(expected, dtype="boolean").to_numpy(na_value=False) + expected[1] = False tm.assert_numpy_array_equal(result, expected) else: expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" @@ -403,7 +418,7 @@ def test_min_max(method, skipna, dtype, request): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is pd.NA + assert result is na_val(arr.dtype) @pytest.mark.parametrize("method", ["min", "max"]) @@ -471,7 +486,7 @@ def test_arrow_roundtrip(dtype, string_storage2): expected = df.astype(f"string[{string_storage2}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is pd.NA + assert result.loc[2, "a"] is na_val(result["a"].dtype) def test_arrow_load_from_zero_chunks(dtype, string_storage2): @@ -569,7 +584,7 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", pd.NA, "b"], dtype=object) + 
expected = np.array(["a", na_val(dtype), "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -609,7 +624,7 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is pd.NA + assert ser.array[1] is na_val(ser.dtype) # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) From 4f9387a369fc3e894b96c11fdb2d3d487ec0d6ae Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 17 Aug 2023 00:32:43 +0200 Subject: [PATCH 13/23] Implement any and all for pyarrow numpy strings --- pandas/core/arrays/string_arrow.py | 13 +++++++++++++ pandas/tests/series/test_logical_ops.py | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 2f9b2651103d9..b75b94008fe4f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -553,3 +553,16 @@ def value_counts(self, dropna: bool = True): return Series( result._values.to_numpy(), index=result.index, name=result.name, copy=False ) + + def _reduce( + self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs + ): + if name in ["any", "all"]: + arr = pc.and_kleene( + pc.invert(pc.is_null(self._pa_array)), pc.not_equal(self._pa_array, "") + ) + return ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + else: + return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 4dab3e8f62598..fa2e37ccc0cac 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -513,3 +513,21 @@ def test_int_dtype_different_index_not_bool(self): result = ser1 ^ ser2 tm.assert_series_equal(result, expected) + + def test_any_all(self): + # GH#54591 + ser = Series(["", "a"], 
dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, ""], dtype="string[pyarrow_numpy]") + assert not ser.any() + assert not ser.all() + + ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert ser.all() From 606cd71c4adbef4ca03dbb86ca884775d595f0c5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 11:15:25 +0200 Subject: [PATCH 14/23] Fix typing --- pandas/core/arrays/string_.py | 2 +- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/extension/test_string.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 16cc18320fb65..79e5beb3175dc 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -100,7 +100,7 @@ class StringDtype(StorageExtensionDtype): #: StringDtype().na_value uses pandas.NA @property - def na_value(self) -> libmissing.NAType: + def na_value(self) -> libmissing.NAType | float: if self.storage == "pyarrow_numpy": return np.nan else: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 2f9b2651103d9..30d8d22ef7f5a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -483,7 +483,7 @@ def _str_map( mask.view("uint8"), convert=False, na_value=na_value, - dtype=np.dtype(dtype), + dtype=np.dtype(dtype), # type: ignore[arg-type] ) return result diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 6088cd211d829..35071412a6e40 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -184,8 +184,8 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): # attribute "storage" if dtype.storage == "pyarrow": # type: 
ignore[union-attr] cast_to = "boolean[pyarrow]" - elif dtype.storage == "pyarrow_numpy": - cast_to = np.bool_ + elif dtype.storage == "pyarrow_numpy": # type: ignore[union-attr] + cast_to = np.bool_ # type: ignore[assignment] else: cast_to = "boolean" return pointwise_result.astype(cast_to) From 64145012462c59a805602601f082445a1734088a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 11:25:05 +0200 Subject: [PATCH 15/23] Update --- pandas/tests/extension/test_string.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 6088cd211d829..cde08c5372b91 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -168,6 +168,17 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): with pytest.raises(TypeError): getattr(ser, op_name)(skipna=skipna) + def check_reduce(self, s, op_name, skipna): + res_op = getattr(s, op_name) + alt = s.astype("object") + exp_op = getattr(alt, op_name) + result = res_op(skipna=skipna) + expected = exp_op(skipna=skipna) + tm.assert_almost_equal(result, expected) + + def _supports_reduction(self, obj, op_name: str) -> bool: + return obj.dtype.storage == "pyarrow_numpy" + class TestMethods(base.BaseMethodsTests): pass From 68acc329a9dd8662655ca86cca66e83adcb7006b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 12:08:46 +0200 Subject: [PATCH 16/23] Fix --- pandas/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index d0e0fca40de57..767855f13e16d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -2002,4 +2002,4 @@ def warsaw(request) -> str: @pytest.fixture() def arrow_string_storage(): - return ("pyarrow",) + return ("pyarrow", "pyarrow_numpy") From 
fbab6fbfa866046d4a4d98b4ff93bdd7aaa7fab5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 12:08:59 +0200 Subject: [PATCH 17/23] Fix --- pandas/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index d0e0fca40de57..767855f13e16d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -2002,4 +2002,4 @@ def warsaw(request) -> str: @pytest.fixture() def arrow_string_storage(): - return ("pyarrow",) + return ("pyarrow", "pyarrow_numpy") From 68e5f8f2fb310c549f2ab62a5342b25669d369bd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 13:24:15 +0200 Subject: [PATCH 18/23] Fix --- pandas/tests/extension/test_string.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index fd8e06ce148ed..68433cf610f92 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -158,7 +158,11 @@ def test_fillna_no_op_returns_copy(self, data): class TestReduce(base.BaseReduceTests): def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: - return op_name in ["min", "max"] + return ( + op_name in ["min", "max"] + or ser.dtype.storage == "pyarrow_numpy" + and op_name in ("any", "all") + ) class TestMethods(base.BaseMethodsTests): From 032200612f139eadc34cd0c3068c5fd25c8912f0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 21 Aug 2023 13:25:27 +0200 Subject: [PATCH 19/23] Move test --- pandas/tests/reductions/test_reductions.py | 18 ++++++++++++++++++ pandas/tests/series/test_logical_ops.py | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 87892a81cef3d..8fb3338229f27 100644 
--- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1639,3 +1639,21 @@ def test_multimode_complex(self, array, expected, dtype): # Complex numbers are sorted by their magnitude result = Series(array, dtype=dtype).mode() tm.assert_series_equal(result, expected) + + def test_any_all(self): + # GH#54591 + ser = Series(["", "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, ""], dtype="string[pyarrow_numpy]") + assert not ser.any() + assert not ser.all() + + ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert ser.all() diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index dd2b90db69039..26046ef9ba295 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -513,21 +513,3 @@ def test_int_dtype_different_index_not_bool(self): result = ser1 ^ ser2 tm.assert_series_equal(result, expected) - - def test_any_all(self): - # GH#54591 - ser = Series(["", "a"], dtype="string[pyarrow_numpy]") - assert ser.any() - assert not ser.all() - - ser = Series([None, "a"], dtype="string[pyarrow_numpy]") - assert ser.any() - assert not ser.all() - - ser = Series([None, ""], dtype="string[pyarrow_numpy]") - assert not ser.any() - assert not ser.all() - - ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") - assert ser.any() - assert ser.all() From 8bb52f4ef37a36a87f136aa78e706b90b23a098f Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 21 Aug 2023 22:57:43 +0200 Subject: [PATCH 20/23] Skip test when no pa --- pandas/tests/reductions/test_reductions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 8fb3338229f27..439f6681d7958 100644 --- 
a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1642,6 +1642,7 @@ def test_multimode_complex(self, array, expected, dtype): def test_any_all(self): # GH#54591 + pytest.importorskip("pyarrow") ser = Series(["", "a"], dtype="string[pyarrow_numpy]") assert ser.any() assert not ser.all() From cc8e6f7929da99e20fe424aec7dd2bb2efb6c20b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 22 Aug 2023 17:23:57 +0200 Subject: [PATCH 21/23] Fix typing --- pandas/core/arrays/string_.py | 2 +- pandas/tests/extension/test_string.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 79e5beb3175dc..7c3930c694e2f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -100,7 +100,7 @@ class StringDtype(StorageExtensionDtype): #: StringDtype().na_value uses pandas.NA @property - def na_value(self) -> libmissing.NAType | float: + def na_value(self) -> libmissing.NAType | float: # type: ignore[override] if self.storage == "pyarrow_numpy": return np.nan else: diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 68433cf610f92..775be6be78636 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -160,7 +160,7 @@ class TestReduce(base.BaseReduceTests): def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return ( op_name in ["min", "max"] - or ser.dtype.storage == "pyarrow_numpy" + or ser.dtype.storage == "pyarrow_numpy" # type: ignore[union-attr] and op_name in ("any", "all") ) From 33355a719eb51f68c99cb26d072ddd96a74065bc Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 23 Aug 2023 23:12:11 +0200 Subject: [PATCH 22/23] Fix tests --- pandas/tests/arrays/string_/test_string.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 17730b5ff6647..24d8e43708b91 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -228,9 +228,9 @@ def test_comparison_methods_scalar(comparison_op, dtype): other = "a" result = getattr(a, op_name)(other) if dtype.storage == "pyarrow_numpy": - expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) + expected = np.array([getattr(item, op_name)(other) for item in a]) expected[1] = False - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected.astype(np.bool_)) else: expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) @@ -244,7 +244,7 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): result = getattr(a, op_name)(pd.NA) if dtype.storage == "pyarrow_numpy": - expected = np.array([False, False, False], dtype=object) + expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) else: expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" @@ -272,7 +272,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): "__eq__": [False, False, False], "__ne__": [True, False, True], }[op_name] - expected = np.array(expected_data, dtype=object) + expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) else: expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[ @@ -290,12 +290,12 @@ def test_comparison_methods_array(comparison_op, dtype): other = [None, None, "c"] result = getattr(a, op_name)(other) if dtype.storage == "pyarrow_numpy": - expected = np.array([False, False, False], dtype=object) + expected = np.array([False, False, False]) expected[-1] = getattr(other[-1], op_name)(a[-1]) tm.assert_numpy_array_equal(result, 
expected) result = getattr(a, op_name)(pd.NA) - expected = np.array([False, False, False], dtype=object) + expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) else: From 1facb7921332640c1a421c691610bc2744410ff0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 28 Aug 2023 11:13:32 +0200 Subject: [PATCH 23/23] move + rename test --- pandas/tests/reductions/test_reductions.py | 38 +++++++++++----------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 439f6681d7958..021252500e814 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1078,6 +1078,25 @@ def test_any_all_datetimelike(self): assert df.any().all() assert not df.all().any() + def test_any_all_pyarrow_string(self): + # GH#54591 + pytest.importorskip("pyarrow") + ser = Series(["", "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, "a"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert not ser.all() + + ser = Series([None, ""], dtype="string[pyarrow_numpy]") + assert not ser.any() + assert not ser.all() + + ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") + assert ser.any() + assert ser.all() + def test_timedelta64_analytics(self): # index min/max dti = date_range("2012-1-1", periods=3, freq="D") @@ -1639,22 +1658,3 @@ def test_multimode_complex(self, array, expected, dtype): # Complex numbers are sorted by their magnitude result = Series(array, dtype=dtype).mode() tm.assert_series_equal(result, expected) - - def test_any_all(self): - # GH#54591 - pytest.importorskip("pyarrow") - ser = Series(["", "a"], dtype="string[pyarrow_numpy]") - assert ser.any() - assert not ser.all() - - ser = Series([None, "a"], dtype="string[pyarrow_numpy]") - assert ser.any() - assert not ser.all() - - ser = Series([None, ""], 
dtype="string[pyarrow_numpy]") - assert not ser.any() - assert not ser.all() - - ser = Series(["a", "b"], dtype="string[pyarrow_numpy]") - assert ser.any() - assert ser.all()