diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 2ab710a5762d3..5db859897b663 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -27,6 +27,7 @@
     pa_version_under5p0,
     pa_version_under6p0,
     pa_version_under7p0,
+    pa_version_under8p0,
 )
 
 if TYPE_CHECKING:
@@ -158,4 +159,5 @@ def get_lzma_file() -> type[lzma.LZMAFile]:
     "pa_version_under5p0",
     "pa_version_under6p0",
     "pa_version_under7p0",
+    "pa_version_under8p0",
 ]
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 92aedbb836b38..07b09d78016fd 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -57,6 +57,99 @@
         "ge": pc.greater_equal,
     }
 
+    ARROW_LOGICAL_FUNCS = {
+        "and": NotImplemented if pa_version_under2p0 else pc.and_kleene,
+        "rand": NotImplemented
+        if pa_version_under2p0
+        else lambda x, y: pc.and_kleene(y, x),
+        "or": NotImplemented if pa_version_under2p0 else pc.or_kleene,
+        "ror": NotImplemented
+        if pa_version_under2p0
+        else lambda x, y: pc.or_kleene(y, x),
+        "xor": NotImplemented if pa_version_under2p0 else pc.xor,
+        "rxor": NotImplemented if pa_version_under2p0 else lambda x, y: pc.xor(y, x),
+    }
+
+    def cast_for_truediv(
+        arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
+    ) -> pa.ChunkedArray:
+        """
+        Cast ``arrow_array`` to float64 when both operands are integers.
+
+        pc.divide_checked(int, int) -> int, so the array operand is upcast
+        to ensure int / int -> float mirroring Python/Numpy behavior.
+        Any other type combination is returned unchanged.
+        """
+        if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
+            pa_object.type
+        ):
+            return arrow_array.cast(pa.float64())
+        return arrow_array
+
+    def floordiv_compat(
+        left: pa.ChunkedArray | pa.Array | pa.Scalar,
+        right: pa.ChunkedArray | pa.Array | pa.Scalar,
+    ) -> pa.ChunkedArray:
+        """
+        Floor-divide ``left`` by ``right``, mirroring Python/Numpy ``//``.
+
+        int // int keeps the integer type of ``left``; any other type
+        combination returns pc.floor of the checked quotient.
+        """
+        quotient = pc.divide_checked(left, right)
+        if not (pa.types.is_integer(left.type) and pa.types.is_integer(right.type)):
+            return pc.floor(quotient)
+        # pc.divide_checked(int, int) -> int truncated toward zero, so flooring
+        # it afterwards is a no-op and e.g. -7 // 2 would give -3 instead of -4.
+        # Subtract 1 wherever a non-zero remainder has the opposite sign of the
+        # divisor, using integer-only kernels so large values stay exact.
+        remainder = pc.subtract(left, pc.multiply(quotient, right))
+        zero = pa.scalar(0, type=remainder.type)
+        needs_adjustment = pc.and_(
+            pc.not_equal(remainder, zero),
+            pc.xor(
+                pc.less(remainder, zero),
+                pc.less(right, pa.scalar(0, type=right.type)),
+            ),
+        )
+        result = pc.subtract(quotient, needs_adjustment.cast(quotient.type))
+        return result.cast(left.type)
+
+    ARROW_ARITHMETIC_FUNCS = {
+        "add": NotImplemented if pa_version_under2p0 else pc.add_checked,
+        "radd": NotImplemented
+        if pa_version_under2p0
+        else lambda x, y: pc.add_checked(y, x),
+        "sub": NotImplemented if pa_version_under2p0 else pc.subtract_checked,
+        "rsub": NotImplemented
+        if pa_version_under2p0
+        else lambda x, y: pc.subtract_checked(y, x),
+        "mul": NotImplemented if pa_version_under2p0 else pc.multiply_checked,
+        "rmul": NotImplemented
+        if pa_version_under2p0
+        else lambda x, y: pc.multiply_checked(y, x),
+        "truediv": NotImplemented
+        if pa_version_under2p0
+        else lambda x, y: pc.divide_checked(cast_for_truediv(x, y), y),
+        "rtruediv": NotImplemented
+        if pa_version_under2p0
+        else lambda x, y: pc.divide_checked(y, cast_for_truediv(x, y)),
+        "floordiv": NotImplemented
+        if pa_version_under2p0
+        else lambda x, y: floordiv_compat(x, y),
+        "rfloordiv": NotImplemented
+        if pa_version_under2p0
+        else lambda x, y: floordiv_compat(y, x),
+        "mod": NotImplemented,
+        "rmod": NotImplemented,
+        "divmod": NotImplemented,
+        "rdivmod": NotImplemented,
+        "pow": NotImplemented if pa_version_under2p0 else pc.power_checked,
+        "rpow": NotImplemented
+        if pa_version_under2p0
+        else lambda x, y: pc.power_checked(y, x),
+    }
+
 if TYPE_CHECKING:
     from pandas import Series
 
@@ -74,6 +167,7 @@ def to_pyarrow_type(
     elif isinstance(dtype, pa.DataType):
         pa_dtype = dtype
     elif dtype:
+        # Accepts python types too
         pa_dtype = pa.from_numpy_dtype(dtype)
     else:
         pa_dtype = None
@@ -263,6 +357,40 @@ def _cmp_method(self, other, op):
             result = result.to_numpy()
         return BooleanArray._from_sequence(result)
 
+    def _evaluate_op_method(self, other, op, arrow_funcs):
+        """
+        Apply the pyarrow compute function registered for ``op`` to ``other``.
+
+        ``arrow_funcs`` maps ``op.__name__`` to a callable, or to NotImplemented
+        when the operation is unavailable. ``other`` may be another
+        ArrowExtensionArray, an ndarray/list, or a scalar.
+
+        Raises NotImplementedError for an unavailable operation or an
+        unsupported type of ``other``.
+        """
+        pc_func = arrow_funcs[op.__name__]
+        if pc_func is NotImplemented:
+            raise NotImplementedError(f"{op.__name__} not implemented.")
+        if isinstance(other, ArrowExtensionArray):
+            result = pc_func(self._data, other._data)
+        elif isinstance(other, (np.ndarray, list)):
+            result = pc_func(self._data, pa.array(other, from_pandas=True))
+        elif is_scalar(other):
+            result = pc_func(self._data, pa.scalar(other))
+        else:
+            raise NotImplementedError(
+                f"{op.__name__} not implemented for {type(other)}"
+            )
+        return type(self)(result)
+
+    def _logical_method(self, other, op):
+        """Evaluate a logical operator using ARROW_LOGICAL_FUNCS."""
+        return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)
+
+    def _arith_method(self, other, op):
+        """Evaluate an arithmetic operator using ARROW_ARITHMETIC_FUNCS."""
+        return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)
+
     def equals(self, other) -> bool:
         if not isinstance(other, ArrowExtensionArray):
             return False
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 7421645baa463..f884264e9ab75 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -360,7 +360,7 @@ def _str_get_dummies(self, sep="|"):
         arr = Series(self).fillna("")
         try:
             arr = sep + arr + sep
-        except TypeError:
+        except (TypeError, NotImplementedError):
             arr = sep + arr.astype(str) + sep
 
         tags: set[str] = set()
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index b563f84207b22..a5eb6189db6f1 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -101,7 +101,7 @@ def test_add(dtype, request):
             "unsupported operand type(s) for +: 'ArrowStringArray' and "
             "'ArrowStringArray'"
         )
-        mark = pytest.mark.xfail(raises=TypeError, reason=reason)
+        mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason)
         request.node.add_marker(mark)
 
     a = pd.Series(["a", "b", "c", None, None], dtype=dtype)
@@ -142,7 +142,7 @@ def test_add_2d(dtype, request):
 def test_add_sequence(dtype, request):
     if dtype.storage == "pyarrow":
         reason = "unsupported operand type(s) for +: 'ArrowStringArray' and 'list'"
-        mark = pytest.mark.xfail(raises=TypeError, reason=reason)
+        mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason)
         request.node.add_marker(mark)
 
     a = pd.array(["a", "b", None, None], dtype=dtype)
@@ -160,7 +160,7 @@ def test_add_sequence(dtype, 
request):
 def test_mul(dtype, request):
     if dtype.storage == "pyarrow":
         reason = "unsupported operand type(s) for *: 'ArrowStringArray' and 'int'"
-        mark = pytest.mark.xfail(raises=TypeError, reason=reason)
+        mark = pytest.mark.xfail(raises=NotImplementedError, reason=reason)
         request.node.add_marker(mark)
 
     a = pd.array(["a", "b", None], dtype=dtype)
diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py
index a1d232b737da7..569782e55fd72 100644
--- a/pandas/tests/extension/base/ops.py
+++ b/pandas/tests/extension/base/ops.py
@@ -67,10 +67,10 @@ class BaseArithmeticOpsTests(BaseOpsUtil):
     * divmod_exc = TypeError
     """
 
-    series_scalar_exc: type[TypeError] | None = TypeError
-    frame_scalar_exc: type[TypeError] | None = TypeError
-    series_array_exc: type[TypeError] | None = TypeError
-    divmod_exc: type[TypeError] | None = TypeError
+    series_scalar_exc: type[Exception] | None = TypeError
+    frame_scalar_exc: type[Exception] | None = TypeError
+    series_array_exc: type[Exception] | None = TypeError
+    divmod_exc: type[Exception] | None = TypeError
 
     def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
         # series & scalar
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 7e0792a6010a7..ef576692c83b6 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -24,6 +24,7 @@
 from pandas.compat import (
     pa_version_under2p0,
     pa_version_under3p0,
+    pa_version_under8p0,
 )
 
 import pandas as pd
@@ -179,6 +180,16 @@ def data_missing_for_sorting(data_for_grouping):
     )
 
 
+@pytest.fixture
+def data_for_twos(data):
+    """Return 100 twos for integer/floating dtypes, else ``data`` unchanged."""
+    arrow_type = data.dtype.pyarrow_dtype
+    if not (pa.types.is_integer(arrow_type) or pa.types.is_floating(arrow_type)):
+        # tests will be xfailed where 2 is not a valid scalar for this type
+        return data
+    return pd.array([2] * 100, dtype=data.dtype)
+
+
 @pytest.fixture
 def na_value():
     """The scalar missing value 
for this type. Default 'None'"""
@@ -1211,6 +1222,20 @@ def test_EA_types(self, engine, data, request):
 
 
 class TestBaseMethods(base.BaseMethodsTests):
+    @pytest.mark.parametrize("periods", [1, -2])
+    def test_diff(self, data, periods, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+        if pa.types.is_unsigned_integer(pa_dtype) and periods == 1:
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowInvalid,
+                    reason=(
+                        f"diff with {pa_dtype} and periods={periods} will overflow"
+                    ),
+                )
+            )
+        super().test_diff(data, periods)
+
     @pytest.mark.parametrize("dropna", [True, False])
     def test_value_counts(self, all_data, dropna, request):
         pa_dtype = all_data.dtype.pyarrow_dtype
@@ -1491,6 +1516,325 @@ def test_where_series(self, data, na_value, as_frame, request, using_array_manag
         super().test_where_series(data, na_value, as_frame)
 
 
+class TestBaseArithmeticOps(base.BaseArithmeticOpsTests):
+
+    divmod_exc = NotImplementedError
+
+    def _patch_combine(self, obj, other, op):
+        """Compute expected via the base _combine, cast to obj's pyarrow type."""
+        # BaseOpsUtil._combine can upcast expected dtype
+        # (because it generates expected on python scalars)
+        # while ArrowExtensionArray maintains original type
+        expected = base.BaseArithmeticOpsTests._combine(self, obj, other, op)
+        was_frame = False
+        if isinstance(expected, pd.DataFrame):
+            was_frame = True
+            expected_data = expected.iloc[:, 0]
+            original_dtype = obj.iloc[:, 0].dtype
+        else:
+            expected_data = expected
+            original_dtype = obj.dtype
+        pa_array = pa.array(expected_data._values).cast(original_dtype.pyarrow_dtype)
+        pd_array = type(expected_data._values)(pa_array)
+        if was_frame:
+            expected = pd.DataFrame(
+                pd_array, index=expected.index, columns=expected.columns
+            )
+        else:
+            expected = pd.Series(pd_array)
+        return expected
+
+    def test_arith_series_with_scalar(
+        self, data, all_arithmetic_operators, request, monkeypatch
+    ):
+        """Pick the expected exception/xfail per pyarrow type, then run base test."""
+        pa_dtype = data.dtype.pyarrow_dtype
+
+        arrow_temporal_supported = not pa_version_under8p0 and (
+            all_arithmetic_operators in ("__add__", "__radd__")
+            and pa.types.is_duration(pa_dtype)
+            or all_arithmetic_operators in ("__sub__", "__rsub__")
+            and pa.types.is_temporal(pa_dtype)
+        )
+        if (
+            all_arithmetic_operators
+            in {
+                "__mod__",
+                "__rmod__",
+            }
+            or pa_version_under2p0
+        ):
+            self.series_scalar_exc = NotImplementedError
+        elif arrow_temporal_supported:
+            self.series_scalar_exc = None
+        elif not (pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype)):
+            self.series_scalar_exc = pa.ArrowNotImplementedError
+        else:
+            self.series_scalar_exc = None
+        if (
+            all_arithmetic_operators == "__rpow__"
+            and (pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype))
+            and not pa_version_under2p0
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    reason=(
+                        f"GH 29997: 1**pandas.NA == 1 while 1**pyarrow.NA == NULL "
+                        f"for {pa_dtype}"
+                    )
+                )
+            )
+        elif arrow_temporal_supported:
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=TypeError,
+                    reason=(
+                        f"{all_arithmetic_operators} not supported between "
+                        f"pd.NA and {pa_dtype} Python scalar"
+                    ),
+                )
+            )
+        elif (
+            all_arithmetic_operators in {"__rtruediv__", "__rfloordiv__"}
+            and (pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype))
+            and not pa_version_under2p0
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowInvalid,
+                    reason="divide by 0",
+                )
+            )
+        if all_arithmetic_operators == "__floordiv__" and pa.types.is_integer(pa_dtype):
+            # BaseOpsUtil._combine always returns int64, while ArrowExtensionArray does
+            # not upcast
+            monkeypatch.setattr(TestBaseArithmeticOps, "_combine", self._patch_combine)
+        super().test_arith_series_with_scalar(data, all_arithmetic_operators)
+
+    def test_arith_frame_with_scalar(
+        self, data, all_arithmetic_operators, request, monkeypatch
+    ):
+        """Pick the expected exception/xfail per pyarrow type, then run base test."""
+        pa_dtype = data.dtype.pyarrow_dtype
+
+        arrow_temporal_supported = not pa_version_under8p0 and (
+            all_arithmetic_operators in ("__add__", "__radd__")
+            and pa.types.is_duration(pa_dtype)
+            or all_arithmetic_operators in ("__sub__", "__rsub__")
+            and pa.types.is_temporal(pa_dtype)
+        )
+        if (
+            all_arithmetic_operators
+            in {
+                "__mod__",
+                "__rmod__",
+            }
+            or pa_version_under2p0
+        ):
+            self.frame_scalar_exc = NotImplementedError
+        elif arrow_temporal_supported:
+            self.frame_scalar_exc = None
+        elif not (pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype)):
+            self.frame_scalar_exc = pa.ArrowNotImplementedError
+        else:
+            self.frame_scalar_exc = None
+        if (
+            all_arithmetic_operators == "__rpow__"
+            and (pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype))
+            and not pa_version_under2p0
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    reason=(
+                        f"GH 29997: 1**pandas.NA == 1 while 1**pyarrow.NA == NULL "
+                        f"for {pa_dtype}"
+                    )
+                )
+            )
+        elif arrow_temporal_supported:
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=TypeError,
+                    reason=(
+                        f"{all_arithmetic_operators} not supported between "
+                        f"pd.NA and {pa_dtype} Python scalar"
+                    ),
+                )
+            )
+        elif (
+            all_arithmetic_operators in {"__rtruediv__", "__rfloordiv__"}
+            and (pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype))
+            and not pa_version_under2p0
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowInvalid,
+                    reason="divide by 0",
+                )
+            )
+        if all_arithmetic_operators == "__floordiv__" and pa.types.is_integer(pa_dtype):
+            # BaseOpsUtil._combine always returns int64, while ArrowExtensionArray does
+            # not upcast
+            monkeypatch.setattr(TestBaseArithmeticOps, "_combine", self._patch_combine)
+        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
+
+    def test_arith_series_with_array(
+        self, data, all_arithmetic_operators, request, monkeypatch
+    ):
+        """Pick the expected exception/xfail per pyarrow type, then check the op."""
+        pa_dtype = data.dtype.pyarrow_dtype
+
+        arrow_temporal_supported = not pa_version_under8p0 and (
+            all_arithmetic_operators in ("__add__", "__radd__")
+            and pa.types.is_duration(pa_dtype)
+            or all_arithmetic_operators in ("__sub__", "__rsub__")
+            and pa.types.is_temporal(pa_dtype)
+        )
+        if (
+            all_arithmetic_operators
+            in {
+                "__mod__",
+                "__rmod__",
+            }
+            or pa_version_under2p0
+        ):
+            self.series_array_exc = NotImplementedError
+        elif arrow_temporal_supported:
+            self.series_array_exc = None
+        elif not (pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype)):
+            self.series_array_exc = pa.ArrowNotImplementedError
+        else:
+            self.series_array_exc = None
+        if (
+            all_arithmetic_operators == "__rpow__"
+            and (pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype))
+            and not pa_version_under2p0
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    reason=(
+                        f"GH 29997: 1**pandas.NA == 1 while 1**pyarrow.NA == NULL "
+                        f"for {pa_dtype}"
+                    )
+                )
+            )
+        elif (
+            all_arithmetic_operators
+            in (
+                "__sub__",
+                "__rsub__",
+            )
+            and pa.types.is_unsigned_integer(pa_dtype)
+            and not pa_version_under2p0
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowInvalid,
+                    reason=(
+                        f"Implemented pyarrow.compute.subtract_checked "
+                        f"which raises on overflow for {pa_dtype}"
+                    ),
+                )
+            )
+        elif arrow_temporal_supported:
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=TypeError,
+                    reason=(
+                        f"{all_arithmetic_operators} not supported between "
+                        f"pd.NA and {pa_dtype} Python scalar"
+                    ),
+                )
+            )
+        elif (
+            all_arithmetic_operators in {"__rtruediv__", "__rfloordiv__"}
+            and (pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype))
+            and not pa_version_under2p0
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowInvalid,
+                    reason="divide by 0",
+                )
+            )
+        op_name = all_arithmetic_operators
+        ser = pd.Series(data)
+        # pd.Series([ser.iloc[0]] * len(ser)) may not return ArrowExtensionArray
+        # since ser.iloc[0] is a python scalar
+        other = pd.Series(pd.array([ser.iloc[0]] * len(ser), dtype=data.dtype))
+        if pa.types.is_floating(pa_dtype) or (
+            pa.types.is_integer(pa_dtype) and all_arithmetic_operators != "__truediv__"
+        ):
+            monkeypatch.setattr(TestBaseArithmeticOps, "_combine", self._patch_combine)
+        self.check_opname(ser, op_name, other, exc=self.series_array_exc)
+
+    def test_add_series_with_extension_array(self, data, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+        if (
+            not (
+                pa.types.is_integer(pa_dtype)
+                or pa.types.is_floating(pa_dtype)
+                or (not pa_version_under8p0 and pa.types.is_duration(pa_dtype))
+            )
+            or pa_version_under2p0
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=NotImplementedError,
+                    reason=f"add_checked not implemented for {pa_dtype}",
+                )
+            )
+        super().test_add_series_with_extension_array(data)
+
+
+class TestBaseComparisonOps(base.BaseComparisonOpsTests):
+    def assert_series_equal(self, left, right, *args, **kwargs):
+        # Series.combine for "expected" retains bool[pyarrow] dtype
+        # While "result" return "boolean" dtype
+        right = pd.Series(right._values.to_numpy(), dtype="boolean")
+        super().assert_series_equal(left, right, *args, **kwargs)
+
+    def test_compare_array(self, data, comparison_op, na_value, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+        ser = pd.Series(data)
+        # pd.Series([ser.iloc[0]] * len(ser)) may not return ArrowExtensionArray
+        # since ser.iloc[0] is a python scalar
+        other = pd.Series(pd.array([ser.iloc[0]] * len(ser), dtype=data.dtype))
+        if comparison_op.__name__ in ["eq", "ne"]:
+            # comparison should match point-wise comparisons
+            result = comparison_op(ser, other)
+            # Series.combine does not calculate the NA mask correctly
+            # when comparing over an array
+            assert result[8] is na_value
+            assert result[97] is na_value
+            expected = ser.combine(other, comparison_op)
+            expected[8] = na_value
+            expected[97] = na_value
+            self.assert_series_equal(result, expected)
+
+        else:
+            exc = None
+            try:
+                result = comparison_op(ser, other)
+            except Exception as err:
+                exc = err
+
+            if exc is None:
+                # Didn't error, then should match point-wise behavior
+                if pa.types.is_temporal(pa_dtype):
+                    # point-wise comparison with pd.NA raises TypeError
+                    assert result[8] is na_value
+                    assert result[97] is na_value
+                    result = result.drop([8, 97]).reset_index(drop=True)
+                    ser = ser.drop([8, 97])
+                    other = other.drop([8, 97])
+                expected = ser.combine(other, comparison_op)
+                self.assert_series_equal(result, expected)
+            else:
+                with pytest.raises(type(exc)):
+                    ser.combine(other, comparison_op)
+
+
 def test_arrowdtype_construct_from_string_type_with_unsupported_parameters():
     with pytest.raises(NotImplementedError, match="Passing pyarrow type"):
         ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]")
diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py
index 974ecc152f17b..d76ed65be9e1b 100644
--- a/pandas/tests/strings/test_api.py
+++ b/pandas/tests/strings/test_api.py
@@ -132,7 +132,7 @@ def test_api_for_categorical(any_string_method, any_string_dtype, request):
         any_string_dtype == "string" and get_option("string_storage") == "pyarrow"
     ):
         # unsupported operand type(s) for +: 'ArrowStringArray' and 'str'
-        mark = pytest.mark.xfail(raises=TypeError, reason="Not Implemented")
+        mark = pytest.mark.xfail(raises=NotImplementedError, reason="Not Implemented")
         request.node.add_marker(mark)
 
     s = Series(list("aabb"), dtype=any_string_dtype)