ENH/TST: Add Reduction tests for ArrowExtensionArray #47730


Merged (12 commits) on Jul 22, 2022
pandas/core/arrays/arrow/array.py (63 additions, 0 deletions)
@@ -628,6 +628,69 @@ def _concat_same_type(
arr = pa.chunked_array(chunks)
return cls(arr)

def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
[Contributor review comment: could type the return]

"""
Return a scalar result of performing the reduction operation.

Parameters
----------
name : str
Name of the function, supported values are:
{ any, all, min, max, sum, mean, median, prod,
std, var, sem, kurt, skew }.
skipna : bool, default True
If True, skip NaN values.
**kwargs
Additional keyword arguments passed to the reduction function.
Currently, `ddof` is the only supported kwarg.

Returns
-------
scalar

Raises
------
TypeError : subclass does not define reductions
"""
if name == "sem":

def pyarrow_meth(data, skipna, **kwargs):
numerator = pc.stddev(data, skip_nulls=skipna, **kwargs)
denominator = pc.sqrt_checked(
pc.subtract_checked(
pc.count(self._data, skip_nulls=skipna), kwargs["ddof"]
)
)
return pc.divide_checked(numerator, denominator)

else:
pyarrow_name = {
"median": "approximate_median",
"prod": "product",
"std": "stddev",
"var": "variance",
}.get(name, name)
# error: Incompatible types in assignment
# (expression has type "Optional[Any]", variable has type
# "Callable[[Any, Any, KwArg(Any)], Any]")
pyarrow_meth = getattr(pc, pyarrow_name, None) # type: ignore[assignment]
if pyarrow_meth is None:
# Let ExtensionArray._reduce raise the TypeError
return super()._reduce(name, skipna=skipna, **kwargs)
try:
result = pyarrow_meth(self._data, skip_nulls=skipna, **kwargs)
except (AttributeError, NotImplementedError, TypeError) as err:
msg = (
f"'{type(self).__name__}' with dtype {self.dtype} "
f"does not support reduction '{name}' with pyarrow "
f"version {pa.__version__}. '{name}' may be supported by "
f"upgrading pyarrow."
)
raise TypeError(msg) from err
if pc.is_null(result).as_py():
return self.dtype.na_value
return result.as_py()

def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
"""Set one or more values inplace.

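A minimal usage sketch (not part of the diff) of how the new reductions surface through the Series API. It assumes a pandas build containing this change with `pd.ArrowDtype` available and a reasonably recent pyarrow; since pyarrow has no dedicated `sem` kernel, the branch above assembles it as `stddev / sqrt(valid_count - ddof)`.

```python
import pandas as pd
import pyarrow as pa

ser = pd.Series([1, 2, None, 4], dtype=pd.ArrowDtype(pa.int64()))

ser.sum()        # dispatches to pyarrow.compute.sum -> 7
ser.mean()       # pyarrow.compute.mean
ser.median()     # mapped to pyarrow.compute.approximate_median
ser.std(ddof=1)  # mapped to pyarrow.compute.stddev, ddof passed through
ser.sem()        # assembled as stddev / sqrt(count - ddof), see the "sem" branch
```

When the pyarrow result is null (for example `ser.sum(skipna=False)` here), `_reduce` returns `self.dtype.na_value`, so callers see `pd.NA` rather than a pyarrow null scalar.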
pandas/tests/arrays/string_/test_string.py (6 additions, 3 deletions)
@@ -5,7 +5,10 @@
import numpy as np
import pytest

-from pandas.compat import pa_version_under2p0
+from pandas.compat import (
+    pa_version_under2p0,
+    pa_version_under6p0,
+)
from pandas.errors import PerformanceWarning
import pandas.util._test_decorators as td

@@ -375,7 +378,7 @@ def test_reduce_missing(skipna, dtype):
@pytest.mark.parametrize("method", ["min", "max"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max(method, skipna, dtype, request):
-if dtype.storage == "pyarrow":
+if dtype.storage == "pyarrow" and pa_version_under6p0:
reason = "'ArrowStringArray' object has no attribute 'max'"
mark = pytest.mark.xfail(raises=TypeError, reason=reason)
request.node.add_marker(mark)
@@ -392,7 +395,7 @@ def test_min_max_numpy(method, box, dtype, request):
@pytest.mark.parametrize("method", ["min", "max"])
@pytest.mark.parametrize("box", [pd.Series, pd.array])
def test_min_max_numpy(method, box, dtype, request):
-if dtype.storage == "pyarrow":
+if dtype.storage == "pyarrow" and (pa_version_under6p0 or box is pd.array):
if box is pd.array:
reason = "'<=' not supported between instances of 'str' and 'NoneType'"
else:
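The relaxed xfail conditions reflect that, with pyarrow >= 6.0, `min`/`max` on pyarrow-backed string arrays go through the shared reduction path instead of raising. A sketch of the behaviour now covered (assuming this branch plus pyarrow >= 6.0):

```python
import pandas as pd

s = pd.Series(["a", "b", None], dtype="string[pyarrow]")

s.max(skipna=True)   # "b" -- previously xfailed with TypeError
s.min(skipna=False)  # pd.NA, because the null is not skipped
```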
pandas/tests/extension/test_arrow.py (90 additions, 0 deletions)
@@ -24,6 +24,7 @@
from pandas.compat import (
pa_version_under2p0,
pa_version_under3p0,
pa_version_under6p0,
pa_version_under8p0,
)

@@ -303,6 +304,95 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data):
super().test_loc_iloc_frame_single_dtype(data)


class TestBaseNumericReduce(base.BaseNumericReduceTests):
def check_reduce(self, ser, op_name, skipna):
pa_dtype = ser.dtype.pyarrow_dtype
result = getattr(ser, op_name)(skipna=skipna)
if pa.types.is_boolean(pa_dtype):
# Can't convert if ser contains NA
pytest.skip(
"pandas boolean data with NA does not fully support all reductions"
)
elif pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype):
ser = ser.astype("Float64")
expected = getattr(ser, op_name)(skipna=skipna)
tm.assert_almost_equal(result, expected)

@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_series(self, data, all_numeric_reductions, skipna, request):
pa_dtype = data.dtype.pyarrow_dtype
xfail_mark = pytest.mark.xfail(
raises=TypeError,
reason=(
f"{all_numeric_reductions} is not implemented in "
f"pyarrow={pa.__version__} for {pa_dtype}"
),
)
if all_numeric_reductions in {"skew", "kurt"}:
request.node.add_marker(xfail_mark)
elif (
all_numeric_reductions in {"median", "var", "std", "prod", "max", "min"}
and pa_version_under6p0
):
request.node.add_marker(xfail_mark)
elif all_numeric_reductions in {"sum", "mean"} and pa_version_under2p0:
request.node.add_marker(xfail_mark)
elif (
all_numeric_reductions in {"sum", "mean"}
and skipna is False
and pa_version_under6p0
and (pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype))
):
request.node.add_marker(
pytest.mark.xfail(
raises=AssertionError,
reason=(
f"{all_numeric_reductions} with skip_nulls={skipna} did not "
f"return NA for {pa_dtype} with pyarrow={pa.__version__}"
),
)
)
elif not (
pa.types.is_integer(pa_dtype)
or pa.types.is_floating(pa_dtype)
or pa.types.is_boolean(pa_dtype)
) and not (
all_numeric_reductions in {"min", "max"}
and (pa.types.is_temporal(pa_dtype) and not pa.types.is_duration(pa_dtype))
):
request.node.add_marker(xfail_mark)
elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in {
"std",
"var",
"median",
}:
request.node.add_marker(xfail_mark)
super().test_reduce_series(data, all_numeric_reductions, skipna)
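
The comparison strategy used in `check_reduce` above can be shown standalone: the pyarrow-backed result is validated against the same reduction on the equivalent masked `Float64` data (a sketch, assuming `pd.ArrowDtype` is available):

```python
import pandas as pd
import pyarrow as pa
import pandas._testing as tm

ser = pd.Series([1, 2, None], dtype=pd.ArrowDtype(pa.int64()))

result = ser.mean(skipna=True)                      # pyarrow compute path
expected = ser.astype("Float64").mean(skipna=True)  # masked-array path
tm.assert_almost_equal(result, expected)
```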


class TestBaseBooleanReduce(base.BaseBooleanReduceTests):
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_series(
self, data, all_boolean_reductions, skipna, na_value, request
):
pa_dtype = data.dtype.pyarrow_dtype
xfail_mark = pytest.mark.xfail(
raises=TypeError,
reason=(
f"{all_boolean_reductions} is not implemented in "
f"pyarrow={pa.__version__} for {pa_dtype}"
),
)
if not pa.types.is_boolean(pa_dtype):
request.node.add_marker(xfail_mark)
elif pa_version_under3p0:
request.node.add_marker(xfail_mark)
op_name = all_boolean_reductions
s = pd.Series(data)
result = getattr(s, op_name)(skipna=skipna)
assert result is (op_name == "any")
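
The assertion above relies on the fixture containing both True and False values, so `any` is True and `all` is False for either `skipna`; with `skip_nulls=False`, pyarrow (>= 3.0, per the `pa_version_under3p0` gate) applies Kleene logic, so a concrete True/False still decides the result. A sketch of what is being asserted:

```python
import pandas as pd
import pyarrow as pa

s = pd.Series([True, False, None], dtype=pd.ArrowDtype(pa.bool_()))

s.any(skipna=True)   # True  -> pyarrow.compute.any
s.all(skipna=False)  # False -> the concrete False decides under Kleene logic
```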


class TestBaseGroupby(base.BaseGroupbyTests):
def test_groupby_agg_extension(self, data_for_grouping, request):
tz = getattr(data_for_grouping.dtype.pyarrow_dtype, "tz", None)