Skip to content

BUG/TST: Include sem & count in all_numeric_reductions #49759

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Nov 28, 2022
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -645,7 +645,7 @@ Numeric
^^^^^^^
- Bug in :meth:`DataFrame.add` cannot apply ufunc when inputs contain mixed DataFrame type and Series type (:issue:`39853`)
- Bug in DataFrame reduction methods (e.g. :meth:`DataFrame.sum`) with object dtype, ``axis=1`` and ``numeric_only=False`` would not be coerced to float (:issue:`49551`)
-
- Bug in :meth:`DataFrame.sem` and :meth:`Series.sem` where an erroneous ``TypeError`` would always raise when using data backed by an :class:`ArrowDtype` (:issue:`49759`)

Conversion
^^^^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1034,6 +1034,7 @@ def all_arithmetic_functions(request):


_all_numeric_reductions = [
"count",
"sum",
"max",
"min",
Expand All @@ -1044,6 +1045,7 @@ def all_arithmetic_functions(request):
"median",
"kurt",
"skew",
"sem",
]


Expand Down
10 changes: 3 additions & 7 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,13 +829,9 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
"""
if name == "sem":

def pyarrow_meth(data, skipna, **kwargs):
numerator = pc.stddev(data, skip_nulls=skipna, **kwargs)
denominator = pc.sqrt_checked(
pc.subtract_checked(
pc.count(self._data, skip_nulls=skipna), kwargs["ddof"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is kwargs['ddof'] not relevant here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I suppose this only applies in the stddev call.

When comparing to ser.astype("Float64").sem() the results were off, and removing kwargs["ddof"] aligned this pyarrow sem to nullable dtype sem.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the test changes all look good. this change i dont know enough to form an opinion on. if youre confident its right then LGTM. otherwise lets find an appropriate person to double-check

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think based on the nanops.nansem implementation, I'm fairly sure these align

pandas/pandas/core/nanops.py

Lines 1033 to 1042 in 3fffb6d

nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
mask = _maybe_get_mask(values, skipna, mask)
if not is_float_dtype(values.dtype):
values = values.astype("f8")
count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof)
return np.sqrt(var) / np.sqrt(count)

)
)
def pyarrow_meth(data, skip_nulls, **kwargs):
numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
denominator = pc.sqrt_checked(pc.count(self._data))
return pc.divide_checked(numerator, denominator)

else:
Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/arrays/boolean/test_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,11 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions):
if dropna:
s = s.dropna()

if op == "sum":
assert isinstance(getattr(s, op)(), np.int_)
elif op == "prod":
if op in ("sum", "prod"):
assert isinstance(getattr(s, op)(), np.int_)
elif op == "count":
# Oddly on the 32 bit build (but not Windows), this is intc (!= intp)
assert isinstance(getattr(s, op)(), np.integer)
elif op in ("min", "max"):
assert isinstance(getattr(s, op)(), np.bool_)
else:
Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/extension/base/reduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,14 @@ class BaseReduceTests(BaseExtensionTests):
"""

def check_reduce(self, s, op_name, skipna):
result = getattr(s, op_name)(skipna=skipna)
expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
res_op = getattr(s, op_name)
exp_op = getattr(s.astype("float64"), op_name)
if op_name == "count":
result = res_op()
expected = exp_op()
else:
result = res_op(skipna=skipna)
expected = exp_op(skipna=skipna)
tm.assert_almost_equal(result, expected)


Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,14 @@ class TestMissing(base.BaseMissingTests):
class Reduce:
def check_reduce(self, s, op_name, skipna):

if op_name in ["median", "skew", "kurt"]:
if op_name in ["median", "skew", "kurt", "sem"]:
msg = r"decimal does not support the .* operation"
with pytest.raises(NotImplementedError, match=msg):
getattr(s, op_name)(skipna=skipna)

elif op_name == "count":
result = getattr(s, op_name)()
expected = len(s) - s.isna().sum()
tm.assert_almost_equal(result, expected)
else:
result = getattr(s, op_name)(skipna=skipna)
expected = getattr(np.asarray(s), op_name)()
Expand Down
40 changes: 28 additions & 12 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,15 +346,21 @@ def test_getitem_scalar(self, data):
class TestBaseNumericReduce(base.BaseNumericReduceTests):
def check_reduce(self, ser, op_name, skipna):
pa_dtype = ser.dtype.pyarrow_dtype
result = getattr(ser, op_name)(skipna=skipna)
if op_name == "count":
result = getattr(ser, op_name)()
else:
result = getattr(ser, op_name)(skipna=skipna)
if pa.types.is_boolean(pa_dtype):
# Can't convert if ser contains NA
pytest.skip(
"pandas boolean data with NA does not fully support all reductions"
)
elif pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype):
ser = ser.astype("Float64")
expected = getattr(ser, op_name)(skipna=skipna)
if op_name == "count":
expected = getattr(ser, op_name)()
else:
expected = getattr(ser, op_name)(skipna=skipna)
tm.assert_almost_equal(result, expected)

@pytest.mark.parametrize("skipna", [True, False])
Expand All @@ -374,6 +380,8 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request):
and pa_version_under6p0
):
request.node.add_marker(xfail_mark)
elif all_numeric_reductions == "sem" and pa_version_under8p0:
request.node.add_marker(xfail_mark)
elif (
all_numeric_reductions in {"sum", "mean"}
and skipna is False
Expand All @@ -389,20 +397,28 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request):
),
)
)
elif not (
pa.types.is_integer(pa_dtype)
or pa.types.is_floating(pa_dtype)
or pa.types.is_boolean(pa_dtype)
) and not (
all_numeric_reductions in {"min", "max"}
and (
(pa.types.is_temporal(pa_dtype) and not pa.types.is_duration(pa_dtype))
or pa.types.is_string(pa_dtype)
or pa.types.is_binary(pa_dtype)
elif (
not (
pa.types.is_integer(pa_dtype)
or pa.types.is_floating(pa_dtype)
or pa.types.is_boolean(pa_dtype)
)
and not (
all_numeric_reductions in {"min", "max"}
and (
(
pa.types.is_temporal(pa_dtype)
and not pa.types.is_duration(pa_dtype)
)
or pa.types.is_string(pa_dtype)
or pa.types.is_binary(pa_dtype)
)
)
and not all_numeric_reductions == "count"
):
request.node.add_marker(xfail_mark)
elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in {
"sem",
"std",
"var",
"median",
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,8 +367,12 @@ def test_groupby_sum_mincount(self, data_for_grouping, min_count):

class TestNumericReduce(base.BaseNumericReduceTests):
def check_reduce(self, s, op_name, skipna):
result = getattr(s, op_name)(skipna=skipna)
expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
if op_name == "count":
result = getattr(s, op_name)()
expected = getattr(s.astype("float64"), op_name)()
else:
result = getattr(s, op_name)(skipna=skipna)
expected = getattr(s.astype("float64"), op_name)(skipna=skipna)
# override parent function to cast to bool for min/max
if np.isnan(expected):
expected = pd.NA
Expand Down
9 changes: 6 additions & 3 deletions pandas/tests/extension/test_floating.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,13 +188,16 @@ class TestNumericReduce(base.BaseNumericReduceTests):
def check_reduce(self, s, op_name, skipna):
# overwrite to ensure pd.NA is tested instead of np.nan
# https://github.com/pandas-dev/pandas/issues/30958
result = getattr(s, op_name)(skipna=skipna)
if not skipna and s.isna().any():
expected = pd.NA
if op_name == "count":
result = getattr(s, op_name)()
expected = getattr(s.dropna().astype(s.dtype.numpy_dtype), op_name)()
else:
result = getattr(s, op_name)(skipna=skipna)
expected = getattr(s.dropna().astype(s.dtype.numpy_dtype), op_name)(
skipna=skipna
)
if not skipna and s.isna().any():
expected = pd.NA
tm.assert_almost_equal(result, expected)


Expand Down
9 changes: 6 additions & 3 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,11 +211,14 @@ class TestNumericReduce(base.BaseNumericReduceTests):
def check_reduce(self, s, op_name, skipna):
# overwrite to ensure pd.NA is tested instead of np.nan
# https://github.com/pandas-dev/pandas/issues/30958
result = getattr(s, op_name)(skipna=skipna)
if not skipna and s.isna().any():
expected = pd.NA
if op_name == "count":
result = getattr(s, op_name)()
expected = getattr(s.dropna().astype("int64"), op_name)()
else:
result = getattr(s, op_name)(skipna=skipna)
expected = getattr(s.dropna().astype("int64"), op_name)(skipna=skipna)
if not skipna and s.isna().any():
expected = pd.NA
tm.assert_almost_equal(result, expected)


Expand Down
22 changes: 0 additions & 22 deletions pandas/tests/frame/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,25 +259,3 @@ def frame_of_index_cols():
}
)
return df


@pytest.fixture(
params=[
"any",
"all",
"count",
"sum",
"prod",
"max",
"min",
"mean",
"median",
"skew",
"kurt",
"sem",
"var",
"std",
]
)
def reduction_functions(request):
return request.param
6 changes: 3 additions & 3 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1427,16 +1427,16 @@ def test_frame_any_with_timedelta(self):
tm.assert_series_equal(result, expected)

def test_reductions_skipna_none_raises(
self, request, frame_or_series, reduction_functions
self, request, frame_or_series, all_reductions
):
if reduction_functions == "count":
if all_reductions == "count":
request.node.add_marker(
pytest.mark.xfail(reason="Count does not accept skipna")
)
obj = frame_or_series([1, 2, 3])
msg = 'For argument "skipna" expected type bool, received type NoneType.'
with pytest.raises(ValueError, match=msg):
getattr(obj, reduction_functions)(skipna=None)
getattr(obj, all_reductions)(skipna=None)


class TestNuisanceColumns:
Expand Down