From 404a3c70766c64e6fd6d9fc56816208380ea0dbb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Jun 2021 07:54:43 -0700 Subject: [PATCH] Backport PR #41711: REGR: DataFrame reduction with min_count --- doc/source/whatsnew/v1.2.5.rst | 1 + pandas/core/frame.py | 3 +-- pandas/core/internals/blocks.py | 2 +- pandas/core/nanops.py | 3 +-- pandas/tests/frame/test_reductions.py | 29 ++++++++++++++++++++------- 5 files changed, 26 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index e936519383520..500030e1304c6 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) +- Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`) - Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`) - Regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4c156d7470364..92892ac0f26e0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8786,7 +8786,6 @@ def _reduce( **kwds, ): - min_count = kwds.get("min_count", 0) assert filter_type is None or filter_type == "bool", filter_type out_dtype = "bool" if filter_type == "bool" else None @@ -8831,7 +8830,7 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - if (numeric_only is not None or axis == 0) and min_count == 0: + if numeric_only is not None or axis == 0: # For numeric_only non-None and axis non-None, we know # which blocks to use and no try/except is needed. # For numeric_only=None only the case with axis==0 and no object diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 523e19f6043da..a38b7a19dc80a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -391,7 +391,7 @@ def reduce(self, func, ignore_failures: bool = False) -> List["Block"]: return [] raise - if np.ndim(result) == 0: + if self.values.ndim == 1: # TODO(EA2D): special case not needed with 2D EAs res_values = np.array([[result]]) else: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index edc1b1e96509e..20adcee924a15 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -231,8 +231,7 @@ def _maybe_get_mask( """ if mask is None: if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype): - # Boolean data cannot contain nulls, so signal via mask being None - return None + return np.broadcast_to(False, values.shape) if skipna or needs_i8_conversion(values.dtype): mask = isna(values) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index cb481613eb97f..b6eccc6999dec 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1,5 +1,6 @@ from datetime import timedelta from decimal import Decimal +import re from dateutil.tz import tzlocal import numpy as np @@ -783,34 +784,35 @@ def test_sum_corner(self): assert len(axis1) == 0 @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) - def test_sum_prod_nanops(self, method, unit): + @pytest.mark.parametrize("numeric_only", [None, True, False]) + def test_sum_prod_nanops(self, method, unit, numeric_only): idx = ["a", "b", "c"] df = DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]}) # The default - result = getattr(df, method) + result = getattr(df, method)(numeric_only=numeric_only) expected = Series([unit, unit, unit], index=idx, dtype="float64") # min_count=1 - result = getattr(df, method)(min_count=1) + result = getattr(df, method)(numeric_only=numeric_only, min_count=1) expected = Series([unit, unit, np.nan], index=idx) tm.assert_series_equal(result, expected) # min_count=0 - result = getattr(df, method)(min_count=0) + result = getattr(df, method)(numeric_only=numeric_only, min_count=0) expected = Series([unit, unit, unit], index=idx, dtype="float64") tm.assert_series_equal(result, expected) - result = getattr(df.iloc[1:], method)(min_count=1) + result = getattr(df.iloc[1:], method)(numeric_only=numeric_only, min_count=1) expected = Series([unit, np.nan, np.nan], index=idx) tm.assert_series_equal(result, expected) # min_count > 1 df = DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) - result = getattr(df, method)(min_count=5) + result = getattr(df, method)(numeric_only=numeric_only, min_count=5) expected = Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) - result = getattr(df, method)(min_count=6) + result = getattr(df, method)(numeric_only=numeric_only, min_count=6) expected = Series(result, index=["A", "B"]) tm.assert_series_equal(result, expected) @@ -1491,3 +1493,16 @@ def test_minmax_extensionarray(method, numeric_only): [getattr(int64_info, method)], index=Index(["Int64"], dtype="object") ) tm.assert_series_equal(result, expected) + + +def test_prod_sum_min_count_mixed_object(): + # https://github.com/pandas-dev/pandas/issues/41074 + df = DataFrame([1, "a", True]) + + result = df.prod(axis=0, min_count=1, numeric_only=False) + expected = Series(["a"]) + tm.assert_series_equal(result, expected) + + msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'") + with pytest.raises(TypeError, match=msg): + df.sum(axis=0, min_count=1, numeric_only=False)