diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 54d8ba1edea39..d848413d7193c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -284,6 +284,63 @@ of columns could result in a larger :class:`Series` result. See (:issue:`37799`) In [6]: df[["B", "C"]].all(bool_only=True) +Other :class:`DataFrame` reductions with ``numeric_only=None`` will also avoid +this pathological behavior (:issue:`37827`): + +.. ipython:: python + + df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object) + + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.mean() + Out[3]: Series([], dtype: float64) + + In [4]: df[["A"]].mean() + Out[4]: + A 1.0 + dtype: float64 + +*New behavior*: + +.. ipython:: python + + df.mean() + + df[["A"]].mean() + +Moreover, :class:`DataFrame` reductions with ``numeric_only=None`` will now be +consistent with their :class:`Series` counterparts. In particular, for +reductions where the :class:`Series` method raises ``TypeError``, the +:class:`DataFrame` reduction will now consider that column non-numeric +instead of casting to NumPy which may have different semantics (:issue:`36076`, +:issue:`28949`, :issue:`21020`). + +.. ipython:: python + + ser = pd.Series([0, 1], dtype="category", name="A") + df = ser.to_frame() + + +*Previous behavior*: + +.. code-block:: ipython + + In [5]: df.any() + Out[5]: + A True + dtype: bool + +*New behavior*: + +.. ipython:: python + + df.any() + + .. _whatsnew_120.api_breaking.python: Increased minimum version for Python diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bae06339a1e60..4310d23e4d7f3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8765,7 +8765,7 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - if numeric_only is not None: + if numeric_only is not None or axis == 0: # For numeric_only non-None and axis non-None, we know # which blocks to use and no try/except is needed. # For numeric_only=None only the case with axis==0 and no object @@ -8790,36 +8790,14 @@ def _get_data() -> DataFrame: # GH#35865 careful to cast explicitly to object nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[np.sort(indexer)]) out[:] = np.array(nvs, dtype=object) + if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: + # Even if we are object dtype, follow numpy and return + # float64, see test_apply_funcs_over_empty + out = out.astype(np.float64) return out assert numeric_only is None - if not self._is_homogeneous_type or self._mgr.any_extension_types: - # try to avoid self.values call - - if filter_type is None and axis == 0: - # operate column-wise - - # numeric_only must be None here, as other cases caught above - - # this can end up with a non-reduction - # but not always. if the types are mixed - # with datelike then need to make sure a series - - # we only end up here if we have not specified - # numeric_only and yet we have tried a - # column-by-column reduction, where we have mixed type. - # So let's just do what we can - from pandas.core.apply import frame_apply - - opa = frame_apply( - self, func=func, result_type="expand", ignore_failures=True - ) - result = opa.get_result() - if result.ndim == self.ndim: - result = result.iloc[0].rename(None) - return result - data = self values = data.values diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 92f6bb6f1cbdd..967e218078a28 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -464,7 +464,9 @@ def _split(self) -> List["Block"]: new_blocks.append(nb) return new_blocks - def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: + def split_and_operate( + self, mask, f, inplace: bool, ignore_failures: bool = False + ) -> List["Block"]: """ split the block per-column, and apply the callable f per-column, return a new block for each. Handle @@ -474,7 +476,8 @@ def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: ---------- mask : 2-d boolean mask f : callable accepting (1d-mask, 1d values, indexer) - inplace : boolean + inplace : bool + ignore_failures : bool, default False Returns ------- @@ -513,8 +516,16 @@ def make_a_block(nv, ref_loc): v = new_values[i] # need a new block - if m.any(): - nv = f(m, v, i) + if m.any() or m.size == 0: + # Apply our function; we may ignore_failures if this is a + # reduction that is dropping nuisance columns GH#37827 + try: + nv = f(m, v, i) + except TypeError: + if ignore_failures: + continue + else: + raise else: nv = v if inplace else v.copy() @@ -2459,7 +2470,9 @@ def mask_func(mask, values, inplace): values = values.reshape(1, -1) return func(values) - return self.split_and_operate(None, mask_func, False) + return self.split_and_operate( + None, mask_func, False, ignore_failures=ignore_failures + ) try: res = func(values) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 95cb770a11408..299f00e818105 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -12,6 +12,7 @@ from pandas import ( Categorical, DataFrame, + Index, MultiIndex, Series, Timestamp, @@ -1083,10 +1084,12 @@ def test_any_all_bool_only(self): pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True), pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True), pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True), - (np.all, {"A": Series([0, 1], dtype="category")}, False), - (np.any, {"A": Series([0, 1], dtype="category")}, True), + # np.all on Categorical raises, so the reduction drops the + # column, so all is being done on an empty Series, so is True + (np.all, {"A": Series([0, 1], dtype="category")}, True), + (np.any, {"A": Series([0, 1], dtype="category")}, False), (np.all, {"A": Series([1, 2], dtype="category")}, True), - (np.any, {"A": Series([1, 2], dtype="category")}, True), + (np.any, {"A": Series([1, 2], dtype="category")}, False), # Mix GH#21484 pytest.param( np.all, @@ -1308,6 +1311,114 @@ def test_frame_any_with_timedelta(self): tm.assert_series_equal(result, expected) +class TestNuisanceColumns: + @pytest.mark.parametrize("method", ["any", "all"]) + def test_any_all_categorical_dtype_nuisance_column(self, method): + # GH#36076 DataFrame should match Series behavior + ser = Series([0, 1], dtype="category", name="A") + df = ser.to_frame() + + # Double-check the Series behavior is to raise + with pytest.raises(TypeError, match="does not implement reduction"): + getattr(ser, method)() + + with pytest.raises(TypeError, match="does not implement reduction"): + getattr(np, method)(ser) + + with pytest.raises(TypeError, match="does not implement reduction"): + getattr(df, method)(bool_only=False) + + # With bool_only=None, operating on this column raises and is ignored, + # so we expect an empty result. + result = getattr(df, method)(bool_only=None) + expected = Series([], index=Index([]), dtype=bool) + tm.assert_series_equal(result, expected) + + result = getattr(np, method)(df, axis=0) + tm.assert_series_equal(result, expected) + + def test_median_categorical_dtype_nuisance_column(self): + # GH#21020 DataFrame.median should match Series.median + df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])}) + ser = df["A"] + + # Double-check the Series behavior is to raise + with pytest.raises(TypeError, match="does not implement reduction"): + ser.median() + + with pytest.raises(TypeError, match="does not implement reduction"): + df.median(numeric_only=False) + + result = df.median() + expected = Series([], index=Index([]), dtype=np.float64) + tm.assert_series_equal(result, expected) + + # same thing, but with an additional non-categorical column + df["B"] = df["A"].astype(int) + + with pytest.raises(TypeError, match="does not implement reduction"): + df.median(numeric_only=False) + + result = df.median() + expected = Series([2.0], index=["B"]) + tm.assert_series_equal(result, expected) + + # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead + # of expected.values + + @pytest.mark.parametrize("method", ["min", "max"]) + def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method): + # GH#28949 DataFrame.min should behave like Series.min + cat = Categorical(["a", "b", "c", "b"], ordered=False) + ser = Series(cat) + df = ser.to_frame("A") + + # Double-check the Series behavior + with pytest.raises(TypeError, match="is not ordered for operation"): + getattr(ser, method)() + + with pytest.raises(TypeError, match="is not ordered for operation"): + getattr(np, method)(ser) + + with pytest.raises(TypeError, match="is not ordered for operation"): + getattr(df, method)(numeric_only=False) + + result = getattr(df, method)() + expected = Series([], index=Index([]), dtype=np.float64) + tm.assert_series_equal(result, expected) + + result = getattr(np, method)(df) + tm.assert_series_equal(result, expected) + + # same thing, but with an additional non-categorical column + df["B"] = df["A"].astype(object) + result = getattr(df, method)() + if method == "min": + expected = Series(["a"], index=["B"]) + else: + expected = Series(["c"], index=["B"]) + tm.assert_series_equal(result, expected) + + result = getattr(np, method)(df) + tm.assert_series_equal(result, expected) + + def test_reduction_object_block_splits_nuisance_columns(self): + # GH#37827 + df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object) + + # We should only exclude "B", not "A" + result = df.mean() + expected = Series([1.0], index=["A"]) + tm.assert_series_equal(result, expected) + + # Same behavior but heterogeneous dtype + df["C"] = df["A"].astype(int) + 4 + + result = df.mean() + expected = Series([1.0, 5.0], index=["A", "C"]) + tm.assert_series_equal(result, expected) + + def test_sum_timedelta64_skipna_false(): # GH#17235 arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2) @@ -1352,6 +1463,6 @@ def test_minmax_extensionarray(method, numeric_only): df = DataFrame({"Int64": ser}) result = getattr(df, method)(numeric_only=numeric_only) expected = Series( - [getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object") + [getattr(int64_info, method)], index=Index(["Int64"], dtype="object") ) tm.assert_series_equal(result, expected)