BUG: DataFrameGroupBy with numeric_only and empty non-numeric data (#41706)

jbrockmendel · web-flow · commit 6b94e2431536 · 2021-06-02T11:14:08.000-04:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -1061,6 +1061,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`)
 - Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`)
 - Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. ``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`)
+- Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`)
 - Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`)
 - Bug in :meth:`DataFrameGroupBy.transform` and :meth:`DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`)
 
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -516,7 +516,7 @@ def group_add(add_t[:, ::1] out,
                 val = values[i, j]
 
                 # not nan
-                if val == val:
+                if not checknull(val):
                     nobs[lab, j] += 1
 
                     if nobs[lab, j] == 1:
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -67,10 +67,7 @@
     validate_func_kwargs,
 )
 from pandas.core.apply import GroupByApply
-from pandas.core.base import (
-    DataError,
-    SpecificationError,
-)
+from pandas.core.base import SpecificationError
 import pandas.core.common as com
 from pandas.core.construction import create_series_with_explicit_dtype
 from pandas.core.frame import DataFrame
@@ -516,16 +513,12 @@ def _cython_transform(
 
         obj = self._selected_obj
 
-        is_numeric = is_numeric_dtype(obj.dtype)
-        if numeric_only and not is_numeric:
-            raise DataError("No numeric types to aggregate")
-
         try:
             result = self.grouper._cython_operation(
                 "transform", obj._values, how, axis, **kwargs
             )
-        except (NotImplementedError, TypeError):
-            raise DataError("No numeric types to aggregate")
+        except NotImplementedError as err:
+            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err
 
         return obj._constructor(result, index=self.obj.index, name=obj.name)
 
@@ -1064,7 +1057,6 @@ def _cython_agg_general(
         # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy
 
         data: Manager2D = self._get_data_to_aggregate()
-        orig = data
 
         if numeric_only:
             data = data.get_numeric_data(copy=False)
@@ -1087,9 +1079,6 @@ def array_func(values: ArrayLike) -> ArrayLike:
         #  continue and exclude the block
         new_mgr = data.grouped_reduce(array_func, ignore_failures=True)
 
-        if not len(new_mgr) and len(orig):
-            # If the original Manager was already empty, no need to raise
-            raise DataError("No numeric types to aggregate")
         if len(new_mgr) < len(data):
             warnings.warn(
                 f"Dropping invalid columns in {type(self).__name__}.{how} "
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1339,20 +1339,12 @@ def _agg_general(
 
         with group_selection_context(self):
             # try a cython aggregation if we can
-            result = None
-            try:
-                result = self._cython_agg_general(
-                    how=alias,
-                    alt=npfunc,
-                    numeric_only=numeric_only,
-                    min_count=min_count,
-                )
-            except DataError:
-                pass
-
-            # apply a non-cython aggregation
-            if result is None:
-                result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
+            result = self._cython_agg_general(
+                how=alias,
+                alt=npfunc,
+                numeric_only=numeric_only,
+                min_count=min_count,
+            )
             return result.__finalize__(self.obj, method="groupby")
 
     def _agg_py_fallback(
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -128,8 +128,9 @@ def test_groupby_aggregation_multi_level_column():
         columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]),
     )
 
-    result = df.groupby(level=1, axis=1).sum()
-    expected = DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]})
+    gb = df.groupby(level=1, axis=1)
+    result = gb.sum(numeric_only=False)
+    expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]})
 
     tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
@@ -18,7 +18,6 @@
     bdate_range,
 )
 import pandas._testing as tm
-from pandas.core.groupby.groupby import DataError
 
 
 @pytest.mark.parametrize(
@@ -98,9 +97,9 @@ def test_cython_agg_nothing_to_agg():
 
     frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
 
-    msg = "No numeric types to aggregate"
-    with pytest.raises(DataError, match=msg):
-        frame[["b"]].groupby(frame["a"]).mean()
+    result = frame[["b"]].groupby(frame["a"]).mean()
+    expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
+    tm.assert_frame_equal(result, expected)
 
 
 def test_cython_agg_nothing_to_agg_with_dates():
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
@@ -433,15 +433,22 @@ def test_agg_over_numpy_arrays():
         ],
         columns=["category", "arraydata"],
     )
-    result = df.groupby("category").agg(sum)
+    gb = df.groupby("category")
 
     expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
     expected_index = Index([1, 2], name="category")
     expected_column = ["arraydata"]
     expected = DataFrame(expected_data, index=expected_index, columns=expected_column)
 
+    alt = gb.sum(numeric_only=False)
+    tm.assert_frame_equal(alt, expected)
+
+    result = gb.agg("sum", numeric_only=False)
     tm.assert_frame_equal(result, expected)
 
+    # FIXME: the original version of this test called `gb.agg(sum)`
+    #  and that raises TypeError if `numeric_only=False` is passed
+
 
 @pytest.mark.parametrize("as_period", [True, False])
 def test_agg_tzaware_non_datetime_result(as_period):
@@ -524,9 +531,14 @@ def test_sum_uint64_overflow():
     )
 
     expected.index.name = 0
-    result = df.groupby(0).sum()
+    result = df.groupby(0).sum(numeric_only=False)
     tm.assert_frame_equal(result, expected)
 
+    # out column is non-numeric, so with numeric_only=True it is dropped
+    result2 = df.groupby(0).sum(numeric_only=True)
+    expected2 = expected[[]]
+    tm.assert_frame_equal(result2, expected2)
+
 
 @pytest.mark.parametrize(
     "structure, expected",
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -638,7 +638,7 @@ def test_as_index_select_column():
 def test_groupby_as_index_select_column_sum_empty_df():
     # GH 35246
     df = DataFrame(columns=["A", "B", "C"])
-    left = df.groupby(by="A", as_index=False)["B"].sum()
+    left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False)
     assert type(left) is DataFrame
     assert left.to_dict() == {"A": {}, "B": {}}
 
@@ -1861,6 +1861,49 @@ def get_result():
                         get_result()
 
                     return
+    else:
+        # i.e. DataFrameGroupBy
+        if op in ["prod", "sum"]:
+            # ops that require more than just ordered-ness
+            if method != "apply":
+                # FIXME: apply goes through different code path
+                if df.dtypes[0].kind == "M":
+                    # GH#41291
+                    # datetime64 -> prod and sum are invalid
+                    result = get_result()
+
+                    # with numeric_only=True, these are dropped, and we get
+                    # an empty DataFrame back
+                    expected = df.set_index(keys)[[]]
+                    tm.assert_equal(result, expected)
+                    return
+
+                elif isinstance(values, Categorical):
+                    # GH#41291
+                    # Categorical doesn't implement sum or prod
+                    result = get_result()
+
+                    # with numeric_only=True, these are dropped, and we get
+                    # an empty DataFrame back
+                    expected = df.set_index(keys)[[]]
+                    if len(keys) != 1 and op == "prod":
+                        # TODO: why just prod and not sum?
+                        # Categorical is special without 'observed=True'
+                        lev = Categorical([0], dtype=values.dtype)
+                        mi = MultiIndex.from_product([lev, lev], names=["A", "B"])
+                        expected = DataFrame([], columns=[], index=mi)
+
+                    tm.assert_equal(result, expected)
+                    return
+
+                elif df.dtypes[0] == object:
+                    # FIXME: the test is actually wrong here, xref #41341
+                    result = get_result()
+                    # In this case we have list-of-list, will raise TypeError,
+                    # and subsequently be dropped as nuisance columns
+                    expected = df.set_index(keys)[[]]
+                    tm.assert_equal(result, expected)
+                    return
 
     result = get_result()
     expected = df.set_index(keys)[columns]
@@ -2313,12 +2356,17 @@ def test_groupby_all_nan_groups_drop():
 
 def test_groupby_empty_multi_column():
     # GH 15106
-    result = DataFrame(data=[], columns=["A", "B", "C"]).groupby(["A", "B"]).sum()
+    df = DataFrame(data=[], columns=["A", "B", "C"])
+    gb = df.groupby(["A", "B"])
+    result = gb.sum(numeric_only=False)
     expected = DataFrame(
         [], columns=["C"], index=MultiIndex([[], []], [[], []], names=["A", "B"])
     )
     tm.assert_frame_equal(result, expected)
 
+    result = gb.sum(numeric_only=True)
+    tm.assert_frame_equal(result, expected[[]])
+
 
 def test_groupby_filtered_df_std():
     # GH 16174
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
@@ -24,7 +24,6 @@
     DataFrameGroupBy,
     SeriesGroupBy,
 )
-from pandas.core.groupby.groupby import DataError
 
 
 def assert_fp_equal(a, b):
@@ -741,11 +740,21 @@ def test_cython_transform_frame(op, args, targop):
             tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1))
             # individual columns
             for c in df:
-                if c not in ["float", "int", "float_missing"] and op != "shift":
-                    msg = "No numeric types to aggregate"
-                    with pytest.raises(DataError, match=msg):
+                if (
+                    c not in ["float", "int", "float_missing"]
+                    and op != "shift"
+                    and not (c == "timedelta" and op == "cumsum")
+                ):
+                    msg = "|".join(
+                        [
+                            "does not support .* operations",
+                            ".* is not supported for object dtype",
+                            "is not implemented for this dtype",
+                        ]
+                    )
+                    with pytest.raises(TypeError, match=msg):
                         gb[c].transform(op)
-                    with pytest.raises(DataError, match=msg):
+                    with pytest.raises(TypeError, match=msg):
                         getattr(gb[c], op)()
                 else:
                     expected = gb[c].apply(targop)

Original file line number	Diff line number	Diff line change
`@@ -128,8 +128,9 @@ def test_groupby_aggregation_multi_level_column():`
`128`	`128`	`columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]),`
`129`	`129`	`)`
`130`	`130`
`131`		`- result = df.groupby(level=1, axis=1).sum()`
`132`		`- expected = DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]})`
	`131`	`+ gb = df.groupby(level=1, axis=1)`
	`132`	`+ result = gb.sum(numeric_only=False)`
	`133`	`+ expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]})`
`133`	`134`
`134`	`135`	`tm.assert_frame_equal(result, expected)`
`135`	`136`