Skip to content

Commit 0aa7598

Browse files
jbrockmendel and JulianWgs
authored and committed
BUG: DataFrameGroupBy agg with multi-column object block (pandas-dev#41111)
1 parent 2e2a0d8 commit 0aa7598

File tree

4 files changed

+66
-33
lines changed

4 files changed

+66
-33
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -851,6 +851,7 @@ Groupby/resample/rolling
851851
- Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`)
852852
- Bug in :meth:`DataFrame.rolling` returning mean zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerically stable (:issue:`41053`)
853853
- Bug in :meth:`DataFrame.rolling` returning sum not zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerically stable (:issue:`41053`)
854+
- Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:`41111`)
854855

855856
Reshaping
856857
^^^^^^^^^

pandas/core/groupby/generic.py

+15-26
Original file line numberDiff line numberDiff line change
@@ -1088,9 +1088,7 @@ def _cython_agg_general(
10881088

10891089
using_array_manager = isinstance(data, ArrayManager)
10901090

1091-
def cast_agg_result(
1092-
result: ArrayLike, values: ArrayLike, how: str
1093-
) -> ArrayLike:
1091+
def cast_agg_result(result: ArrayLike, values: ArrayLike) -> ArrayLike:
10941092
# see if we can cast the values to the desired dtype
10951093
# this may not be the original dtype
10961094

@@ -1102,7 +1100,7 @@ def cast_agg_result(
11021100

11031101
elif (
11041102
not using_array_manager
1105-
and isinstance(result, np.ndarray)
1103+
and isinstance(result.dtype, np.dtype)
11061104
and result.ndim == 1
11071105
):
11081106
# We went through a SeriesGroupByPath and need to reshape
@@ -1129,34 +1127,26 @@ def py_fallback(values: ArrayLike) -> ArrayLike:
11291127
else:
11301128
# We only get here with values.dtype == object
11311129
# TODO special case not needed with ArrayManager
1132-
obj = DataFrame(values.T)
1133-
if obj.shape[1] == 1:
1134-
# Avoid call to self.values that can occur in DataFrame
1135-
# reductions; see GH#28949
1136-
obj = obj.iloc[:, 0]
1130+
df = DataFrame(values.T)
1131+
# bc we split object blocks in grouped_reduce, we have only 1 col
1132+
# otherwise we'd have to worry about block-splitting GH#39329
1133+
assert df.shape[1] == 1
1134+
# Avoid call to self.values that can occur in DataFrame
1135+
# reductions; see GH#28949
1136+
obj = df.iloc[:, 0]
11371137

11381138
# Create SeriesGroupBy with observed=True so that it does
11391139
# not try to add missing categories if grouping over multiple
11401140
# Categoricals. This will done by later self._reindex_output()
11411141
# Doing it here creates an error. See GH#34951
11421142
sgb = get_groupby(obj, self.grouper, observed=True)
1143-
result = sgb.aggregate(lambda x: alt(x, axis=self.axis))
1143+
# Note: bc obj is always a Series here, we can ignore axis and pass
1144+
# `alt` directly instead of `lambda x: alt(x, axis=self.axis)`
1145+
res_ser = sgb.aggregate(alt) # this will go through sgb._python_agg_general
11441146

1145-
# In the case of object dtype block, it may have been split
1146-
# in the operation. We un-split here.
1147-
result = result._consolidate()
1148-
# unwrap DataFrame/Series to get array
1149-
mgr = result._mgr
1150-
arrays = mgr.arrays
1151-
if len(arrays) != 1:
1152-
# We've split an object block! Everything we've assumed
1153-
# about a single block input returning a single block output
1154-
# is a lie. See eg GH-39329
1155-
return mgr.as_array()
1156-
else:
1157-
# We are a single block from a BlockManager
1158-
# or one array from SingleArrayManager
1159-
return arrays[0]
1147+
# unwrap Series to get array
1148+
res_values = res_ser._mgr.arrays[0]
1149+
return cast_agg_result(res_values, values)
11601150

11611151
def array_func(values: ArrayLike) -> ArrayLike:
11621152

@@ -1170,7 +1160,6 @@ def array_func(values: ArrayLike) -> ArrayLike:
11701160
# try to python agg
11711161
result = py_fallback(values)
11721162

1173-
return cast_agg_result(result, values, how)
11741163
return result
11751164

11761165
# TypeError -> we may have an exception in trying to aggregate

pandas/core/internals/managers.py

+19-7
Original file line numberDiff line numberDiff line change
@@ -301,13 +301,25 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
301301
result_blocks: list[Block] = []
302302

303303
for blk in self.blocks:
304-
try:
305-
applied = blk.apply(func)
306-
except (TypeError, NotImplementedError):
307-
if not ignore_failures:
308-
raise
309-
continue
310-
result_blocks = extend_blocks(applied, result_blocks)
304+
if blk.is_object:
305+
# split on object-dtype blocks bc some columns may raise
306+
# while others do not.
307+
for sb in blk._split():
308+
try:
309+
applied = sb.apply(func)
310+
except (TypeError, NotImplementedError):
311+
if not ignore_failures:
312+
raise
313+
continue
314+
result_blocks = extend_blocks(applied, result_blocks)
315+
else:
316+
try:
317+
applied = blk.apply(func)
318+
except (TypeError, NotImplementedError):
319+
if not ignore_failures:
320+
raise
321+
continue
322+
result_blocks = extend_blocks(applied, result_blocks)
311323

312324
if len(result_blocks) == 0:
313325
index = Index([None]) # placeholder

pandas/tests/groupby/test_function.py

+31
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,37 @@ def test_max_min_non_numeric():
6969
assert "ss" in result
7070

7171

72+
def test_max_min_object_multiple_columns(using_array_manager):
73+
# GH#41111 case where the aggregation is valid for some columns but not
74+
# others; we split object blocks column-wise, consistent with
75+
# DataFrame._reduce
76+
77+
df = DataFrame(
78+
{
79+
"A": [1, 1, 2, 2, 3],
80+
"B": [1, "foo", 2, "bar", False],
81+
"C": ["a", "b", "c", "d", "e"],
82+
}
83+
)
84+
df._consolidate_inplace() # should already be consolidated, but double-check
85+
if not using_array_manager:
86+
assert len(df._mgr.blocks) == 2
87+
88+
gb = df.groupby("A")
89+
90+
result = gb.max(numeric_only=False)
91+
# "max" is valid for column "C" but not for "B"
92+
ei = Index([1, 2, 3], name="A")
93+
expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
94+
tm.assert_frame_equal(result, expected)
95+
96+
result = gb.min(numeric_only=False)
97+
# "min" is valid for column "C" but not for "B"
98+
ei = Index([1, 2, 3], name="A")
99+
expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
100+
tm.assert_frame_equal(result, expected)
101+
102+
72103
def test_min_date_with_nans():
73104
# GH26321
74105
dates = pd.to_datetime(

0 commit comments

Comments
 (0)