Skip to content

Commit 0aa7598

Browse files
jbrockmendel and JulianWgs
authored and committed
BUG: DataFrameGroupBy agg with multi-column object block (pandas-dev#41111)
1 parent 2e2a0d8 commit 0aa7598

File tree

4 files changed

+66
-33
lines changed

4 files changed

+66
-33
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -851,6 +851,7 @@ Groupby/resample/rolling
851851
- Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` computing wrong result with nullable data types too large to roundtrip when casting to float (:issue:`37493`)
852852
- Bug in :meth:`DataFrame.rolling` returning mean zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerically stable (:issue:`41053`)
853853
- Bug in :meth:`DataFrame.rolling` returning sum not zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerically stable (:issue:`41053`)
854+
- Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:`41111`)
854855

855856
Reshaping
856857
^^^^^^^^^

pandas/core/groupby/generic.py

+15-26
Original file line numberDiff line numberDiff line change
@@ -1088,9 +1088,7 @@ def _cython_agg_general(
10881088

10891089
using_array_manager = isinstance(data, ArrayManager)
10901090

1091-
def cast_agg_result(
1092-
result: ArrayLike, values: ArrayLike, how: str
1093-
) -> ArrayLike:
1091+
def cast_agg_result(result: ArrayLike, values: ArrayLike) -> ArrayLike:
10941092
# see if we can cast the values to the desired dtype
10951093
# this may not be the original dtype
10961094

@@ -1102,7 +1100,7 @@ def cast_agg_result(
11021100

11031101
elif (
11041102
not using_array_manager
1105-
and isinstance(result, np.ndarray)
1103+
and isinstance(result.dtype, np.dtype)
11061104
and result.ndim == 1
11071105
):
11081106
# We went through a SeriesGroupByPath and need to reshape
@@ -1129,34 +1127,26 @@ def py_fallback(values: ArrayLike) -> ArrayLike:
11291127
else:
11301128
# We only get here with values.dtype == object
11311129
# TODO special case not needed with ArrayManager
1132-
obj = DataFrame(values.T)
1133-
if obj.shape[1] == 1:
1134-
# Avoid call to self.values that can occur in DataFrame
1135-
# reductions; see GH#28949
1136-
obj = obj.iloc[:, 0]
1130+
df = DataFrame(values.T)
1131+
# bc we split object blocks in grouped_reduce, we have only 1 col
1132+
# otherwise we'd have to worry about block-splitting GH#39329
1133+
assert df.shape[1] == 1
1134+
# Avoid call to self.values that can occur in DataFrame
1135+
# reductions; see GH#28949
1136+
obj = df.iloc[:, 0]
11371137

11381138
# Create SeriesGroupBy with observed=True so that it does
11391139
# not try to add missing categories if grouping over multiple
11401140
# Categoricals. This will done by later self._reindex_output()
11411141
# Doing it here creates an error. See GH#34951
11421142
sgb = get_groupby(obj, self.grouper, observed=True)
1143-
result = sgb.aggregate(lambda x: alt(x, axis=self.axis))
1143+
# Note: bc obj is always a Series here, we can ignore axis and pass
1144+
# `alt` directly instead of `lambda x: alt(x, axis=self.axis)`
1145+
res_ser = sgb.aggregate(alt) # this will go through sgb._python_agg_general
11441146

1145-
# In the case of object dtype block, it may have been split
1146-
# in the operation. We un-split here.
1147-
result = result._consolidate()
1148-
# unwrap DataFrame/Series to get array
1149-
mgr = result._mgr
1150-
arrays = mgr.arrays
1151-
if len(arrays) != 1:
1152-
# We've split an object block! Everything we've assumed
1153-
# about a single block input returning a single block output
1154-
# is a lie. See eg GH-39329
1155-
return mgr.as_array()
1156-
else:
1157-
# We are a single block from a BlockManager
1158-
# or one array from SingleArrayManager
1159-
return arrays[0]
1147+
# unwrap Series to get array
1148+
res_values = res_ser._mgr.arrays[0]
1149+
return cast_agg_result(res_values, values)
11601150

11611151
def array_func(values: ArrayLike) -> ArrayLike:
11621152

@@ -1170,7 +1160,6 @@ def array_func(values: ArrayLike) -> ArrayLike:
11701160
# try to python agg
11711161
result = py_fallback(values)
11721162

1173-
return cast_agg_result(result, values, how)
11741163
return result
11751164

11761165
# TypeError -> we may have an exception in trying to aggregate

pandas/core/internals/managers.py

+19-7
Original file line numberDiff line numberDiff line change
@@ -301,13 +301,25 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T:
301301
result_blocks: list[Block] = []
302302

303303
for blk in self.blocks:
304-
try:
305-
applied = blk.apply(func)
306-
except (TypeError, NotImplementedError):
307-
if not ignore_failures:
308-
raise
309-
continue
310-
result_blocks = extend_blocks(applied, result_blocks)
304+
if blk.is_object:
305+
# split on object-dtype blocks bc some columns may raise
306+
# while others do not.
307+
for sb in blk._split():
308+
try:
309+
applied = sb.apply(func)
310+
except (TypeError, NotImplementedError):
311+
if not ignore_failures:
312+
raise
313+
continue
314+
result_blocks = extend_blocks(applied, result_blocks)
315+
else:
316+
try:
317+
applied = blk.apply(func)
318+
except (TypeError, NotImplementedError):
319+
if not ignore_failures:
320+
raise
321+
continue
322+
result_blocks = extend_blocks(applied, result_blocks)
311323

312324
if len(result_blocks) == 0:
313325
index = Index([None]) # placeholder

pandas/tests/groupby/test_function.py

+31
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,37 @@ def test_max_min_non_numeric():
6969
assert "ss" in result
7070

7171

72+
def test_max_min_object_multiple_columns(using_array_manager):
73+
# GH#41111 case where the aggregation is valid for some columns but not
74+
# others; we split object blocks column-wise, consistent with
75+
# DataFrame._reduce
76+
77+
df = DataFrame(
78+
{
79+
"A": [1, 1, 2, 2, 3],
80+
"B": [1, "foo", 2, "bar", False],
81+
"C": ["a", "b", "c", "d", "e"],
82+
}
83+
)
84+
df._consolidate_inplace() # should already be consolidated, but double-check
85+
if not using_array_manager:
86+
assert len(df._mgr.blocks) == 2
87+
88+
gb = df.groupby("A")
89+
90+
result = gb.max(numeric_only=False)
91+
# "max" is valid for column "C" but not for "B"
92+
ei = Index([1, 2, 3], name="A")
93+
expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
94+
tm.assert_frame_equal(result, expected)
95+
96+
result = gb.min(numeric_only=False)
97+
# "min" is valid for column "C" but not for "B"
98+
ei = Index([1, 2, 3], name="A")
99+
expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
100+
tm.assert_frame_equal(result, expected)
101+
102+
72103
def test_min_date_with_nans():
73104
# GH26321
74105
dates = pd.to_datetime(

0 commit comments

Comments
 (0)