Commit 32990d5

Backport PR #31616: REGR: Fixed AssertionError in groupby (#31703)
Co-authored-by: Tom Augspurger <[email protected]>
1 parent (508d7e1) · commit 32990d5

File tree: 3 files changed (+86, -22 lines)

doc/source/whatsnew/v1.0.1.rst (+1 line)

@@ -19,6 +19,7 @@ Fixed regressions
 - Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containing a :class:`datetime.date` (:issue:`31501`)
 - Fixed regression in ``DataFrame.__setitem__`` raising an ``AttributeError`` with a :class:`MultiIndex` and a non-monotonic indexer (:issue:`31449`)
 - Fixed regression in :class:`Series` multiplication when multiplying a numeric :class:`Series` with >10000 elements with a timedelta-like scalar (:issue:`31457`)
+- Fixed regression in ``.groupby().agg()`` raising an ``AssertionError`` for some reductions like ``min`` on object-dtype columns (:issue:`31522`)
 - Fixed regression in ``.groupby()`` aggregations with categorical dtype using Cythonized reduction functions (e.g. ``first``) (:issue:`31450`)
 - Fixed regression in :meth:`GroupBy.apply` if called with a function which returned a non-pandas non-scalar object (e.g. a list or numpy array) (:issue:`31441`)
 - Fixed regression in :meth:`DataFrame.groupby` whereby taking the minimum or maximum of a column with period dtype would raise a ``TypeError``. (:issue:`31471`)
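
For reference, the regression described in the new whatsnew entry can be exercised with a plain object-dtype reduction. The snippet below is a minimal sketch mirroring ``test_agg_split_block`` added further down in this commit; on pandas 1.0.0 it raised an ``AssertionError``, and with this fix it returns the per-group minima.

```python
import pandas as pd

# Frame taken from test_agg_split_block (GH 31522): several object-dtype
# columns whose groupwise ``min`` previously tripped an AssertionError in
# DataFrameGroupBy._cython_agg_blocks.
df = pd.DataFrame(
    {
        "key1": ["a", "a", "b", "b", "a"],
        "key2": ["one", "two", "one", "two", "one"],
        "key3": ["three", "three", "three", "six", "six"],
    }
)

result = df.groupby("key1").min()
print(result)
#      key2 key3
# key1
# a     one  six
# b     one  six
```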

pandas/core/groupby/generic.py (+42, -22 lines)
@@ -1019,6 +1019,10 @@ def _cython_agg_blocks(
         agg_blocks: List[Block] = []
         new_items: List[np.ndarray] = []
         deleted_items: List[np.ndarray] = []
+        # Some object-dtype blocks might be split into List[Block[T], Block[U]]
+        split_items: List[np.ndarray] = []
+        split_frames: List[DataFrame] = []
+
         no_result = object()
         for block in data.blocks:
             # Avoid inheriting result from earlier in the loop
@@ -1058,40 +1062,56 @@
                 else:
                     result = cast(DataFrame, result)
                     # unwrap DataFrame to get array
+                    if len(result._data.blocks) != 1:
+                        # We've split an object block! Everything we've assumed
+                        # about a single block input returning a single block output
+                        # is a lie. To keep the code-path for the typical non-split case
+                        # clean, we choose to clean up this mess later on.
+                        split_items.append(locs)
+                        split_frames.append(result)
+                        continue
+
                     assert len(result._data.blocks) == 1
                     result = result._data.blocks[0].values
                     if isinstance(result, np.ndarray) and result.ndim == 1:
                         result = result.reshape(1, -1)

-            finally:
-                assert not isinstance(result, DataFrame)
-
-                if result is not no_result:
-                    # see if we can cast the block back to the original dtype
-                    result = maybe_downcast_numeric(result, block.dtype)
-
-                    if block.is_extension and isinstance(result, np.ndarray):
-                        # e.g. block.values was an IntegerArray
-                        # (1, N) case can occur if block.values was Categorical
-                        # and result is ndarray[object]
-                        assert result.ndim == 1 or result.shape[0] == 1
-                        try:
-                            # Cast back if feasible
-                            result = type(block.values)._from_sequence(
-                                result.ravel(), dtype=block.values.dtype
-                            )
-                        except ValueError:
-                            # reshape to be valid for non-Extension Block
-                            result = result.reshape(1, -1)
+            assert not isinstance(result, DataFrame)
+
+            if result is not no_result:
+                # see if we can cast the block back to the original dtype
+                result = maybe_downcast_numeric(result, block.dtype)
+
+                if block.is_extension and isinstance(result, np.ndarray):
+                    # e.g. block.values was an IntegerArray
+                    # (1, N) case can occur if block.values was Categorical
+                    # and result is ndarray[object]
+                    assert result.ndim == 1 or result.shape[0] == 1
+                    try:
+                        # Cast back if feasible
+                        result = type(block.values)._from_sequence(
+                            result.ravel(), dtype=block.values.dtype
+                        )
+                    except ValueError:
+                        # reshape to be valid for non-Extension Block
+                        result = result.reshape(1, -1)

-                    agg_block: Block = block.make_block(result)
+                agg_block: Block = block.make_block(result)

             new_items.append(locs)
             agg_blocks.append(agg_block)

-        if not agg_blocks:
+        if not (agg_blocks or split_frames):
             raise DataError("No numeric types to aggregate")

+        if split_items:
+            # Clean up the mess left over from split blocks.
+            for locs, result in zip(split_items, split_frames):
+                assert len(locs) == result.shape[1]
+                for i, loc in enumerate(locs):
+                    new_items.append(np.array([loc], dtype=locs.dtype))
+                    agg_blocks.append(result.iloc[:, [i]]._data.blocks[0])
+
         # reset the locs in the blocks to correspond to our
         # current ordering
         indexer = np.concatenate(new_items)
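
The comment on the new ``split_items`` / ``split_frames`` lists above points at the root cause: when the object-dtype fallback path aggregates a block whose columns reduce to different dtypes, the intermediate ``DataFrame`` is no longer backed by a single internal block, so the old ``assert len(result._data.blocks) == 1`` failed. A rough illustration of such a split, reusing the kind of frame from ``test_agg_split_object_part_datetime`` (the dtype summary in the trailing comment is based on that test's expected output):

```python
import pandas as pd

# An all-object frame whose columns reduce to different dtypes under ``min``.
# The cython reduction is not implemented for object dtype, so the python
# fallback aggregates the whole block and hands back a DataFrame backed by
# several internal blocks. With this commit, that case is parked in
# split_items/split_frames and re-attached column by column afterwards
# instead of failing the single-block assertion.
df = pd.DataFrame(
    {
        "A": pd.date_range("2000", periods=4),
        "B": ["a", "b", "c", "d"],
        "C": [1, 2, 3, 4],
    }
).astype(object)

result = df.groupby([0, 0, 0, 0]).min()
print(result.dtypes)  # A: datetime64[ns], B: object, C: int64
```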

pandas/tests/groupby/aggregate/test_aggregate.py (+43 lines)
@@ -378,6 +378,49 @@ def test_agg_index_has_complex_internals(index):
     tm.assert_frame_equal(result, expected)


+def test_agg_split_block():
+    # https://github.com/pandas-dev/pandas/issues/31522
+    df = pd.DataFrame(
+        {
+            "key1": ["a", "a", "b", "b", "a"],
+            "key2": ["one", "two", "one", "two", "one"],
+            "key3": ["three", "three", "three", "six", "six"],
+        }
+    )
+    result = df.groupby("key1").min()
+    expected = pd.DataFrame(
+        {"key2": ["one", "one"], "key3": ["six", "six"]},
+        index=pd.Index(["a", "b"], name="key1"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_split_object_part_datetime():
+    # https://github.com/pandas-dev/pandas/pull/31616
+    df = pd.DataFrame(
+        {
+            "A": pd.date_range("2000", periods=4),
+            "B": ["a", "b", "c", "d"],
+            "C": [1, 2, 3, 4],
+            "D": ["b", "c", "d", "e"],
+            "E": pd.date_range("2000", periods=4),
+            "F": [1, 2, 3, 4],
+        }
+    ).astype(object)
+    result = df.groupby([0, 0, 0, 0]).min()
+    expected = pd.DataFrame(
+        {
+            "A": [pd.Timestamp("2000")],
+            "B": ["a"],
+            "C": [1],
+            "D": ["b"],
+            "E": [pd.Timestamp("2000")],
+            "F": [1],
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 def test_agg_cython_category_not_implemented_fallback():
     # https://github.com/pandas-dev/pandas/issues/31450
     df = pd.DataFrame({"col_num": [1, 1, 2, 3]})
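
To verify the backport locally, the two tests added above can be selected by name. A small convenience snippet, assuming a pandas development checkout with pytest available (the ``-k`` pattern is only an illustration):

```python
# Run just the two new tests; equivalent to invoking pytest on the command
# line with the same arguments.
import pytest

pytest.main(
    ["pandas/tests/groupby/aggregate/test_aggregate.py", "-k", "agg_split", "-v"]
)
```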

0 commit comments
