Skip to content

Commit 2bf618f

Browse files
REGR: Fixed AssertionError in groupby (#31616)
* REGR: Fixed AssertionError in groupby. Closes #31522
1 parent d73ded0 commit 2bf618f

File tree

3 files changed

+86
-22
lines changed

3 files changed

+86
-22
lines changed

doc/source/whatsnew/v1.0.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Fixed regressions
1919
- Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containing a :class:`datetime.date` (:issue:`31501`)
2020
- Fixed regression in ``DataFrame.__setitem__`` raising an ``AttributeError`` with a :class:`MultiIndex` and a non-monotonic indexer (:issue:`31449`)
2121
- Fixed regression in :class:`Series` multiplication when multiplying a numeric :class:`Series` with >10000 elements with a timedelta-like scalar (:issue:`31457`)
22+
- Fixed regression in ``.groupby().agg()`` raising an ``AssertionError`` for some reductions like ``min`` on object-dtype columns (:issue:`31522`)
2223
- Fixed regression in ``.groupby()`` aggregations with categorical dtype using Cythonized reduction functions (e.g. ``first``) (:issue:`31450`)
2324
- Fixed regression in :meth:`GroupBy.apply` if called with a function which returned a non-pandas non-scalar object (e.g. a list or numpy array) (:issue:`31441`)
2425
- Fixed regression in :meth:`DataFrame.groupby` whereby taking the minimum or maximum of a column with period dtype would raise a ``TypeError``. (:issue:`31471`)

pandas/core/groupby/generic.py

+42-22
Original file line numberDiff line numberDiff line change
@@ -1022,6 +1022,10 @@ def _cython_agg_blocks(
10221022
agg_blocks: List[Block] = []
10231023
new_items: List[np.ndarray] = []
10241024
deleted_items: List[np.ndarray] = []
1025+
# Some object-dtype blocks might be split into List[Block[T], Block[U]]
1026+
split_items: List[np.ndarray] = []
1027+
split_frames: List[DataFrame] = []
1028+
10251029
no_result = object()
10261030
for block in data.blocks:
10271031
# Avoid inheriting result from earlier in the loop
@@ -1061,40 +1065,56 @@ def _cython_agg_blocks(
10611065
else:
10621066
result = cast(DataFrame, result)
10631067
# unwrap DataFrame to get array
1068+
if len(result._data.blocks) != 1:
1069+
# We've split an object block! Everything we've assumed
1070+
# about a single block input returning a single block output
1071+
# is a lie. To keep the code-path for the typical non-split case
1072+
# clean, we choose to clean up this mess later on.
1073+
split_items.append(locs)
1074+
split_frames.append(result)
1075+
continue
1076+
10641077
assert len(result._data.blocks) == 1
10651078
result = result._data.blocks[0].values
10661079
if isinstance(result, np.ndarray) and result.ndim == 1:
10671080
result = result.reshape(1, -1)
10681081

1069-
finally:
1070-
assert not isinstance(result, DataFrame)
1071-
1072-
if result is not no_result:
1073-
# see if we can cast the block back to the original dtype
1074-
result = maybe_downcast_numeric(result, block.dtype)
1075-
1076-
if block.is_extension and isinstance(result, np.ndarray):
1077-
# e.g. block.values was an IntegerArray
1078-
# (1, N) case can occur if block.values was Categorical
1079-
# and result is ndarray[object]
1080-
assert result.ndim == 1 or result.shape[0] == 1
1081-
try:
1082-
# Cast back if feasible
1083-
result = type(block.values)._from_sequence(
1084-
result.ravel(), dtype=block.values.dtype
1085-
)
1086-
except ValueError:
1087-
# reshape to be valid for non-Extension Block
1088-
result = result.reshape(1, -1)
1082+
assert not isinstance(result, DataFrame)
1083+
1084+
if result is not no_result:
1085+
# see if we can cast the block back to the original dtype
1086+
result = maybe_downcast_numeric(result, block.dtype)
1087+
1088+
if block.is_extension and isinstance(result, np.ndarray):
1089+
# e.g. block.values was an IntegerArray
1090+
# (1, N) case can occur if block.values was Categorical
1091+
# and result is ndarray[object]
1092+
assert result.ndim == 1 or result.shape[0] == 1
1093+
try:
1094+
# Cast back if feasible
1095+
result = type(block.values)._from_sequence(
1096+
result.ravel(), dtype=block.values.dtype
1097+
)
1098+
except ValueError:
1099+
# reshape to be valid for non-Extension Block
1100+
result = result.reshape(1, -1)
10891101

1090-
agg_block: Block = block.make_block(result)
1102+
agg_block: Block = block.make_block(result)
10911103

10921104
new_items.append(locs)
10931105
agg_blocks.append(agg_block)
10941106

1095-
if not agg_blocks:
1107+
if not (agg_blocks or split_frames):
10961108
raise DataError("No numeric types to aggregate")
10971109

1110+
if split_items:
1111+
# Clean up the mess left over from split blocks.
1112+
for locs, result in zip(split_items, split_frames):
1113+
assert len(locs) == result.shape[1]
1114+
for i, loc in enumerate(locs):
1115+
new_items.append(np.array([loc], dtype=locs.dtype))
1116+
agg_blocks.append(result.iloc[:, [i]]._data.blocks[0])
1117+
10981118
# reset the locs in the blocks to correspond to our
10991119
# current ordering
11001120
indexer = np.concatenate(new_items)

pandas/tests/groupby/aggregate/test_aggregate.py

+43
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,49 @@ def test_agg_index_has_complex_internals(index):
377377
tm.assert_frame_equal(result, expected)
378378

379379

380+
def test_agg_split_block():
381+
# https://github.com/pandas-dev/pandas/issues/31522
382+
df = pd.DataFrame(
383+
{
384+
"key1": ["a", "a", "b", "b", "a"],
385+
"key2": ["one", "two", "one", "two", "one"],
386+
"key3": ["three", "three", "three", "six", "six"],
387+
}
388+
)
389+
result = df.groupby("key1").min()
390+
expected = pd.DataFrame(
391+
{"key2": ["one", "one"], "key3": ["six", "six"]},
392+
index=pd.Index(["a", "b"], name="key1"),
393+
)
394+
tm.assert_frame_equal(result, expected)
395+
396+
397+
def test_agg_split_object_part_datetime():
398+
# https://github.com/pandas-dev/pandas/pull/31616
399+
df = pd.DataFrame(
400+
{
401+
"A": pd.date_range("2000", periods=4),
402+
"B": ["a", "b", "c", "d"],
403+
"C": [1, 2, 3, 4],
404+
"D": ["b", "c", "d", "e"],
405+
"E": pd.date_range("2000", periods=4),
406+
"F": [1, 2, 3, 4],
407+
}
408+
).astype(object)
409+
result = df.groupby([0, 0, 0, 0]).min()
410+
expected = pd.DataFrame(
411+
{
412+
"A": [pd.Timestamp("2000")],
413+
"B": ["a"],
414+
"C": [1],
415+
"D": ["b"],
416+
"E": [pd.Timestamp("2000")],
417+
"F": [1],
418+
}
419+
)
420+
tm.assert_frame_equal(result, expected)
421+
422+
380423
def test_agg_cython_category_not_implemented_fallback():
381424
# https://github.com/pandas-dev/pandas/issues/31450
382425
df = pd.DataFrame({"col_num": [1, 1, 2, 3]})

0 commit comments

Comments
 (0)