diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst
index c82a58e5d3c45..041158e682bf9 100644
--- a/doc/source/whatsnew/v1.0.1.rst
+++ b/doc/source/whatsnew/v1.0.1.rst
@@ -19,6 +19,7 @@ Fixed regressions
 - Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containg a :class:`datetime.date` (:issue:`31501`)
 - Fixed regression in ``DataFrame.__setitem__`` raising an ``AttributeError`` with a :class:`MultiIndex` and a non-monotonic indexer (:issue:`31449`)
 - Fixed regression in :class:`Series` multiplication when multiplying a numeric :class:`Series` with >10000 elements with a timedelta-like scalar (:issue:`31457`)
+- Fixed regression in ``.groupby().agg()`` raising an ``AssertionError`` for some reductions like ``min`` on object-dtype columns (:issue:`31522`)
 - Fixed regression in ``.groupby()`` aggregations with categorical dtype using Cythonized reduction functions (e.g. ``first``) (:issue:`31450`)
 - Fixed regression in :meth:`GroupBy.apply` if called with a function which returned a non-pandas non-scalar object (e.g. a list or numpy array) (:issue:`31441`)
 - Fixed regression in :meth:`DataFrame.groupby` whereby taking the minimum or maximum of a column with period dtype would raise a ``TypeError``. (:issue:`31471`)
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 27dd6e953c219..f194c774cf329 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1022,6 +1022,10 @@ def _cython_agg_blocks(
         agg_blocks: List[Block] = []
         new_items: List[np.ndarray] = []
         deleted_items: List[np.ndarray] = []
+        # Some object-dtype blocks might be split into List[Block[T], Block[U]]
+        split_items: List[np.ndarray] = []
+        split_frames: List[DataFrame] = []
+
         no_result = object()
         for block in data.blocks:
             # Avoid inheriting result from earlier in the loop
@@ -1061,40 +1065,56 @@ def _cython_agg_blocks(
                 else:
                     result = cast(DataFrame, result)
                     # unwrap DataFrame to get array
+                    if len(result._data.blocks) != 1:
+                        # We've split an object block! Everything we've assumed
+                        # about a single block input returning a single block output
+                        # is a lie. To keep the code-path for the typical non-split case
+                        # clean, we choose to clean up this mess later on.
+                        split_items.append(locs)
+                        split_frames.append(result)
+                        continue
+
                     assert len(result._data.blocks) == 1
                     result = result._data.blocks[0].values
                     if isinstance(result, np.ndarray) and result.ndim == 1:
                         result = result.reshape(1, -1)
-            finally:
-                assert not isinstance(result, DataFrame)
-
-                if result is not no_result:
-                    # see if we can cast the block back to the original dtype
-                    result = maybe_downcast_numeric(result, block.dtype)
-
-                    if block.is_extension and isinstance(result, np.ndarray):
-                        # e.g. block.values was an IntegerArray
-                        # (1, N) case can occur if block.values was Categorical
-                        # and result is ndarray[object]
-                        assert result.ndim == 1 or result.shape[0] == 1
-                        try:
-                            # Cast back if feasible
-                            result = type(block.values)._from_sequence(
-                                result.ravel(), dtype=block.values.dtype
-                            )
-                        except ValueError:
-                            # reshape to be valid for non-Extension Block
-                            result = result.reshape(1, -1)
+            assert not isinstance(result, DataFrame)
+
+            if result is not no_result:
+                # see if we can cast the block back to the original dtype
+                result = maybe_downcast_numeric(result, block.dtype)
+
+                if block.is_extension and isinstance(result, np.ndarray):
+                    # e.g. block.values was an IntegerArray
+                    # (1, N) case can occur if block.values was Categorical
+                    # and result is ndarray[object]
+                    assert result.ndim == 1 or result.shape[0] == 1
+                    try:
+                        # Cast back if feasible
+                        result = type(block.values)._from_sequence(
+                            result.ravel(), dtype=block.values.dtype
+                        )
+                    except ValueError:
+                        # reshape to be valid for non-Extension Block
+                        result = result.reshape(1, -1)
 
-                    agg_block: Block = block.make_block(result)
+                agg_block: Block = block.make_block(result)
 
             new_items.append(locs)
             agg_blocks.append(agg_block)
 
-        if not agg_blocks:
+        if not (agg_blocks or split_frames):
             raise DataError("No numeric types to aggregate")
 
+        if split_items:
+            # Clean up the mess left over from split blocks.
+            for locs, result in zip(split_items, split_frames):
+                assert len(locs) == result.shape[1]
+                for i, loc in enumerate(locs):
+                    new_items.append(np.array([loc], dtype=locs.dtype))
+                    agg_blocks.append(result.iloc[:, [i]]._data.blocks[0])
+
         # reset the locs in the blocks to correspond to our
         # current ordering
         indexer = np.concatenate(new_items)
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 4eb073a28d580..ff99081521ffb 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -377,6 +377,49 @@ def test_agg_index_has_complex_internals(index):
     tm.assert_frame_equal(result, expected)
 
 
+def test_agg_split_block():
+    # https://github.com/pandas-dev/pandas/issues/31522
+    df = pd.DataFrame(
+        {
+            "key1": ["a", "a", "b", "b", "a"],
+            "key2": ["one", "two", "one", "two", "one"],
+            "key3": ["three", "three", "three", "six", "six"],
+        }
+    )
+    result = df.groupby("key1").min()
+    expected = pd.DataFrame(
+        {"key2": ["one", "one"], "key3": ["six", "six"]},
+        index=pd.Index(["a", "b"], name="key1"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_split_object_part_datetime():
+    # https://github.com/pandas-dev/pandas/pull/31616
+    df = pd.DataFrame(
+        {
+            "A": pd.date_range("2000", periods=4),
+            "B": ["a", "b", "c", "d"],
+            "C": [1, 2, 3, 4],
+            "D": ["b", "c", "d", "e"],
+            "E": pd.date_range("2000", periods=4),
+            "F": [1, 2, 3, 4],
+        }
+    ).astype(object)
+    result = df.groupby([0, 0, 0, 0]).min()
+    expected = pd.DataFrame(
+        {
+            "A": [pd.Timestamp("2000")],
+            "B": ["a"],
+            "C": [1],
+            "D": ["b"],
+            "E": [pd.Timestamp("2000")],
+            "F": [1],
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 def test_agg_cython_category_not_implemented_fallback():
     # https://github.com/pandas-dev/pandas/issues/31450
     df = pd.DataFrame({"col_num": [1, 1, 2, 3]})
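
For context, the regression this patch fixes is easy to reproduce. The snippet below is a minimal sketch distilled from the new ``test_agg_split_block`` test above; on pandas 1.0.0 the ``min()`` call tripped the old ``assert len(result._data.blocks) == 1``, because the Python-level aggregation fallback returned the single object-dtype block as a multi-block DataFrame:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "key1": ["a", "a", "b", "b", "a"],
        "key2": ["one", "two", "one", "two", "one"],
        "key3": ["three", "three", "three", "six", "six"],
    }
)

# "min" on object-dtype columns is not handled by the Cython aggregation,
# so groupby falls back to the per-block Python path patched above; before
# this change the split result raised an AssertionError (GH 31522).
result = df.groupby("key1").min()
print(result)
#      key2 key3
# key1
# a     one  six
# b     one  six
```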