Commit 188ce73

REF: Avoid post-processing in blockwise op (#35356)
1 parent a9cb64a commit 188ce73

File tree

1 file changed: +47 −50 lines


pandas/core/groupby/generic.py

+47 −50

@@ -1029,11 +1029,36 @@ def _cython_agg_blocks(
         agg_blocks: List[Block] = []
         new_items: List[np.ndarray] = []
         deleted_items: List[np.ndarray] = []
-        # Some object-dtype blocks might be split into List[Block[T], Block[U]]
-        split_items: List[np.ndarray] = []
-        split_frames: List[DataFrame] = []

         no_result = object()
+
+        def cast_result_block(result, block: "Block", how: str) -> "Block":
+            # see if we can cast the block to the desired dtype
+            # this may not be the original dtype
+            assert not isinstance(result, DataFrame)
+            assert result is not no_result
+
+            dtype = maybe_cast_result_dtype(block.dtype, how)
+            result = maybe_downcast_numeric(result, dtype)
+
+            if block.is_extension and isinstance(result, np.ndarray):
+                # e.g. block.values was an IntegerArray
+                # (1, N) case can occur if block.values was Categorical
+                # and result is ndarray[object]
+                # TODO(EA2D): special casing not needed with 2D EAs
+                assert result.ndim == 1 or result.shape[0] == 1
+                try:
+                    # Cast back if feasible
+                    result = type(block.values)._from_sequence(
+                        result.ravel(), dtype=block.values.dtype
+                    )
+                except (ValueError, TypeError):
+                    # reshape to be valid for non-Extension Block
+                    result = result.reshape(1, -1)
+
+            agg_block: Block = block.make_block(result)
+            return agg_block
+
         for block in data.blocks:
             # Avoid inheriting result from earlier in the loop
             result = no_result
@@ -1065,9 +1090,9 @@ def _cython_agg_blocks(
                 # not try to add missing categories if grouping over multiple
                 # Categoricals. This will done by later self._reindex_output()
                 # Doing it here creates an error. See GH#34951
-                s = get_groupby(obj, self.grouper, observed=True)
+                sgb = get_groupby(obj, self.grouper, observed=True)
                 try:
-                    result = s.aggregate(lambda x: alt(x, axis=self.axis))
+                    result = sgb.aggregate(lambda x: alt(x, axis=self.axis))
                 except TypeError:
                     # we may have an exception in trying to aggregate
                     # continue and exclude the block
@@ -1081,54 +1106,26 @@ def _cython_agg_blocks(
                         # about a single block input returning a single block output
                         # is a lie. To keep the code-path for the typical non-split case
                         # clean, we choose to clean up this mess later on.
-                        split_items.append(locs)
-                        split_frames.append(result)
-                        continue
-
-                    assert len(result._mgr.blocks) == 1
-                    result = result._mgr.blocks[0].values
-                    if isinstance(result, np.ndarray) and result.ndim == 1:
-                        result = result.reshape(1, -1)
-
-            assert not isinstance(result, DataFrame)
-
-            if result is not no_result:
-                # see if we can cast the block to the desired dtype
-                # this may not be the original dtype
-                dtype = maybe_cast_result_dtype(block.dtype, how)
-                result = maybe_downcast_numeric(result, dtype)
-
-                if block.is_extension and isinstance(result, np.ndarray):
-                    # e.g. block.values was an IntegerArray
-                    # (1, N) case can occur if block.values was Categorical
-                    # and result is ndarray[object]
-                    # TODO(EA2D): special casing not needed with 2D EAs
-                    assert result.ndim == 1 or result.shape[0] == 1
-                    try:
-                        # Cast back if feasible
-                        result = type(block.values)._from_sequence(
-                            result.ravel(), dtype=block.values.dtype
-                        )
-                    except (ValueError, TypeError):
-                        # reshape to be valid for non-Extension Block
-                        result = result.reshape(1, -1)
-
-                agg_block: Block = block.make_block(result)
-
-                new_items.append(locs)
-                agg_blocks.append(agg_block)
+                        assert len(locs) == result.shape[1]
+                        for i, loc in enumerate(locs):
+                            new_items.append(np.array([loc], dtype=locs.dtype))
+                            agg_block = result.iloc[:, [i]]._mgr.blocks[0]
+                            agg_blocks.append(agg_block)
+                    else:
+                        result = result._mgr.blocks[0].values
+                        if isinstance(result, np.ndarray) and result.ndim == 1:
+                            result = result.reshape(1, -1)
+                        agg_block = cast_result_block(result, block, how)
+                        new_items.append(locs)
+                        agg_blocks.append(agg_block)
+            else:
+                agg_block = cast_result_block(result, block, how)
+                new_items.append(locs)
+                agg_blocks.append(agg_block)

-        if not (agg_blocks or split_frames):
+        if not agg_blocks:
             raise DataError("No numeric types to aggregate")

-        if split_items:
-            # Clean up the mess left over from split blocks.
-            for locs, result in zip(split_items, split_frames):
-                assert len(locs) == result.shape[1]
-                for i, loc in enumerate(locs):
-                    new_items.append(np.array([loc], dtype=locs.dtype))
-                    agg_blocks.append(result.iloc[:, [i]]._mgr.blocks[0])
-
         # reset the locs in the blocks to correspond to our
         # current ordering
         indexer = np.concatenate(new_items)
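
The net effect of the change, read from the diff: the dtype-casting logic moves into a local cast_result_block helper, and a DataFrame result that splits into several blocks is decomposed into single-column pieces inside the loop, so new_items and agg_blocks no longer need the post-loop cleanup pass over split_items/split_frames. Below is a minimal, self-contained sketch of that control-flow change only, using hypothetical names (agg_before, agg_after, plain NumPy tuples standing in for pandas Blocks); it is an illustration of the pattern, not the pandas implementation, and it omits the dtype casting entirely.

from typing import Callable, List, Tuple

import numpy as np

# A toy "block" is just (column locations, 2D values); in pandas it is a
# Block object inside the BlockManager.
ToyBlock = Tuple[np.ndarray, np.ndarray]


def agg_before(blocks: List[ToyBlock], agg: Callable[[np.ndarray], np.ndarray]):
    # Old shape of the loop: stash awkward multi-row results in side lists
    # and flatten them into new_items/agg_results after the loop.
    new_items: List[np.ndarray] = []
    agg_results: List[np.ndarray] = []
    split_items: List[np.ndarray] = []
    split_results: List[np.ndarray] = []
    for locs, values in blocks:
        result = np.atleast_2d(agg(values))
        if result.shape[0] != 1:
            # one input block produced several output pieces ("split" case)
            split_items.append(locs)
            split_results.append(result)
            continue
        new_items.append(locs)
        agg_results.append(result)
    # the post-processing pass that the commit removes
    for locs, result in zip(split_items, split_results):
        for i, loc in enumerate(locs):
            new_items.append(np.array([loc], dtype=locs.dtype))
            agg_results.append(result[[i]])
    return new_items, agg_results


def agg_after(blocks: List[ToyBlock], agg: Callable[[np.ndarray], np.ndarray]):
    # New shape of the loop: decompose a split result as soon as it is
    # produced, so no side lists and no cleanup pass are needed.
    new_items: List[np.ndarray] = []
    agg_results: List[np.ndarray] = []
    for locs, values in blocks:
        result = np.atleast_2d(agg(values))
        if result.shape[0] != 1:
            for i, loc in enumerate(locs):
                new_items.append(np.array([loc], dtype=locs.dtype))
                agg_results.append(result[[i]])
        else:
            new_items.append(locs)
            agg_results.append(result)
    return new_items, agg_results

Both versions return the same pieces, possibly in a different order across blocks; in the real code that ordering is reconciled afterwards anyway via the indexer built from new_items (indexer = np.concatenate(new_items)), which is why the single in-loop path can replace the post-processing step.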
