diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b595e4d2158fc..f566dcfe1ee0e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12914,12 +12914,12 @@ def _to_dict_of_blocks(self): Return a dict of dtype -> Constructor Types that each is a homogeneous dtype. - Internal ONLY - only works for BlockManager + Internal ONLY. """ mgr = self._mgr return { k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) - for k, v in mgr.to_dict().items() + for k, v in mgr.to_iter_dict() } @property diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7be1d5d95ffdf..1b72c164f7945 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -118,6 +118,7 @@ if TYPE_CHECKING: from collections.abc import ( + Generator, Iterable, Sequence, ) @@ -385,20 +386,18 @@ def _split_op_result(self, result: ArrayLike) -> list[Block]: return [nb] @final - def _split(self) -> list[Block]: + def _split(self) -> Generator[Block, None, None]: """ Split a block into a list of single-column blocks. """ assert self.ndim == 2 - new_blocks = [] for i, ref_loc in enumerate(self._mgr_locs): vals = self.values[slice(i, i + 1)] bp = BlockPlacement(ref_loc) nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs) - new_blocks.append(nb) - return new_blocks + yield nb @final def split_and_operate(self, func, *args, **kwargs) -> list[Block]: @@ -537,7 +536,9 @@ def convert_dtypes( rbs = [] for blk in blks: # Determine dtype column by column - sub_blks = [blk] if blk.ndim == 1 or self.shape[0] == 1 else blk._split() + sub_blks = ( + [blk] if blk.ndim == 1 or self.shape[0] == 1 else list(blk._split()) + ) dtypes = [ convert_dtypes( b.values, @@ -1190,8 +1191,7 @@ def putmask(self, mask, new) -> list[Block]: is_array = isinstance(new, np.ndarray) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = new if is_array: # we have a different value per-column @@ -1255,8 +1255,7 @@ def where(self, other, cond) -> list[Block]: is_array = isinstance(other, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): oth = other if is_array: # we have a different value per-column @@ -1698,8 +1697,7 @@ def where(self, other, cond) -> list[Block]: is_array = isinstance(orig_other, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = orig_other if is_array: # we have a different value per-column @@ -1760,8 +1758,7 @@ def putmask(self, mask, new) -> list[Block]: is_array = isinstance(orig_new, (np.ndarray, ExtensionArray)) res_blocks = [] - nbs = self._split() - for i, nb in enumerate(nbs): + for i, nb in enumerate(self._split()): n = orig_new if is_array: # we have a different value per-column diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8fda9cd23b508..7c1bcbec1d3f2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -92,6 +92,8 @@ ) if TYPE_CHECKING: + from collections.abc import Generator + from pandas._typing import ( ArrayLike, AxisInt, @@ -645,8 +647,7 @@ def get_bool_data(self) -> Self: new_blocks.append(blk) elif blk.is_object: - nbs = blk._split() - new_blocks.extend(nb for nb in nbs if nb.is_bool) + new_blocks.extend(nb for nb in blk._split() if nb.is_bool) return self._combine(new_blocks) @@ -1525,7 +1526,9 @@ def _insert_update_mgr_locs(self, loc) -> None: When inserting a new Block at location 'loc', we increment all of the mgr_locs of blocks above that by one. """ - for blkno, count in _fast_count_smallints(self.blknos[loc:]): + # Faster version of set(arr) for sequences of small numbers + blknos = np.bincount(self.blknos[loc:]).nonzero()[0] + for blkno in blknos: # .620 this way, .326 of which is in increment_above blk = self.blocks[blkno] blk._mgr_locs = blk._mgr_locs.increment_above(loc) @@ -1597,7 +1600,7 @@ def grouped_reduce(self, func: Callable) -> Self: nrows = 0 else: nrows = result_blocks[0].values.shape[-1] - index = Index(range(nrows)) + index = default_index(nrows) return type(self).from_blocks(result_blocks, [self.axes[0], index]) @@ -1735,21 +1738,18 @@ def unstack(self, unstacker, fill_value) -> BlockManager: bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) return bm - def to_dict(self) -> dict[str, Self]: + def to_iter_dict(self) -> Generator[tuple[str, Self], None, None]: """ - Return a dict of str(dtype) -> BlockManager + Yield a tuple of (str(dtype), BlockManager) Returns ------- - values : a dict of dtype -> BlockManager + values : a tuple of (str(dtype), BlockManager) """ - - bd: dict[str, list[Block]] = {} - for b in self.blocks: - bd.setdefault(str(b.dtype), []).append(b) - - # TODO(EA2D): the combine will be unnecessary with 2D EAs - return {dtype: self._combine(blocks) for dtype, blocks in bd.items()} + key = lambda block: str(block.dtype) + for dtype, blocks in itertools.groupby(sorted(self.blocks, key=key), key=key): + # TODO(EA2D): the combine will be unnecessary with 2D EAs + yield dtype, self._combine(list(blocks)) def as_array( self, @@ -2330,7 +2330,7 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]: def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]: - tuples = list(enumerate(arrays)) + tuples = enumerate(arrays) if not consolidate: return _tuples_to_blocks_no_consolidate(tuples, refs) @@ -2351,7 +2351,7 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list if issubclass(dtype.type, (str, bytes)): dtype = np.dtype(object) - values, placement = _stack_arrays(list(tup_block), dtype) + values, placement = _stack_arrays(tup_block, dtype) if is_dtlike: values = ensure_wrapped_if_datetimelike(values) blk = block_type(values, placement=BlockPlacement(placement), ndim=2) @@ -2450,15 +2450,6 @@ def _merge_blocks( return blocks, False -def _fast_count_smallints(arr: npt.NDArray[np.intp]): - """Faster version of set(arr) for sequences of small numbers.""" - counts = np.bincount(arr) - nz = counts.nonzero()[0] - # Note: list(zip(...) outperforms list(np.c_[nz, counts[nz]]) here, - # in one benchmark by a factor of 11 - return zip(nz, counts[nz]) - - def _preprocess_slice_or_indexer( slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool ): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 92addeb29252a..43bcf84f901b1 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -347,7 +347,7 @@ def test_split(self): # GH#37799 values = np.random.default_rng(2).standard_normal((3, 4)) blk = new_block(values, placement=BlockPlacement([3, 1, 6]), ndim=2) - result = blk._split() + result = list(blk._split()) # check that we get views, not copies values[:] = -9999