Skip to content

Commit 454e2e1

Browse files
authored
CLN: Use generators when objects are re-iterated over in core/internals (#58319)
* Make _split a generator
* More iterators
* Remove typing
1 parent 3461db5 commit 454e2e1

File tree

4 files changed

+29
-41
lines changed

4 files changed

+29
-41
lines changed

pandas/core/frame.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -12925,12 +12925,12 @@ def _to_dict_of_blocks(self):
1292512925
Return a dict of dtype -> Constructor Types that
1292612926
each is a homogeneous dtype.
1292712927
12928-
Internal ONLY - only works for BlockManager
12928+
Internal ONLY.
1292912929
"""
1293012930
mgr = self._mgr
1293112931
return {
1293212932
k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self)
12933-
for k, v in mgr.to_dict().items()
12933+
for k, v in mgr.to_iter_dict()
1293412934
}
1293512935

1293612936
@property

pandas/core/internals/blocks.py

+10-13
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@
118118

119119
if TYPE_CHECKING:
120120
from collections.abc import (
121+
Generator,
121122
Iterable,
122123
Sequence,
123124
)
@@ -385,20 +386,18 @@ def _split_op_result(self, result: ArrayLike) -> list[Block]:
385386
return [nb]
386387

387388
@final
388-
def _split(self) -> list[Block]:
389+
def _split(self) -> Generator[Block, None, None]:
389390
"""
390391
Split a block into single-column blocks.
391392
"""
392393
assert self.ndim == 2
393394

394-
new_blocks = []
395395
for i, ref_loc in enumerate(self._mgr_locs):
396396
vals = self.values[slice(i, i + 1)]
397397

398398
bp = BlockPlacement(ref_loc)
399399
nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs)
400-
new_blocks.append(nb)
401-
return new_blocks
400+
yield nb
402401

403402
@final
404403
def split_and_operate(self, func, *args, **kwargs) -> list[Block]:
@@ -537,7 +536,9 @@ def convert_dtypes(
537536
rbs = []
538537
for blk in blks:
539538
# Determine dtype column by column
540-
sub_blks = [blk] if blk.ndim == 1 or self.shape[0] == 1 else blk._split()
539+
sub_blks = (
540+
[blk] if blk.ndim == 1 or self.shape[0] == 1 else list(blk._split())
541+
)
541542
dtypes = [
542543
convert_dtypes(
543544
b.values,
@@ -1190,8 +1191,7 @@ def putmask(self, mask, new) -> list[Block]:
11901191
is_array = isinstance(new, np.ndarray)
11911192

11921193
res_blocks = []
1193-
nbs = self._split()
1194-
for i, nb in enumerate(nbs):
1194+
for i, nb in enumerate(self._split()):
11951195
n = new
11961196
if is_array:
11971197
# we have a different value per-column
@@ -1255,8 +1255,7 @@ def where(self, other, cond) -> list[Block]:
12551255
is_array = isinstance(other, (np.ndarray, ExtensionArray))
12561256

12571257
res_blocks = []
1258-
nbs = self._split()
1259-
for i, nb in enumerate(nbs):
1258+
for i, nb in enumerate(self._split()):
12601259
oth = other
12611260
if is_array:
12621261
# we have a different value per-column
@@ -1698,8 +1697,7 @@ def where(self, other, cond) -> list[Block]:
16981697
is_array = isinstance(orig_other, (np.ndarray, ExtensionArray))
16991698

17001699
res_blocks = []
1701-
nbs = self._split()
1702-
for i, nb in enumerate(nbs):
1700+
for i, nb in enumerate(self._split()):
17031701
n = orig_other
17041702
if is_array:
17051703
# we have a different value per-column
@@ -1760,8 +1758,7 @@ def putmask(self, mask, new) -> list[Block]:
17601758
is_array = isinstance(orig_new, (np.ndarray, ExtensionArray))
17611759

17621760
res_blocks = []
1763-
nbs = self._split()
1764-
for i, nb in enumerate(nbs):
1761+
for i, nb in enumerate(self._split()):
17651762
n = orig_new
17661763
if is_array:
17671764
# we have a different value per-column

pandas/core/internals/managers.py

+16-25
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@
9292
)
9393

9494
if TYPE_CHECKING:
95+
from collections.abc import Generator
96+
9597
from pandas._typing import (
9698
ArrayLike,
9799
AxisInt,
@@ -645,8 +647,7 @@ def get_bool_data(self) -> Self:
645647
new_blocks.append(blk)
646648

647649
elif blk.is_object:
648-
nbs = blk._split()
649-
new_blocks.extend(nb for nb in nbs if nb.is_bool)
650+
new_blocks.extend(nb for nb in blk._split() if nb.is_bool)
650651

651652
return self._combine(new_blocks)
652653

@@ -1525,7 +1526,9 @@ def _insert_update_mgr_locs(self, loc) -> None:
15251526
When inserting a new Block at location 'loc', we increment
15261527
all of the mgr_locs of blocks above that by one.
15271528
"""
1528-
for blkno, count in _fast_count_smallints(self.blknos[loc:]):
1529+
# Faster version of set(arr) for sequences of small numbers
1530+
blknos = np.bincount(self.blknos[loc:]).nonzero()[0]
1531+
for blkno in blknos:
15291532
# .620 this way, .326 of which is in increment_above
15301533
blk = self.blocks[blkno]
15311534
blk._mgr_locs = blk._mgr_locs.increment_above(loc)
@@ -1597,7 +1600,7 @@ def grouped_reduce(self, func: Callable) -> Self:
15971600
nrows = 0
15981601
else:
15991602
nrows = result_blocks[0].values.shape[-1]
1600-
index = Index(range(nrows))
1603+
index = default_index(nrows)
16011604

16021605
return type(self).from_blocks(result_blocks, [self.axes[0], index])
16031606

@@ -1735,21 +1738,18 @@ def unstack(self, unstacker, fill_value) -> BlockManager:
17351738
bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False)
17361739
return bm
17371740

1738-
def to_dict(self) -> dict[str, Self]:
1741+
def to_iter_dict(self) -> Generator[tuple[str, Self], None, None]:
17391742
"""
1740-
Return a dict of str(dtype) -> BlockManager
1743+
Yield a tuple of (str(dtype), BlockManager)
17411744
17421745
Returns
17431746
-------
1744-
values : a dict of dtype -> BlockManager
1747+
values : a tuple of (str(dtype), BlockManager)
17451748
"""
1746-
1747-
bd: dict[str, list[Block]] = {}
1748-
for b in self.blocks:
1749-
bd.setdefault(str(b.dtype), []).append(b)
1750-
1751-
# TODO(EA2D): the combine will be unnecessary with 2D EAs
1752-
return {dtype: self._combine(blocks) for dtype, blocks in bd.items()}
1749+
key = lambda block: str(block.dtype)
1750+
for dtype, blocks in itertools.groupby(sorted(self.blocks, key=key), key=key):
1751+
# TODO(EA2D): the combine will be unnecessary with 2D EAs
1752+
yield dtype, self._combine(list(blocks))
17531753

17541754
def as_array(
17551755
self,
@@ -2330,7 +2330,7 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]:
23302330

23312331

23322332
def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
2333-
tuples = list(enumerate(arrays))
2333+
tuples = enumerate(arrays)
23342334

23352335
if not consolidate:
23362336
return _tuples_to_blocks_no_consolidate(tuples, refs)
@@ -2351,7 +2351,7 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list
23512351
if issubclass(dtype.type, (str, bytes)):
23522352
dtype = np.dtype(object)
23532353

2354-
values, placement = _stack_arrays(list(tup_block), dtype)
2354+
values, placement = _stack_arrays(tup_block, dtype)
23552355
if is_dtlike:
23562356
values = ensure_wrapped_if_datetimelike(values)
23572357
blk = block_type(values, placement=BlockPlacement(placement), ndim=2)
@@ -2450,15 +2450,6 @@ def _merge_blocks(
24502450
return blocks, False
24512451

24522452

2453-
def _fast_count_smallints(arr: npt.NDArray[np.intp]):
2454-
"""Faster version of set(arr) for sequences of small numbers."""
2455-
counts = np.bincount(arr)
2456-
nz = counts.nonzero()[0]
2457-
# Note: list(zip(...)) outperforms list(np.c_[nz, counts[nz]]) here,
2458-
# in one benchmark by a factor of 11
2459-
return zip(nz, counts[nz])
2460-
2461-
24622453
def _preprocess_slice_or_indexer(
24632454
slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool
24642455
):

pandas/tests/internals/test_internals.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ def test_split(self):
347347
# GH#37799
348348
values = np.random.default_rng(2).standard_normal((3, 4))
349349
blk = new_block(values, placement=BlockPlacement([3, 1, 6]), ndim=2)
350-
result = blk._split()
350+
result = list(blk._split())
351351

352352
# check that we get views, not copies
353353
values[:] = -9999

0 commit comments

Comments
 (0)