
CLN: Use generators when objects are re-iterated over in core/internals #58319

Merged · 4 commits · Apr 22, 2024
4 changes: 2 additions & 2 deletions pandas/core/frame.py
@@ -12914,12 +12914,12 @@ def _to_dict_of_blocks(self):
Return a dict of dtype -> Constructor Types that
each is a homogeneous dtype.

Internal ONLY - only works for BlockManager
Internal ONLY.
"""
mgr = self._mgr
return {
k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self)
for k, v in mgr.to_dict().items()
for k, v in mgr.to_iter_dict()
}

@property
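A note on the hunk above: the point of consuming to_iter_dict() directly is that no intermediate dict of combined managers has to be built before the comprehension runs; each (dtype, manager) pair is produced on demand. A minimal standalone sketch of the pattern, with a hypothetical expensive_combine standing in for BlockManager._combine:

from collections.abc import Generator

def expensive_combine(blocks: list[int]) -> int:
    # Stand-in for BlockManager._combine; pretend this is costly.
    return sum(blocks)

def to_dict(grouped: dict[str, list[int]]) -> dict[str, int]:
    # Eager: every group is combined and stored before the caller sees any.
    return {dtype: expensive_combine(blks) for dtype, blks in grouped.items()}

def to_iter_dict(grouped: dict[str, list[int]]) -> Generator[tuple[str, int], None, None]:
    # Lazy: each pair is combined only when the caller iterates to it.
    for dtype, blks in grouped.items():
        yield dtype, expensive_combine(blks)

grouped = {"int64": [1, 2], "float64": [3, 4]}
assert dict(to_iter_dict(grouped)) == to_dict(grouped)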
23 changes: 10 additions & 13 deletions pandas/core/internals/blocks.py
@@ -118,6 +118,7 @@

if TYPE_CHECKING:
from collections.abc import (
Generator,
Iterable,
Sequence,
)
@@ -385,20 +386,18 @@ def _split_op_result(self, result: ArrayLike) -> list[Block]:
return [nb]

@final
def _split(self) -> list[Block]:
def _split(self) -> Generator[Block, None, None]:
"""
Split a block into a list of single-column blocks.
"""
assert self.ndim == 2

new_blocks = []
for i, ref_loc in enumerate(self._mgr_locs):
vals = self.values[slice(i, i + 1)]

bp = BlockPlacement(ref_loc)
nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs)
new_blocks.append(nb)
return new_blocks
yield nb

@final
def split_and_operate(self, func, *args, **kwargs) -> list[Block]:
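Between the hunks, a note on what the _split rewrite changes for callers: a generator avoids allocating the full list of single-column blocks, but it can be consumed only once — which is why the convert_dtypes call site in the next hunk wraps it in list() before reusing it, while the putmask/where loops further down iterate it a single time and need no wrapper. A toy sketch of the trade-off (the helpers here are illustrative, not pandas code):

def split_eager(values: list[int]) -> list[int]:
    # Old shape of _split: build the whole list before returning.
    out = []
    for v in values:
        out.append(v * 2)
    return out

def split_lazy(values: list[int]):
    # New shape: yield items one at a time; nothing is stored up front.
    for v in values:
        yield v * 2

gen = split_lazy([1, 2, 3])
assert list(gen) == split_eager([1, 2, 3])
# A generator is single-pass: a second iteration sees nothing, which is
# why call sites that re-iterate must materialize it with list() first.
assert list(gen) == []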
@@ -537,7 +536,9 @@ def convert_dtypes(
rbs = []
for blk in blks:
# Determine dtype column by column
sub_blks = [blk] if blk.ndim == 1 or self.shape[0] == 1 else blk._split()
sub_blks = (
[blk] if blk.ndim == 1 or self.shape[0] == 1 else list(blk._split())
)
dtypes = [
convert_dtypes(
b.values,
@@ -1190,8 +1191,7 @@ def putmask(self, mask, new) -> list[Block]:
is_array = isinstance(new, np.ndarray)

res_blocks = []
nbs = self._split()
for i, nb in enumerate(nbs):
for i, nb in enumerate(self._split()):
n = new
if is_array:
# we have a different value per-column
@@ -1255,8 +1255,7 @@ def where(self, other, cond) -> list[Block]:
is_array = isinstance(other, (np.ndarray, ExtensionArray))

res_blocks = []
nbs = self._split()
for i, nb in enumerate(nbs):
for i, nb in enumerate(self._split()):
oth = other
if is_array:
# we have a different value per-column
@@ -1698,8 +1697,7 @@ def where(self, other, cond) -> list[Block]:
is_array = isinstance(orig_other, (np.ndarray, ExtensionArray))

res_blocks = []
nbs = self._split()
for i, nb in enumerate(nbs):
for i, nb in enumerate(self._split()):
n = orig_other
if is_array:
# we have a different value per-column
@@ -1760,8 +1758,7 @@ def putmask(self, mask, new) -> list[Block]:
is_array = isinstance(orig_new, (np.ndarray, ExtensionArray))

res_blocks = []
nbs = self._split()
for i, nb in enumerate(nbs):
for i, nb in enumerate(self._split()):
n = orig_new
if is_array:
# we have a different value per-column
41 changes: 16 additions & 25 deletions pandas/core/internals/managers.py
@@ -92,6 +92,8 @@
)

if TYPE_CHECKING:
from collections.abc import Generator

from pandas._typing import (
ArrayLike,
AxisInt,
@@ -645,8 +647,7 @@ def get_bool_data(self) -> Self:
new_blocks.append(blk)

elif blk.is_object:
nbs = blk._split()
new_blocks.extend(nb for nb in nbs if nb.is_bool)
new_blocks.extend(nb for nb in blk._split() if nb.is_bool)

return self._combine(new_blocks)

@@ -1525,7 +1526,9 @@ def _insert_update_mgr_locs(self, loc) -> None:
When inserting a new Block at location 'loc', we increment
all of the mgr_locs of blocks above that by one.
"""
for blkno, count in _fast_count_smallints(self.blknos[loc:]):
# Faster version of set(arr) for sequences of small numbers
blknos = np.bincount(self.blknos[loc:]).nonzero()[0]
Member:

What is the advantage of this over np.unique?

Member Author:

As mentioned in the comment, it appears to be more performant:

In [3]: import numpy as np

In [4]: arr = np.array([1])

In [5]: %timeit np.unique(arr)
1.95 µs ± 8.22 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

In [7]: %timeit np.bincount(arr).nonzero()[0]
416 ns ± 56.8 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

In [8]: arr = np.arange(1000)

In [9]: %timeit np.unique(arr)
5.97 µs ± 27.9 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

In [10]: %timeit np.bincount(arr).nonzero()[0]
3.51 µs ± 16.5 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

In [11]: arr = np.arange(100_000)

In [12]: %timeit np.unique(arr)
607 µs ± 4.32 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

In [13]: %timeit np.bincount(arr).nonzero()[0]
294 µs ± 3.27 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Member:

It's a nice idea, but unless this is getting called in a loop I'd have a slight preference for just going with np.unique, since it's more expressive. It seems like we might be relying on implementation details of bincount that could be hard to generalize.

Though if you really want this, I'd also say it's not a blocker for me.

Member Author:

Since this just transfers over the logic from _fast_count_smallints, I would prefer keeping it as-is in this PR, but I'm not opposed to changing it to np.unique in the future.

for blkno in blknos:
# .620 this way, .326 of which is in increment_above
blk = self.blocks[blkno]
blk._mgr_locs = blk._mgr_locs.increment_above(loc)
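To make the bincount/unique equivalence discussed above concrete: for a 1-D array of non-negative integers, np.bincount(arr) produces one counter per value in [0, arr.max()], so the positions with a nonzero count are exactly the sorted unique values. A quick self-contained check:

import numpy as np

arr = np.array([3, 1, 3, 0, 7, 1])
counts = np.bincount(arr)  # array([1, 2, 0, 2, 0, 0, 0, 1])
# nonzero() keeps the positions (i.e. the values) that occurred at least once.
assert (counts.nonzero()[0] == np.unique(arr)).all()
# This shortcut only holds for non-negative integers; np.unique is the
# general-purpose tool, which is the reviewer's point above.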
@@ -1597,7 +1600,7 @@ def grouped_reduce(self, func: Callable) -> Self:
nrows = 0
else:
nrows = result_blocks[0].values.shape[-1]
index = Index(range(nrows))
index = default_index(nrows)

return type(self).from_blocks(result_blocks, [self.axes[0], index])
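On the default_index(nrows) change just above: both spellings describe the positional index 0..n-1, but the internal default_index helper constructs a RangeIndex directly, presumably skipping the inference a generic Index(...) call goes through. A small check against the public API (default_index itself is internal, so this uses RangeIndex):

import pandas as pd

# RangeIndex stores only start/stop/step rather than materializing values.
assert pd.RangeIndex(5).equals(pd.Index(range(5)))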

@@ -1735,21 +1738,18 @@ def unstack(self, unstacker, fill_value) -> BlockManager:
bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False)
return bm

def to_dict(self) -> dict[str, Self]:
def to_iter_dict(self) -> Generator[tuple[str, Self], None, None]:
"""
Return a dict of str(dtype) -> BlockManager
Yield a tuple of (str(dtype), BlockManager)

Returns
-------
values : a dict of dtype -> BlockManager
values : a tuple of (str(dtype), BlockManager)
"""

bd: dict[str, list[Block]] = {}
for b in self.blocks:
bd.setdefault(str(b.dtype), []).append(b)

# TODO(EA2D): the combine will be unnecessary with 2D EAs
return {dtype: self._combine(blocks) for dtype, blocks in bd.items()}
key = lambda block: str(block.dtype)
for dtype, blocks in itertools.groupby(sorted(self.blocks, key=key), key=key):
# TODO(EA2D): the combine will be unnecessary with 2D EAs
yield dtype, self._combine(list(blocks))

def as_array(
self,
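One subtlety in the to_iter_dict rewrite above: itertools.groupby only groups adjacent elements that share a key, so the blocks are sorted by str(dtype) first — without the sort, same-dtype blocks separated by another dtype would come out as multiple groups. A minimal illustration:

import itertools

dtypes = ["int64", "float64", "int64"]  # stand-ins for str(block.dtype)
# Unsorted input: groupby sees three runs, so "int64" appears twice.
assert [k for k, _ in itertools.groupby(dtypes)] == ["int64", "float64", "int64"]
# Sorted input: equal keys become adjacent, one group per dtype.
assert [k for k, _ in itertools.groupby(sorted(dtypes))] == ["float64", "int64"]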
@@ -2330,7 +2330,7 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]:


def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
tuples = list(enumerate(arrays))
tuples = enumerate(arrays)

if not consolidate:
return _tuples_to_blocks_no_consolidate(tuples, refs)
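The tuples = enumerate(arrays) change above drops a list() call because every path through _form_blocks consumes the enumeration exactly once; enumerate returns a lazy, single-use iterator. A quick sketch of why that constraint matters:

arrays = ["a", "b", "c"]
tuples = enumerate(arrays)
# Lazy: no list of (index, array) pairs is allocated up front.
assert list(tuples) == [(0, "a"), (1, "b"), (2, "c")]
# The iterator is now exhausted; a second pass yields nothing, so the
# rewrite is only safe because the result is consumed a single time.
assert list(tuples) == []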
@@ -2351,7 +2351,7 @@ def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
if issubclass(dtype.type, (str, bytes)):
dtype = np.dtype(object)

values, placement = _stack_arrays(list(tup_block), dtype)
values, placement = _stack_arrays(tup_block, dtype)
if is_dtlike:
values = ensure_wrapped_if_datetimelike(values)
blk = block_type(values, placement=BlockPlacement(placement), ndim=2)
@@ -2450,15 +2450,6 @@ def _merge_blocks(
return blocks, False


def _fast_count_smallints(arr: npt.NDArray[np.intp]):
"""Faster version of set(arr) for sequences of small numbers."""
counts = np.bincount(arr)
nz = counts.nonzero()[0]
# Note: list(zip(...) outperforms list(np.c_[nz, counts[nz]]) here,
# in one benchmark by a factor of 11
return zip(nz, counts[nz])


def _preprocess_slice_or_indexer(
slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool
):
2 changes: 1 addition & 1 deletion pandas/tests/internals/test_internals.py
@@ -347,7 +347,7 @@ def test_split(self):
# GH#37799
values = np.random.default_rng(2).standard_normal((3, 4))
blk = new_block(values, placement=BlockPlacement([3, 1, 6]), ndim=2)
result = blk._split()
result = list(blk._split())

# check that we get views, not copies
values[:] = -9999
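The list() call in the test hunk above is needed because _split now returns a generator, which supports neither len() nor indexing and can be iterated only once. A toy illustration (not the pandas API):

def gen():
    yield from (1, 2, 3)

# len(gen()) or gen()[0] would raise TypeError on a raw generator,
# so the test materializes it before asserting:
result = list(gen())
assert len(result) == 3 and result[0] == 1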