From e1c9f1b1f15d2f1db656ffef0331742548ae9c7f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 16 Apr 2024 11:47:09 -0700 Subject: [PATCH 1/4] Have methods return generators instead of lists --- pandas/core/groupby/groupby.py | 5 ++--- pandas/core/groupby/ops.py | 12 ++++++------ pandas/core/indexes/base.py | 1 + pandas/core/indexes/multi.py | 8 ++------ pandas/core/internals/blocks.py | 9 ++++----- pandas/core/internals/concat.py | 19 ++++++++----------- pandas/io/formats/info.py | 2 +- pandas/io/sql.py | 2 +- pandas/tests/internals/test_internals.py | 2 +- 9 files changed, 26 insertions(+), 34 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bc37405b25a16..79d9f49a3b355 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1286,11 +1286,10 @@ def _insert_inaxis_grouper( ) # zip in reverse so we can always insert at loc 0 - for level, (name, lev, in_axis) in enumerate( + for level, (name, lev) in enumerate( zip( reversed(self._grouper.names), - reversed(self._grouper.get_group_levels()), - reversed([grp.in_axis for grp in self._grouper.groupings]), + self._grouper.get_group_levels(), ) ): if name is None: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0d88882c9b7ef..e79ef80bdaa57 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -70,6 +70,7 @@ if TYPE_CHECKING: from collections.abc import ( + Generator, Hashable, Iterator, Sequence, @@ -857,16 +858,15 @@ def _unob_index_and_ids( return unob_index, unob_ids @final - def get_group_levels(self) -> list[Index]: + def get_group_levels(self) -> Generator[Index, None, None]: # Note: only called from _insert_inaxis_grouper, which # is only called for BaseGrouper, never for BinGrouper result_index = self.result_index if len(self.groupings) == 1: - return [result_index] - return [ - result_index.get_level_values(level) - for level in 
range(result_index.nlevels) - ] + yield result_index + else: + for level in range(result_index.nlevels, -1, -1): + yield result_index.get_level_values(level) # ------------------------------------------------------------ # Aggregation functions diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 69f916bb3f769..bb3726eb444ab 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4919,6 +4919,7 @@ def _validate_fill_value(self, value): raise TypeError return value + @property def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9df0d26ce622a..21ce9b759f2df 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1351,13 +1351,14 @@ def __contains__(self, key: Any) -> bool: def dtype(self) -> np.dtype: return np.dtype("O") + @cache_readonly def _is_memory_usage_qualified(self) -> bool: """return a boolean if we need a qualified .info display""" def f(level) -> bool: return "mixed" in level or "string" in level or "unicode" in level - return any(f(level) for level in self._inferred_type_levels) + return any(f(level.inferred_type) for level in self.levels) # Cannot determine type of "memory_usage" @doc(Index.memory_usage) # type: ignore[has-type] @@ -1659,11 +1660,6 @@ def is_monotonic_decreasing(self) -> bool: # monotonic decreasing if and only if reverse is monotonic increasing return self[::-1].is_monotonic_increasing - @cache_readonly - def _inferred_type_levels(self) -> list[str]: - """return a list of the inferred types, one for each level""" - return [i.inferred_type for i in self.levels] - @doc(Index.duplicated) def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]: shape = tuple(len(lev) for lev in self.levels) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7be1d5d95ffdf..3fe1af101da48 100644 --- 
a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -118,6 +118,7 @@ if TYPE_CHECKING: from collections.abc import ( + Generator, Iterable, Sequence, ) @@ -385,20 +386,18 @@ def _split_op_result(self, result: ArrayLike) -> list[Block]: return [nb] @final - def _split(self) -> list[Block]: + def _split(self) -> Generator[Block, None, None]: """ - Split a block into a list of single-column blocks. + Split a block into each single-column block. """ assert self.ndim == 2 - new_blocks = [] for i, ref_loc in enumerate(self._mgr_locs): vals = self.values[slice(i, i + 1)] bp = BlockPlacement(ref_loc) nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs) - new_blocks.append(nb) - return new_blocks + yield nb @final def split_and_operate(self, func, *args, **kwargs) -> list[Block]: diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index d833dab5b820f..6be3d037c4ebe 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -49,7 +49,10 @@ ) if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import ( + Generator, + Sequence, + ) from pandas._typing import ( ArrayLike, @@ -118,12 +121,10 @@ def concatenate_managers( out.axes = axes return out - concat_plan = _get_combined_plan(mgrs) - blocks = [] values: ArrayLike - for placement, join_units in concat_plan: + for placement, join_units in _get_combined_plan(mgrs): unit = join_units[0] blk = unit.block @@ -258,14 +259,12 @@ def _concat_homogeneous_fastpath( def _get_combined_plan( mgrs: list[BlockManager], -) -> list[tuple[BlockPlacement, list[JoinUnit]]]: - plan = [] - +) -> Generator[tuple[BlockPlacement, list[JoinUnit]], None, None]: max_len = mgrs[0].shape[0] blknos_list = [mgr.blknos for mgr in mgrs] pairs = libinternals.get_concat_blkno_indexers(blknos_list) - for ind, (blknos, bp) in enumerate(pairs): + for blknos, bp in pairs: # assert bp.is_slice_like # assert len(bp) > 0 @@ -277,9 +276,7 @@ def 
_get_combined_plan( unit = JoinUnit(nb) units_for_bp.append(unit) - plan.append((bp, units_for_bp)) - - return plan + yield bp, units_for_bp def _get_block_for_concat_plan( diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index bb156f0fbf826..469dcfb76ba0b 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -422,7 +422,7 @@ def size_qualifier(self) -> str: # categories) if ( "object" in self.dtype_counts - or self.data.index._is_memory_usage_qualified() + or self.data.index._is_memory_usage_qualified ): size_qualifier = "+" return size_qualifier diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 8c4c4bac884e5..c0007c5e7d78c 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -177,7 +177,7 @@ def _convert_arrays_to_dataframe( result_arrays.append(ArrowExtensionArray(pa_array)) arrays = result_arrays # type: ignore[assignment] if arrays: - df = DataFrame(dict(zip(list(range(len(columns))), arrays))) + df = DataFrame(dict(zip(range(len(columns)), arrays))) df.columns = columns return df else: diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 92addeb29252a..43bcf84f901b1 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -347,7 +347,7 @@ def test_split(self): # GH#37799 values = np.random.default_rng(2).standard_normal((3, 4)) blk = new_block(values, placement=BlockPlacement([3, 1, 6]), ndim=2) - result = blk._split() + result = list(blk._split()) # check that we get views, not copies values[:] = -9999 From 9aeb5c953d0897e4f2c974e7e359e3c6cdfa106d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 Apr 2024 13:03:46 -0700 Subject: [PATCH 2/4] Fix ops method, undo block --- pandas/core/groupby/ops.py | 2 +- pandas/core/internals/blocks.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/ops.py 
b/pandas/core/groupby/ops.py index e79ef80bdaa57..effa94b1606bd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -865,7 +865,7 @@ def get_group_levels(self) -> Generator[Index, None, None]: if len(self.groupings) == 1: yield result_index else: - for level in range(result_index.nlevels, -1, -1): + for level in range(result_index.nlevels - 1, -1, -1): yield result_index.get_level_values(level) # ------------------------------------------------------------ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3fe1af101da48..7be1d5d95ffdf 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -118,7 +118,6 @@ if TYPE_CHECKING: from collections.abc import ( - Generator, Iterable, Sequence, ) @@ -386,18 +385,20 @@ def _split_op_result(self, result: ArrayLike) -> list[Block]: return [nb] @final - def _split(self) -> Generator[Block, None, None]: + def _split(self) -> list[Block]: """ - Split a block into each single-column block. + Split a block into a list of single-column blocks. 
""" assert self.ndim == 2 + new_blocks = [] for i, ref_loc in enumerate(self._mgr_locs): vals = self.values[slice(i, i + 1)] bp = BlockPlacement(ref_loc) nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs) - yield nb + new_blocks.append(nb) + return new_blocks @final def split_and_operate(self, func, *args, **kwargs) -> list[Block]: From 36b481f2e2fb37f0c9032dcef5a8b1817d72d0da Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 Apr 2024 13:05:01 -0700 Subject: [PATCH 3/4] Undo internals test --- pandas/tests/internals/test_internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 43bcf84f901b1..92addeb29252a 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -347,7 +347,7 @@ def test_split(self): # GH#37799 values = np.random.default_rng(2).standard_normal((3, 4)) blk = new_block(values, placement=BlockPlacement([3, 1, 6]), ndim=2) - result = list(blk._split()) + result = blk._split() # check that we get views, not copies values[:] = -9999 From b2d10e2cfd48a31b4e77650f9ae119b7d320c457 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 18 Apr 2024 16:58:08 -0700 Subject: [PATCH 4/4] Make _is_memory_usage_qualified cache_readonly too --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bb3726eb444ab..685291e690196 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4919,7 +4919,7 @@ def _validate_fill_value(self, value): raise TypeError return value - @property + @cache_readonly def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display.