From 2fe595299787068612f4ad06e55665c6e1078383 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 25 Feb 2020 20:50:50 -0800 Subject: [PATCH] PERF: lazify blknos and blklocs --- pandas/core/internals/concat.py | 8 +-- pandas/core/internals/managers.py | 59 +++++++++++++++----- pandas/tests/frame/indexing/test_indexing.py | 5 +- pandas/tests/frame/test_nonunique_indexes.py | 4 +- pandas/tests/internals/test_internals.py | 3 +- 5 files changed, 57 insertions(+), 22 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 515e1bcd761b6..7570f6eddbd9c 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -48,8 +48,8 @@ def get_mgr_concatenation_plan(mgr, indexers): if 0 in indexers: ax0_indexer = indexers.pop(0) - blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1) - blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1) + blknos = algos.take_1d(mgr.blknos, ax0_indexer, fill_value=-1) + blklocs = algos.take_1d(mgr.blklocs, ax0_indexer, fill_value=-1) else: if mgr._is_single_block: @@ -57,8 +57,8 @@ def get_mgr_concatenation_plan(mgr, indexers): return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] ax0_indexer = None - blknos = mgr._blknos - blklocs = mgr._blklocs + blknos = mgr.blknos + blklocs = mgr.blklocs plan = [] for blkno, placements in libinternals.get_blkno_placements(blknos, group=False): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 329bfdf543c62..f4e0535172e64 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -141,9 +141,37 @@ def __init__( if do_integrity_check: self._verify_integrity() + # Populate known_consolidated, blknos, and blklocs lazily self._known_consolidated = False + self._blknos = None + self._blklocs = None - self._rebuild_blknos_and_blklocs() + @property + def blknos(self): + """ + Suppose we want to find the array corresponding to our i'th column. 
+ + blknos[i] identifies the block from self.blocks that contains this column. + + blklocs[i] identifies the column of interest within + self.blocks[self.blknos[i]] + """ + if self._blknos is None: + # Note: these can be altered by other BlockManager methods. + self._rebuild_blknos_and_blklocs() + + return self._blknos + + @property + def blklocs(self): + """ + See blknos.__doc__ + """ + if self._blklocs is None: + # Note: these can be altered by other BlockManager methods. + self._rebuild_blknos_and_blklocs() + + return self._blklocs def make_empty(self, axes=None): """ return an empty BlockManager with the items axis of len 0 """ @@ -227,6 +255,7 @@ def _rebuild_blknos_and_blklocs(self): new_blklocs[rl.indexer] = np.arange(len(rl)) if (new_blknos == -1).any(): + # TODO: can we avoid this? it isn't cheap raise AssertionError("Gaps in blk ref_locs") self._blknos = new_blknos @@ -250,7 +279,7 @@ def get_dtype_counts(self): def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) - return algos.take_1d(dtypes, self._blknos, allow_fill=False) + return algos.take_1d(dtypes, self.blknos, allow_fill=False) def __getstate__(self): block_values = [b.values for b in self.blocks] @@ -944,8 +973,8 @@ def iget(self, i: int) -> "SingleBlockManager": """ Return the data as a SingleBlockManager. 
""" - block = self.blocks[self._blknos[i]] - values = block.iget(self._blklocs[i]) + block = self.blocks[self.blknos[i]] + values = block.iget(self.blklocs[i]) # shortcut for select a single-dim from a 2-dim BM return SingleBlockManager( @@ -973,7 +1002,7 @@ def delete(self, item): else: affected_start = is_deleted.nonzero()[0][0] - for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]): + for blkno, _ in _fast_count_smallints(self.blknos[affected_start:]): blk = self.blocks[blkno] bml = blk.mgr_locs blk_del = is_deleted[bml.indexer].nonzero()[0] @@ -1002,6 +1031,8 @@ def set(self, item, value): """ # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical + if self._blklocs is None and self.ndim > 1: + self._rebuild_blknos_and_blklocs() value_is_extension_type = is_extension_array_dtype(value) @@ -1038,8 +1069,9 @@ def value_getitem(placement): if isinstance(loc, int): loc = [loc] - blknos = self._blknos[loc] - blklocs = self._blklocs[loc].copy() + # Accessing public blknos ensures the private versions are initialized + blknos = self.blknos[loc] + blklocs = self.blklocs[loc].copy() unfit_mgr_locs = [] unfit_val_locs = [] @@ -1141,7 +1173,7 @@ def insert(self, loc: int, item, value, allow_duplicates: bool = False): block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) - for blkno, count in _fast_count_smallints(self._blknos[loc:]): + for blkno, count in _fast_count_smallints(self.blknos[loc:]): blk = self.blocks[blkno] if count == len(blk.mgr_locs): blk.mgr_locs = blk.mgr_locs.add(1) @@ -1150,7 +1182,8 @@ def insert(self, loc: int, item, value, allow_duplicates: bool = False): new_mgr_locs[new_mgr_locs >= loc] += 1 blk.mgr_locs = new_mgr_locs - if loc == self._blklocs.shape[0]: + # Accessing public blklocs ensures the private versions are initialized + if loc == self.blklocs.shape[0]: # np.append is a lot faster, let's use it if we can. 
self._blklocs = np.append(self._blklocs, 0) self._blknos = np.append(self._blknos, len(self.blocks)) @@ -1268,14 +1301,14 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): ] if sl_type in ("slice", "mask"): - blknos = self._blknos[slobj] - blklocs = self._blklocs[slobj] + blknos = self.blknos[slobj] + blklocs = self.blklocs[slobj] else: blknos = algos.take_1d( - self._blknos, slobj, fill_value=-1, allow_fill=allow_fill + self.blknos, slobj, fill_value=-1, allow_fill=allow_fill ) blklocs = algos.take_1d( - self._blklocs, slobj, fill_value=-1, allow_fill=allow_fill + self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill ) # When filling blknos, make sure blknos is updated before appending to diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 636cca0df9d4e..7b64227763ecc 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -2209,16 +2209,17 @@ def test_object_casting_indexing_wraps_datetimelike(): assert isinstance(ser.values[2], pd.Timedelta) mgr = df._data + mgr._rebuild_blknos_and_blklocs() arr = mgr.fast_xs(0) assert isinstance(arr[1], pd.Timestamp) assert isinstance(arr[2], pd.Timedelta) - blk = mgr.blocks[mgr._blknos[1]] + blk = mgr.blocks[mgr.blknos[1]] assert blk.dtype == "M8[ns]" # we got the right block val = blk.iget((0, 0)) assert isinstance(val, pd.Timestamp) - blk = mgr.blocks[mgr._blknos[2]] + blk = mgr.blocks[mgr.blknos[2]] assert blk.dtype == "m8[ns]" # we got the right block val = blk.iget((0, 0)) assert isinstance(val, pd.Timedelta) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 32ead406a3e86..233c0f4bd3544 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -474,8 +474,8 @@ def test_columns_with_dups(self): ) df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) - 
assert len(df._data._blknos) == len(df.columns) - assert len(df._data._blklocs) == len(df.columns) + assert len(df._data.blknos) == len(df.columns) + assert len(df._data.blklocs) == len(df.columns) # testing iloc for i in range(len(df.columns)): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 27b0500983afd..378446398404e 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -309,7 +309,8 @@ def test_duplicate_ref_loc_failure(self): msg = "Gaps in blk ref_locs" with pytest.raises(AssertionError, match=msg): - BlockManager(blocks, axes) + mgr = BlockManager(blocks, axes) + mgr._rebuild_blknos_and_blklocs() blocks[0].mgr_locs = np.array([0]) blocks[1].mgr_locs = np.array([1])