Skip to content

Commit 86ed2b6

Browse files
authored
PERF: lazify blknos and blklocs (#32261)
1 parent 9aaaf1b commit 86ed2b6

File tree

5 files changed

+57
-22
lines changed

5 files changed

+57
-22
lines changed

pandas/core/internals/concat.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -48,17 +48,17 @@ def get_mgr_concatenation_plan(mgr, indexers):
4848

4949
if 0 in indexers:
5050
ax0_indexer = indexers.pop(0)
51-
blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1)
52-
blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1)
51+
blknos = algos.take_1d(mgr.blknos, ax0_indexer, fill_value=-1)
52+
blklocs = algos.take_1d(mgr.blklocs, ax0_indexer, fill_value=-1)
5353
else:
5454

5555
if mgr._is_single_block:
5656
blk = mgr.blocks[0]
5757
return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]
5858

5959
ax0_indexer = None
60-
blknos = mgr._blknos
61-
blklocs = mgr._blklocs
60+
blknos = mgr.blknos
61+
blklocs = mgr.blklocs
6262

6363
plan = []
6464
for blkno, placements in libinternals.get_blkno_placements(blknos, group=False):

pandas/core/internals/managers.py

+46-13
Original file line numberDiff line numberDiff line change
@@ -141,9 +141,37 @@ def __init__(
141141
if do_integrity_check:
142142
self._verify_integrity()
143143

144+
# Populate known_consolidated, blknos, and blklocs lazily
144145
self._known_consolidated = False
146+
self._blknos = None
147+
self._blklocs = None
145148

146-
self._rebuild_blknos_and_blklocs()
149+
@property
150+
def blknos(self):
151+
"""
152+
Suppose we want to find the array corresponding to our i'th column.
153+
154+
blknos[i] identifies the block from self.blocks that contains this column.
155+
156+
blklocs[i] identifies the column of interest within
157+
self.blocks[self.blknos[i]]
158+
"""
159+
if self._blknos is None:
160+
# Note: these can be altered by other BlockManager methods.
161+
self._rebuild_blknos_and_blklocs()
162+
163+
return self._blknos
164+
165+
@property
166+
def blklocs(self):
167+
"""
168+
See blknos.__doc__
169+
"""
170+
if self._blklocs is None:
171+
# Note: these can be altered by other BlockManager methods.
172+
self._rebuild_blknos_and_blklocs()
173+
174+
return self._blklocs
147175

148176
def make_empty(self, axes=None) -> "BlockManager":
149177
""" return an empty BlockManager with the items axis of len 0 """
@@ -230,6 +258,7 @@ def _rebuild_blknos_and_blklocs(self) -> None:
230258
new_blklocs[rl.indexer] = np.arange(len(rl))
231259

232260
if (new_blknos == -1).any():
261+
# TODO: can we avoid this? it isn't cheap
233262
raise AssertionError("Gaps in blk ref_locs")
234263

235264
self._blknos = new_blknos
@@ -253,7 +282,7 @@ def get_dtype_counts(self):
253282

254283
def get_dtypes(self):
255284
dtypes = np.array([blk.dtype for blk in self.blocks])
256-
return algos.take_1d(dtypes, self._blknos, allow_fill=False)
285+
return algos.take_1d(dtypes, self.blknos, allow_fill=False)
257286

258287
def __getstate__(self):
259288
block_values = [b.values for b in self.blocks]
@@ -951,8 +980,8 @@ def iget(self, i: int) -> "SingleBlockManager":
951980
"""
952981
Return the data as a SingleBlockManager.
953982
"""
954-
block = self.blocks[self._blknos[i]]
955-
values = block.iget(self._blklocs[i])
983+
block = self.blocks[self.blknos[i]]
984+
values = block.iget(self.blklocs[i])
956985

957986
# shortcut for select a single-dim from a 2-dim BM
958987
return SingleBlockManager(
@@ -980,7 +1009,7 @@ def delete(self, item):
9801009
else:
9811010
affected_start = is_deleted.nonzero()[0][0]
9821011

983-
for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]):
1012+
for blkno, _ in _fast_count_smallints(self.blknos[affected_start:]):
9841013
blk = self.blocks[blkno]
9851014
bml = blk.mgr_locs
9861015
blk_del = is_deleted[bml.indexer].nonzero()[0]
@@ -1026,6 +1055,8 @@ def iset(self, loc: Union[int, slice, np.ndarray], value):
10261055
"""
10271056
# FIXME: refactor, clearly separate broadcasting & zip-like assignment
10281057
# can prob also fix the various if tests for sparse/categorical
1058+
if self._blklocs is None and self.ndim > 1:
1059+
self._rebuild_blknos_and_blklocs()
10291060

10301061
value_is_extension_type = is_extension_array_dtype(value)
10311062

@@ -1055,8 +1086,9 @@ def value_getitem(placement):
10551086
if isinstance(loc, int):
10561087
loc = [loc]
10571088

1058-
blknos = self._blknos[loc]
1059-
blklocs = self._blklocs[loc].copy()
1089+
# Accessing the public blknos/blklocs properties ensures the cached private arrays are initialized
1090+
blknos = self.blknos[loc]
1091+
blklocs = self.blklocs[loc].copy()
10601092

10611093
unfit_mgr_locs = []
10621094
unfit_val_locs = []
@@ -1161,7 +1193,7 @@ def insert(self, loc: int, item, value, allow_duplicates: bool = False):
11611193

11621194
block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1))
11631195

1164-
for blkno, count in _fast_count_smallints(self._blknos[loc:]):
1196+
for blkno, count in _fast_count_smallints(self.blknos[loc:]):
11651197
blk = self.blocks[blkno]
11661198
if count == len(blk.mgr_locs):
11671199
blk.mgr_locs = blk.mgr_locs.add(1)
@@ -1170,7 +1202,8 @@ def insert(self, loc: int, item, value, allow_duplicates: bool = False):
11701202
new_mgr_locs[new_mgr_locs >= loc] += 1
11711203
blk.mgr_locs = new_mgr_locs
11721204

1173-
if loc == self._blklocs.shape[0]:
1205+
# Accessing the public blklocs property ensures the private _blknos/_blklocs used below are initialized
1206+
if loc == self.blklocs.shape[0]:
11741207
# np.append is a lot faster, let's use it if we can.
11751208
self._blklocs = np.append(self._blklocs, 0)
11761209
self._blknos = np.append(self._blknos, len(self.blocks))
@@ -1301,14 +1334,14 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
13011334
]
13021335

13031336
if sl_type in ("slice", "mask"):
1304-
blknos = self._blknos[slobj]
1305-
blklocs = self._blklocs[slobj]
1337+
blknos = self.blknos[slobj]
1338+
blklocs = self.blklocs[slobj]
13061339
else:
13071340
blknos = algos.take_1d(
1308-
self._blknos, slobj, fill_value=-1, allow_fill=allow_fill
1341+
self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
13091342
)
13101343
blklocs = algos.take_1d(
1311-
self._blklocs, slobj, fill_value=-1, allow_fill=allow_fill
1344+
self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
13121345
)
13131346

13141347
# When filling blknos, make sure blknos is updated before appending to

pandas/tests/frame/indexing/test_indexing.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -2213,16 +2213,17 @@ def test_object_casting_indexing_wraps_datetimelike():
22132213
assert isinstance(ser.values[2], pd.Timedelta)
22142214

22152215
mgr = df._data
2216+
mgr._rebuild_blknos_and_blklocs()
22162217
arr = mgr.fast_xs(0)
22172218
assert isinstance(arr[1], pd.Timestamp)
22182219
assert isinstance(arr[2], pd.Timedelta)
22192220

2220-
blk = mgr.blocks[mgr._blknos[1]]
2221+
blk = mgr.blocks[mgr.blknos[1]]
22212222
assert blk.dtype == "M8[ns]" # we got the right block
22222223
val = blk.iget((0, 0))
22232224
assert isinstance(val, pd.Timestamp)
22242225

2225-
blk = mgr.blocks[mgr._blknos[2]]
2226+
blk = mgr.blocks[mgr.blknos[2]]
22262227
assert blk.dtype == "m8[ns]" # we got the right block
22272228
val = blk.iget((0, 0))
22282229
assert isinstance(val, pd.Timedelta)

pandas/tests/frame/test_nonunique_indexes.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -474,8 +474,8 @@ def test_columns_with_dups(self):
474474
)
475475
df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
476476

477-
assert len(df._data._blknos) == len(df.columns)
478-
assert len(df._data._blklocs) == len(df.columns)
477+
assert len(df._data.blknos) == len(df.columns)
478+
assert len(df._data.blklocs) == len(df.columns)
479479

480480
# testing iloc
481481
for i in range(len(df.columns)):

pandas/tests/internals/test_internals.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,8 @@ def test_duplicate_ref_loc_failure(self):
309309
msg = "Gaps in blk ref_locs"
310310

311311
with pytest.raises(AssertionError, match=msg):
312-
BlockManager(blocks, axes)
312+
mgr = BlockManager(blocks, axes)
313+
mgr._rebuild_blknos_and_blklocs()
313314

314315
blocks[0].mgr_locs = np.array([0])
315316
blocks[1].mgr_locs = np.array([1])

0 commit comments

Comments
 (0)