Skip to content

PERF: lazify blknos and blklocs #32261

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,17 @@ def get_mgr_concatenation_plan(mgr, indexers):

if 0 in indexers:
ax0_indexer = indexers.pop(0)
blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1)
blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1)
blknos = algos.take_1d(mgr.blknos, ax0_indexer, fill_value=-1)
blklocs = algos.take_1d(mgr.blklocs, ax0_indexer, fill_value=-1)
else:

if mgr._is_single_block:
blk = mgr.blocks[0]
return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]

ax0_indexer = None
blknos = mgr._blknos
blklocs = mgr._blklocs
blknos = mgr.blknos
blklocs = mgr.blklocs

plan = []
for blkno, placements in libinternals.get_blkno_placements(blknos, group=False):
Expand Down
59 changes: 46 additions & 13 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,37 @@ def __init__(
if do_integrity_check:
self._verify_integrity()

# Populate known_consolidated, blknos, and blklocs lazily
self._known_consolidated = False
self._blknos = None
self._blklocs = None

self._rebuild_blknos_and_blklocs()
@property
def blknos(self):
"""
Suppose we want to find the array corresponding to our i'th column.

blknos[i] identifies the block from self.blocks that contains this column.

blklocs[i] identifies the column of interest within
self.blocks[self.blknos[i]]
"""
if self._blknos is None:
# Note: these can be altered by other BlockManager methods.
self._rebuild_blknos_and_blklocs()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make this more transparent, I would change the return signature of _rebuild_blknos_and_blklocs to return a tuple of (blknos, blklocs) and set them here. That makes it much clearer where these attributes get assigned.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That turns out to get messy because of the way we update these in place in some other methods, so I'd like to skip this for now and revisit it in the upcoming even-lazier pass.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok.


return self._blknos

@property
def blklocs(self):
"""
Positions of each column within its owning block.

blklocs[i] identifies the column of interest within
self.blocks[self.blknos[i]].

See blknos.__doc__
"""
if self._blklocs is None:
# Cache not built yet, so build it on first access.
# Note: these can be altered by other BlockManager methods.
self._rebuild_blknos_and_blklocs()

return self._blklocs

def make_empty(self, axes=None):
""" return an empty BlockManager with the items axis of len 0 """
Expand Down Expand Up @@ -227,6 +255,7 @@ def _rebuild_blknos_and_blklocs(self):
new_blklocs[rl.indexer] = np.arange(len(rl))

if (new_blknos == -1).any():
# TODO: can we avoid this? it isn't cheap
raise AssertionError("Gaps in blk ref_locs")

self._blknos = new_blknos
Expand All @@ -250,7 +279,7 @@ def get_dtype_counts(self):

def get_dtypes(self):
dtypes = np.array([blk.dtype for blk in self.blocks])
return algos.take_1d(dtypes, self._blknos, allow_fill=False)
return algos.take_1d(dtypes, self.blknos, allow_fill=False)

def __getstate__(self):
block_values = [b.values for b in self.blocks]
Expand Down Expand Up @@ -944,8 +973,8 @@ def iget(self, i: int) -> "SingleBlockManager":
"""
Return the data as a SingleBlockManager.
"""
block = self.blocks[self._blknos[i]]
values = block.iget(self._blklocs[i])
block = self.blocks[self.blknos[i]]
values = block.iget(self.blklocs[i])

# shortcut for select a single-dim from a 2-dim BM
return SingleBlockManager(
Expand Down Expand Up @@ -973,7 +1002,7 @@ def delete(self, item):
else:
affected_start = is_deleted.nonzero()[0][0]

for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]):
for blkno, _ in _fast_count_smallints(self.blknos[affected_start:]):
blk = self.blocks[blkno]
bml = blk.mgr_locs
blk_del = is_deleted[bml.indexer].nonzero()[0]
Expand Down Expand Up @@ -1002,6 +1031,8 @@ def set(self, item, value):
"""
# FIXME: refactor, clearly separate broadcasting & zip-like assignment
# can prob also fix the various if tests for sparse/categorical
if self._blklocs is None and self.ndim > 1:
self._rebuild_blknos_and_blklocs()

value_is_extension_type = is_extension_array_dtype(value)

Expand Down Expand Up @@ -1038,8 +1069,9 @@ def value_getitem(placement):
if isinstance(loc, int):
loc = [loc]

blknos = self._blknos[loc]
blklocs = self._blklocs[loc].copy()
# Accessing the public blknos/blklocs properties ensures the private
# _blknos and _blklocs arrays are initialized
blknos = self.blknos[loc]
blklocs = self.blklocs[loc].copy()

unfit_mgr_locs = []
unfit_val_locs = []
Expand Down Expand Up @@ -1141,7 +1173,7 @@ def insert(self, loc: int, item, value, allow_duplicates: bool = False):

block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1))

for blkno, count in _fast_count_smallints(self._blknos[loc:]):
for blkno, count in _fast_count_smallints(self.blknos[loc:]):
blk = self.blocks[blkno]
if count == len(blk.mgr_locs):
blk.mgr_locs = blk.mgr_locs.add(1)
Expand All @@ -1150,7 +1182,8 @@ def insert(self, loc: int, item, value, allow_duplicates: bool = False):
new_mgr_locs[new_mgr_locs >= loc] += 1
blk.mgr_locs = new_mgr_locs

if loc == self._blklocs.shape[0]:
# Accessing the public blklocs property ensures the private _blklocs and
# _blknos arrays are initialized before they are updated below
if loc == self.blklocs.shape[0]:
# np.append is a lot faster, let's use it if we can.
self._blklocs = np.append(self._blklocs, 0)
self._blknos = np.append(self._blknos, len(self.blocks))
Expand Down Expand Up @@ -1268,14 +1301,14 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
]

if sl_type in ("slice", "mask"):
blknos = self._blknos[slobj]
blklocs = self._blklocs[slobj]
blknos = self.blknos[slobj]
blklocs = self.blklocs[slobj]
else:
blknos = algos.take_1d(
self._blknos, slobj, fill_value=-1, allow_fill=allow_fill
self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
)
blklocs = algos.take_1d(
self._blklocs, slobj, fill_value=-1, allow_fill=allow_fill
self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
)

# When filling blknos, make sure blknos is updated before appending to
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2209,16 +2209,17 @@ def test_object_casting_indexing_wraps_datetimelike():
assert isinstance(ser.values[2], pd.Timedelta)

mgr = df._data
mgr._rebuild_blknos_and_blklocs()
arr = mgr.fast_xs(0)
assert isinstance(arr[1], pd.Timestamp)
assert isinstance(arr[2], pd.Timedelta)

blk = mgr.blocks[mgr._blknos[1]]
blk = mgr.blocks[mgr.blknos[1]]
assert blk.dtype == "M8[ns]" # we got the right block
val = blk.iget((0, 0))
assert isinstance(val, pd.Timestamp)

blk = mgr.blocks[mgr._blknos[2]]
blk = mgr.blocks[mgr.blknos[2]]
assert blk.dtype == "m8[ns]" # we got the right block
val = blk.iget((0, 0))
assert isinstance(val, pd.Timedelta)
4 changes: 2 additions & 2 deletions pandas/tests/frame/test_nonunique_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,8 +474,8 @@ def test_columns_with_dups(self):
)
df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

assert len(df._data._blknos) == len(df.columns)
assert len(df._data._blklocs) == len(df.columns)
assert len(df._data.blknos) == len(df.columns)
assert len(df._data.blklocs) == len(df.columns)

# testing iloc
for i in range(len(df.columns)):
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,8 @@ def test_duplicate_ref_loc_failure(self):
msg = "Gaps in blk ref_locs"

with pytest.raises(AssertionError, match=msg):
BlockManager(blocks, axes)
mgr = BlockManager(blocks, axes)
mgr._rebuild_blknos_and_blklocs()

blocks[0].mgr_locs = np.array([0])
blocks[1].mgr_locs = np.array([1])
Expand Down