Skip to content

Commit 69d7663

Browse files
authored
PERF: put BlockManager constructor in cython (#40842)
1 parent 5525561 commit 69d7663

File tree

11 files changed

+276
-146
lines changed

11 files changed

+276
-146
lines changed

pandas/_libs/internals.pyi

+13
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ from pandas._typing import (
1111
T,
1212
)
1313

14+
from pandas import Index
15+
from pandas.core.internals.blocks import Block as B
16+
1417
def slice_len(slc: slice, objlen: int = ...) -> int: ...
1518

1619

@@ -66,3 +69,13 @@ class NumpyBlock(SharedBlock):
6669

6770
class Block(SharedBlock):
6871
...
72+
73+
class BlockManager:
    # Attributes exposed by the Cython extension type.
    blocks: tuple[B, ...]
    axes: list[Index]
    _known_consolidated: bool
    _is_consolidated: bool
    # Lookup caches; the constructor leaves them unset and they are
    # populated lazily.
    _blknos: np.ndarray
    _blklocs: np.ndarray

    # Stub convention (see ``slice_len`` in this file): annotate every
    # parameter and elide default values with ``...``.
    def __init__(
        self, blocks: tuple[B, ...], axes: list[Index], verify_integrity: bool = ...
    ) -> None: ...

pandas/_libs/internals.pyx

+77
Original file line numberDiff line numberDiff line change
@@ -533,3 +533,80 @@ cdef class Block(SharedBlock):
533533
# set values here the (implicit) call to SharedBlock.__cinit__ will
534534
# set placement and ndim
535535
self.values = values
536+
537+
538+
@cython.freelist(64)
cdef class BlockManager:
    """
    Extension type holding the core state of a block manager.

    Construction only stores ``blocks`` and ``axes`` and resets the
    consolidation flags and the ``_blknos`` / ``_blklocs`` caches; no
    validation is performed here.
    """
    cdef:
        public tuple blocks
        public list axes
        public bint _known_consolidated, _is_consolidated
        public ndarray _blknos, _blklocs

    def __cinit__(self, blocks, axes, verify_integrity=True):
        # __cinit__ receives the same arguments as __init__, so
        # verify_integrity has to be accepted here even though it is unused.
        if isinstance(blocks, list):
            # Backward compat for e.g. pyarrow
            blocks = tuple(blocks)

        self.blocks = blocks
        # Copy to make sure we are not remotely-mutable. list() copies a list
        # exactly like .copy() would, and additionally accepts any other
        # sequence of axes instead of failing with AttributeError.
        self.axes = list(axes)

        # Populate known_consolidated, blknos, and blklocs lazily
        self._known_consolidated = False
        self._is_consolidated = False
        self._blknos = None
        self._blklocs = None

    # -------------------------------------------------------------------
    # Pickle

    cpdef __reduce__(self):
        """Return ``(type(self), args)`` so unpickling re-invokes the constructor."""
        if len(self.axes) == 1:
            # SingleBlockManager, __init__ expects Block, axis
            args = (self.blocks[0], self.axes[0])
        else:
            args = (self.blocks, self.axes)
        return type(self), args

    cpdef __setstate__(self, state):
        """
        Restore ``blocks`` and ``axes`` from a pickled state tuple.

        Only states whose fourth element carries a ``"0.14.1"`` entry are
        understood.

        Raises
        ------
        NotImplementedError
            If ``state`` is not in the ``"0.14.1"`` layout.
        """
        # Imported locally (presumably to avoid a circular import when this
        # extension module is loaded -- confirm before hoisting).
        from pandas.core.construction import extract_array
        from pandas.core.internals.blocks import (
            ensure_block_shape,
            new_block,
        )
        from pandas.core.internals.managers import ensure_index

        if not (
            isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]
        ):
            raise NotImplementedError("pre-0.14.1 pickles are no longer supported")

        state = state[3]["0.14.1"]
        axes = [ensure_index(ax) for ax in state["axes"]]
        ndim = len(axes)

        # Single pass; the unpickled state dicts are left untouched.
        nbs = []
        for blk in state["blocks"]:
            # older versions may hold e.g. DatetimeIndex instead of DTA
            vals = extract_array(blk["values"], extract_numpy=True)
            vals = ensure_block_shape(vals, ndim=ndim)
            nbs.append(new_block(vals, blk["mgr_locs"], ndim=ndim))

        self.blocks = tuple(nbs)
        self.axes = axes

        self._post_setstate()

    def _post_setstate(self) -> None:
        """Reset the consolidation flags and rebuild ``_blknos`` / ``_blklocs``."""
        self._is_consolidated = False
        self._known_consolidated = False
        self._rebuild_blknos_and_blklocs()

    # -------------------------------------------------------------------

pandas/_libs/reduction.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,6 @@ cdef class BlockSlider:
489489
Ensure that we have the original blocks, blknos, and blklocs.
490490
"""
491491
mgr = self.dummy._mgr
492-
mgr.blocks = self.blocks
492+
mgr.blocks = tuple(self.blocks)
493493
mgr._blklocs = self.orig_blklocs
494494
mgr._blknos = self.orig_blknos

pandas/compat/pickle_compat.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
PeriodArray,
2222
TimedeltaArray,
2323
)
24+
from pandas.core.internals import BlockManager
2425

2526
if TYPE_CHECKING:
2627
from pandas import (
@@ -222,7 +223,8 @@ def load_newobj(self):
222223
elif issubclass(cls, TimedeltaArray) and not args:
223224
arr = np.array([], dtype="m8[ns]")
224225
obj = cls.__new__(cls, arr, arr.dtype)
225-
226+
elif cls is BlockManager and not args:
227+
obj = cls.__new__(cls, (), [], False)
226228
else:
227229
obj = cls.__new__(cls, *args)
228230

pandas/core/internals/concat.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ def concatenate_managers(
225225
b = new_block(new_values, placement=placement, ndim=len(axes))
226226
blocks.append(b)
227227

228-
return BlockManager(blocks, axes)
228+
return BlockManager(tuple(blocks), axes)
229229

230230

231231
def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]):

pandas/core/internals/managers.py

+55-97
Original file line numberDiff line numberDiff line change
@@ -135,22 +135,16 @@ class BaseBlockManager(DataManager):
135135
This is *not* a public API class
136136
"""
137137

138-
__slots__ = [
139-
"axes",
140-
"blocks",
141-
"_known_consolidated",
142-
"_is_consolidated",
143-
"_blknos",
144-
"_blklocs",
145-
]
138+
__slots__ = ()
146139

147140
_blknos: np.ndarray
148141
_blklocs: np.ndarray
149142
blocks: tuple[Block, ...]
150143
axes: list[Index]
151144

152-
# Non-trivially faster than a property
153145
ndim: int
146+
_known_consolidated: bool
147+
_is_consolidated: bool
154148

155149
def __init__(self, blocks, axes, verify_integrity=True):
156150
raise NotImplementedError
@@ -276,57 +270,6 @@ def arrays(self) -> list[ArrayLike]:
276270
"""
277271
return [blk.values for blk in self.blocks]
278272

279-
def __getstate__(self):
280-
block_values = [b.values for b in self.blocks]
281-
block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
282-
axes_array = list(self.axes)
283-
284-
extra_state = {
285-
"0.14.1": {
286-
"axes": axes_array,
287-
"blocks": [
288-
{"values": b.values, "mgr_locs": b.mgr_locs.indexer}
289-
for b in self.blocks
290-
],
291-
}
292-
}
293-
294-
# First three elements of the state are to maintain forward
295-
# compatibility with 0.13.1.
296-
return axes_array, block_values, block_items, extra_state
297-
298-
def __setstate__(self, state):
299-
def unpickle_block(values, mgr_locs, ndim: int) -> Block:
300-
# TODO(EA2D): ndim would be unnecessary with 2D EAs
301-
# older pickles may store e.g. DatetimeIndex instead of DatetimeArray
302-
values = extract_array(values, extract_numpy=True)
303-
return new_block(values, placement=mgr_locs, ndim=ndim)
304-
305-
if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
306-
state = state[3]["0.14.1"]
307-
self.axes = [ensure_index(ax) for ax in state["axes"]]
308-
ndim = len(self.axes)
309-
310-
for blk in state["blocks"]:
311-
vals = blk["values"]
312-
# older versions may hold e.g. DatetimeIndex instead of DTA
313-
vals = extract_array(vals, extract_numpy=True)
314-
blk["values"] = ensure_block_shape(vals, ndim=ndim)
315-
316-
self.blocks = tuple(
317-
unpickle_block(b["values"], b["mgr_locs"], ndim=ndim)
318-
for b in state["blocks"]
319-
)
320-
else:
321-
raise NotImplementedError("pre-0.14.1 pickles are no longer supported")
322-
323-
self._post_setstate()
324-
325-
def _post_setstate(self) -> None:
326-
self._is_consolidated = False
327-
self._known_consolidated = False
328-
self._rebuild_blknos_and_blklocs()
329-
330273
def __repr__(self) -> str:
331274
output = type(self).__name__
332275
for i, ax in enumerate(self.axes):
@@ -823,7 +766,7 @@ def consolidate(self: T) -> T:
823766
if self.is_consolidated():
824767
return self
825768

826-
bm = type(self)(self.blocks, self.axes)
769+
bm = type(self)(self.blocks, self.axes, verify_integrity=False)
827770
bm._is_consolidated = False
828771
bm._consolidate_inplace()
829772
return bm
@@ -1079,7 +1022,7 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
10791022
)
10801023

10811024

1082-
class BlockManager(BaseBlockManager):
1025+
class BlockManager(libinternals.BlockManager, BaseBlockManager):
10831026
"""
10841027
BaseBlockManager that holds 2D blocks.
10851028
"""
@@ -1095,27 +1038,18 @@ def __init__(
10951038
axes: Sequence[Index],
10961039
verify_integrity: bool = True,
10971040
):
1098-
self.axes = [ensure_index(ax) for ax in axes]
1099-
self.blocks: tuple[Block, ...] = tuple(blocks)
1100-
1101-
for block in blocks:
1102-
if self.ndim != block.ndim:
1103-
raise AssertionError(
1104-
f"Number of Block dimensions ({block.ndim}) must equal "
1105-
f"number of axes ({self.ndim})"
1106-
)
11071041

11081042
if verify_integrity:
1109-
self._verify_integrity()
1043+
assert all(isinstance(x, Index) for x in axes)
11101044

1111-
# Populate known_consolidate, blknos, and blklocs lazily
1112-
self._known_consolidated = False
1113-
# error: Incompatible types in assignment (expression has type "None",
1114-
# variable has type "ndarray")
1115-
self._blknos = None # type: ignore[assignment]
1116-
# error: Incompatible types in assignment (expression has type "None",
1117-
# variable has type "ndarray")
1118-
self._blklocs = None # type: ignore[assignment]
1045+
for block in blocks:
1046+
if self.ndim != block.ndim:
1047+
raise AssertionError(
1048+
f"Number of Block dimensions ({block.ndim}) must equal "
1049+
f"number of axes ({self.ndim})"
1050+
)
1051+
1052+
self._verify_integrity()
11191053

11201054
def _verify_integrity(self) -> None:
11211055
mgr_shape = self.shape
@@ -1130,21 +1064,6 @@ def _verify_integrity(self) -> None:
11301064
f"tot_items: {tot_items}"
11311065
)
11321066

1133-
@classmethod
1134-
def _simple_new(cls, blocks: tuple[Block, ...], axes: list[Index]):
1135-
"""
1136-
Fastpath constructor; does NO validation.
1137-
"""
1138-
obj = cls.__new__(cls)
1139-
obj.axes = axes
1140-
obj.blocks = blocks
1141-
1142-
# Populate known_consolidate, blknos, and blklocs lazily
1143-
obj._known_consolidated = False
1144-
obj._blknos = None
1145-
obj._blklocs = None
1146-
return obj
1147-
11481067
@classmethod
11491068
def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> BlockManager:
11501069
"""
@@ -1210,7 +1129,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
12101129
new_axes = list(self.axes)
12111130
new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
12121131

1213-
return type(self)._simple_new(tuple(new_blocks), new_axes)
1132+
return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)
12141133

12151134
def iget(self, i: int) -> SingleBlockManager:
12161135
"""
@@ -1418,7 +1337,7 @@ def idelete(self, indexer) -> BlockManager:
14181337
nbs = self._slice_take_blocks_ax0(taker, only_slice=True)
14191338
new_columns = self.items[~is_deleted]
14201339
axes = [new_columns, self.axes[1]]
1421-
return type(self)._simple_new(tuple(nbs), axes)
1340+
return type(self)(tuple(nbs), axes)
14221341

14231342
# ----------------------------------------------------------------
14241343
# Block-wise Operation
@@ -1602,6 +1521,45 @@ def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
16021521
block = new_block(array, placement=slice(0, len(index)), ndim=1)
16031522
return cls(block, index)
16041523

1524+
def __getstate__(self):
    """
    Build the pickle state tuple.

    Returns ``(axes, block_values, block_items, extra_state)`` where
    ``extra_state["0.14.1"]`` holds the axes plus one
    ``{"values", "mgr_locs"}`` dict per block.
    """
    values_per_block = []
    items_per_block = []
    block_records = []
    for blk in self.blocks:
        indexer = blk.mgr_locs.indexer
        values = blk.values
        values_per_block.append(values)
        items_per_block.append(self.items[indexer])
        block_records.append({"values": values, "mgr_locs": indexer})

    axes_list = list(self.axes)
    versioned = {"0.14.1": {"axes": axes_list, "blocks": block_records}}

    # First three elements of the state are to maintain forward
    # compatibility with 0.13.1.
    return axes_list, values_per_block, items_per_block, versioned
1542+
1543+
def __setstate__(self, state):
    """
    Restore ``axes`` and ``blocks`` from a pickled state tuple.

    Only states whose fourth element carries a ``"0.14.1"`` entry are
    accepted; anything else raises ``NotImplementedError``.
    """
    is_supported = (
        isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]
    )
    if not is_supported:
        raise NotImplementedError("pre-0.14.1 pickles are no longer supported")

    payload = state[3]["0.14.1"]
    self.axes = [ensure_index(ax) for ax in payload["axes"]]
    ndim = len(self.axes)

    restored = []
    for entry in payload["blocks"]:
        # TODO(EA2D): ndim would be unnecessary with 2D EAs
        # older pickles may store e.g. DatetimeIndex instead of DatetimeArray
        values = extract_array(entry["values"], extract_numpy=True)
        restored.append(new_block(values, placement=entry["mgr_locs"], ndim=ndim))
    self.blocks = tuple(restored)

    self._post_setstate()
1562+
16051563
def _post_setstate(self):
    """Hook called at the end of ``__setstate__``; does nothing here."""
    return None
16071565

pandas/core/internals/ops.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def operate_blockwise(
7676
# assert len(slocs) == nlocs, (len(slocs), nlocs)
7777
# assert slocs == set(range(nlocs)), slocs
7878

79-
new_mgr = type(right)(res_blks, axes=right.axes, verify_integrity=False)
79+
new_mgr = type(right)(tuple(res_blks), axes=right.axes, verify_integrity=False)
8080
return new_mgr
8181

8282

0 commit comments

Comments
 (0)