Skip to content

PERF: BlockPlacement construction #40227

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions pandas/_libs/internals.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,24 @@ cimport numpy as cnp
from numpy cimport (
NPY_INT64,
int64_t,
ndarray,
)

cnp.import_array()

from pandas._libs.algos import ensure_int64
from pandas._libs.util cimport is_integer_object


@cython.final
cdef class BlockPlacement:
# __slots__ = '_as_slice', '_as_array', '_len'
cdef:
slice _as_slice
object _as_array
ndarray _as_array # Note: this still allows `None`
bint _has_slice, _has_array, _is_known_slice_like

def __init__(self, val):
def __cinit__(self, val):
cdef:
slice slc

Expand All @@ -39,7 +41,7 @@ cdef class BlockPlacement:
self._has_slice = False
self._has_array = False

if isinstance(val, int):
if is_integer_object(val):
slc = slice(val, val + 1, 1)
self._as_slice = slc
self._has_slice = True
Expand Down Expand Up @@ -160,12 +162,12 @@ cdef class BlockPlacement:
np.concatenate([self.as_array] + [o.as_array for o in others])
)

cdef iadd(self, other):
cdef BlockPlacement iadd(self, other):
cdef:
slice s = self._ensure_has_slice()
Py_ssize_t other_int, start, stop, step, l

if isinstance(other, int) and s is not None:
if is_integer_object(other) and s is not None:
other_int = <Py_ssize_t>other

if other_int == 0:
Expand Down Expand Up @@ -438,13 +440,13 @@ def get_blkno_placements(blknos, group: bool = True):
"""
Parameters
----------
blknos : array of int64
blknos : np.ndarray[int64]
group : bool, default True

Returns
-------
iterator
yield (BlockPlacement, blkno)
yield (blkno, BlockPlacement)
"""
blknos = ensure_int64(blknos)

Expand Down
9 changes: 6 additions & 3 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4534,7 +4534,6 @@ def __getitem__(self, key):
# There's no custom logic to be implemented in __getslice__, so it's
# not overloaded intentionally.
getitem = self._data.__getitem__
promote = self._shallow_copy

if is_scalar(key):
key = com.cast_scalar_indexer(key, warn_float=True)
Expand All @@ -4543,7 +4542,9 @@ def __getitem__(self, key):
if isinstance(key, slice):
# This case is separated from the conditional above to avoid
# pessimization of basic indexing.
return promote(getitem(key))
result = getitem(key)
# Going through _simple_new for performance.
return type(self)._simple_new(result, name=self.name)

if com.is_bool_indexer(key):
key = np.asarray(key, dtype=bool)
Expand All @@ -4553,7 +4554,9 @@ def __getitem__(self, key):
if np.ndim(result) > 1:
deprecate_ndim_indexing(result)
return result
return promote(result)
# NB: Using _constructor._simple_new would break if MultiIndex
# didn't override __getitem__
return self._constructor._simple_new(result, name=self.name)
else:
return result

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def _simple_new(cls, values: Categorical, name: Optional[Hashable] = None):
result = object.__new__(cls)

result._data = values
result.name = name
result._name = name
result._cache = {}

result._reset_identity()
Expand Down
13 changes: 7 additions & 6 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2076,15 +2076,16 @@ def __getitem__(self, key):

return tuple(retval)
else:
# in general cannot be sure whether the result will be sorted
sortorder = None
if com.is_bool_indexer(key):
key = np.asarray(key, dtype=bool)
sortorder = self.sortorder
else:
# cannot be sure whether the result will be sorted
sortorder = None

if isinstance(key, Index):
key = np.asarray(key)
elif isinstance(key, slice):
if key.step is None or key.step > 0:
sortorder = self.sortorder
elif isinstance(key, Index):
key = np.asarray(key)

new_codes = [level_codes[key] for level_codes in self.codes]

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def _simple_new(cls, values: range, name: Hashable = None) -> RangeIndex:
assert isinstance(values, range)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@simonjayhawkins are the mypy checks thorough/reliable enough that we can safely remove these assertions?


result._range = values
result.name = name
result._name = name
result._cache = {}
result._reset_identity()
return result
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ def _split_op_result(self, result) -> List[Block]:
nbs = []
for i, loc in enumerate(self.mgr_locs):
vals = result[i]
block = self.make_block(values=vals, placement=[loc])
block = self.make_block(values=vals, placement=loc)
nbs.append(block)
return nbs

Expand Down
27 changes: 20 additions & 7 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ class BlockManager(DataManager):
_blknos: np.ndarray
_blklocs: np.ndarray

# Non-trivially faster than a property
ndim = 2 # overridden by SingleBlockManager

def __init__(
self,
blocks: Sequence[Block],
Expand All @@ -173,6 +176,21 @@ def __init__(
self._blknos = None
self._blklocs = None

@classmethod
def _simple_new(cls, blocks: Tuple[Block, ...], axes: List[Index]):
"""
Fastpath constructor; does NO validation.

The instance is allocated with ``cls.__new__`` (so ``__init__`` is not
run) and ``blocks`` and ``axes`` are stored on it exactly as passed.

Parameters
----------
blocks : tuple of Block
axes : list of Index

Returns
-------
instance of ``cls``
"""
# Allocate without going through __init__; nothing below checks the
# blocks against the axes.
obj = cls.__new__(cls)
obj.axes = axes
obj.blocks = blocks

# Populate _known_consolidated, _blknos, and _blklocs lazily
obj._known_consolidated = False
obj._blknos = None
obj._blklocs = None
return obj

@classmethod
def from_blocks(cls, blocks: List[Block], axes: List[Index]):
"""
Expand Down Expand Up @@ -233,10 +251,6 @@ def __nonzero__(self) -> bool:
def shape(self) -> Shape:
return tuple(len(ax) for ax in self.axes)

@property
def ndim(self) -> int:
return len(self.axes)

def _normalize_axis(self, axis):
# switch axis to follow BlockManager logic
if self.ndim == 2:
Expand Down Expand Up @@ -800,8 +814,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
new_axes = list(self.axes)
new_axes[axis] = new_axes[axis][slobj]

bm = type(self)(new_blocks, new_axes, verify_integrity=False)
return bm
return type(self)._simple_new(tuple(new_blocks), new_axes)

@property
def nblocks(self) -> int:
Expand Down Expand Up @@ -1322,7 +1335,7 @@ def reindex_indexer(

def _slice_take_blocks_ax0(
self, slice_or_indexer, fill_value=lib.no_default, only_slice: bool = False
):
) -> List[Block]:
"""
Slice/take blocks along axis=0.

Expand Down