From 48b913fb94b518aa61d77197e2eb071da8368adf Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 07:59:18 -0800 Subject: [PATCH 1/6] PERF: calling BlockPlacement --- pandas/_libs/internals.pyx | 16 +++++++++------- pandas/core/internals/blocks.py | 2 +- pandas/core/internals/construction.py | 2 +- pandas/core/internals/managers.py | 3 +-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 150b7f62b4b26..9b2db897cdd32 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -15,11 +15,13 @@ cimport numpy as cnp from numpy cimport ( NPY_INT64, int64_t, + ndarray, ) cnp.import_array() from pandas._libs.algos import ensure_int64 +from pandas._libs.util cimport is_integer_object @cython.final @@ -27,10 +29,10 @@ cdef class BlockPlacement: # __slots__ = '_as_slice', '_as_array', '_len' cdef: slice _as_slice - object _as_array + ndarray _as_array # Note: this still allows `None` bint _has_slice, _has_array, _is_known_slice_like - def __init__(self, val): + def __cinit__(self, val): cdef: slice slc @@ -39,7 +41,7 @@ cdef class BlockPlacement: self._has_slice = False self._has_array = False - if isinstance(val, int): + if is_integer_object(val): slc = slice(val, val + 1, 1) self._as_slice = slc self._has_slice = True @@ -160,12 +162,12 @@ cdef class BlockPlacement: np.concatenate([self.as_array] + [o.as_array for o in others]) ) - cdef iadd(self, other): + cdef BlockPlacement iadd(self, other): cdef: slice s = self._ensure_has_slice() Py_ssize_t other_int, start, stop, step, l - if isinstance(other, int) and s is not None: + if is_integer_object(other) and s is not None: other_int = other if other_int == 0: @@ -438,13 +440,13 @@ def get_blkno_placements(blknos, group: bool = True): """ Parameters ---------- - blknos : array of int64 + blknos : np.ndarray[int64] group : bool, default True Returns ------- iterator - yield (BlockPlacement, blkno) + yield (blkno, BlockPlacement) """ blknos = ensure_int64(blknos) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f2b8499a316b7..45e2cb670ef2b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -447,7 +447,7 @@ def _split_op_result(self, result) -> List[Block]: nbs = [] for i, loc in enumerate(self.mgr_locs): vals = result[i] - block = self.make_block(values=vals, placement=[loc]) + block = self.make_block(values=vals, placement=loc) nbs.append(block) return nbs diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 814fd389bf644..5b0a3b35a0a1b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -300,7 +300,7 @@ def ndarray_to_mgr( # TODO: What about re-joining object columns? dvals_list = [maybe_squeeze_dt64tz(x) for x in dvals_list] block_values = [ - make_block(dvals_list[n], placement=[n], ndim=2) + make_block(dvals_list[n], placement=n, ndim=2) for n in range(len(dvals_list)) ] diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2ad7471d6f086..7b5b3ff03b58a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -557,8 +557,7 @@ def quantile( if transposed: new_axes = new_axes[::-1] blocks = [ - b.make_block(b.values.T, placement=np.arange(b.shape[1])) - for b in blocks + b.make_block(b.values.T, placement=slice(0, b.shape[1])) for b in blocks ] return type(self)(blocks, new_axes) From d801c15f8ab3398dcc231821337d7edc0202ac24 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 09:31:50 -0800 Subject: [PATCH 2/6] PERF: Index.__getitem__ for slices --- pandas/core/indexes/base.py | 7 ++++--- pandas/core/indexes/multi.py | 13 +++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 44c9b33ae51c7..2c4fed50bff95 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4534,7 +4534,6 @@ def __getitem__(self, key): # There's no custom logic to be implemented in __getslice__, so it's # not overloaded intentionally. getitem = self._data.__getitem__ - promote = self._shallow_copy if is_scalar(key): key = com.cast_scalar_indexer(key, warn_float=True) @@ -4543,7 +4542,9 @@ def __getitem__(self, key): if isinstance(key, slice): # This case is separated from the conditional above to avoid # pessimization of basic indexing. - return promote(getitem(key)) + result = getitem(key) + # Going through simple_new for performance. + return type(self)._simple_new(result, name=self.name) if com.is_bool_indexer(key): key = np.asarray(key, dtype=bool) @@ -4553,7 +4554,7 @@ def __getitem__(self, key): if np.ndim(result) > 1: deprecate_ndim_indexing(result) return result - return promote(result) + return self._shallow_copy(result) else: return result diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 88b92c7b304ae..fd0e0ef5fa799 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2076,15 +2076,16 @@ def __getitem__(self, key): return tuple(retval) else: + # in general cannot be sure whether the result will be sorted + sortorder = None if com.is_bool_indexer(key): key = np.asarray(key, dtype=bool) sortorder = self.sortorder - else: - # cannot be sure whether the result will be sorted - sortorder = None - - if isinstance(key, Index): - key = np.asarray(key) + elif isinstance(key, slice): + if key.step is None or key.step > 0: + sortorder = self.sortorder + elif isinstance(key, Index): + key = np.asarray(key) new_codes = [level_codes[key] for level_codes in self.codes] From ebfd3a1f10b0c4a49a7ae9fb6704a30ad01aedb5 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 09:42:30 -0800 Subject: [PATCH 3/6] PERF: BlockManager.ndim --- pandas/core/internals/managers.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7b5b3ff03b58a..435a9d35301e3 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -146,6 +146,9 @@ class BlockManager(DataManager): _blknos: np.ndarray _blklocs: np.ndarray + # Non-trivially faster than a property + ndim = 2 # overridden by SingleBlockManager + def __init__( self, blocks: Sequence[Block], @@ -230,10 +233,6 @@ def __nonzero__(self) -> bool: def shape(self) -> Shape: return tuple(len(ax) for ax in self.axes) - @property - def ndim(self) -> int: - return len(self.axes) - def set_axis( self, axis: int, new_labels: Index, verify_integrity: bool = True ) -> None: From 9c2c4d3175b63bc72e48e14043edbfc44b6ce1bf Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 15:33:20 -0800 Subject: [PATCH 4/6] _simple_new perf --- pandas/core/indexes/base.py | 4 +++- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/range.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2c4fed50bff95..d4ee7b1868184 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4554,7 +4554,9 @@ def __getitem__(self, key): if np.ndim(result) > 1: deprecate_ndim_indexing(result) return result - return self._shallow_copy(result) + # NB: Using _constructor._simple_new would break if MultiIndex + # didn't override __getitem__ + return self._constructor._simple_new(result, name=self.name) else: return result diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 13c53dfafed4d..7a204dcce8a88 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -231,7 +231,7 @@ def _simple_new(cls, values: Categorical, name: Optional[Hashable] = None): result = object.__new__(cls) result._data = values - result.name = name + result._name = name result._cache = {} result._reset_identity() diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 67af4b628e552..40d5596ddb296 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -164,7 +164,7 @@ def _simple_new(cls, values: range, name: Hashable = None) -> RangeIndex: assert isinstance(values, range) result._range = values - result.name = name + result._name = name result._cache = {} result._reset_identity() return result From 7b69805055e1b0c61179a15a9a1800f438f0e23f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 08:52:31 -0800 Subject: [PATCH 5/6] PERF: BlockManger._simple_new --- pandas/core/internals/managers.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 6a49f1d396cd2..22c0f4fb74125 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -176,6 +176,21 @@ def __init__( self._blknos = None self._blklocs = None + @classmethod + def _simple_new(cls, blocks: Tuple[Block], axes: List[Index]): + """ + Fastpath constructor; does NO validation. + """ + obj = cls.__new__(cls) + obj.axes = axes + obj.blocks = blocks + + # Populate known_consolidate, blknos, and blklocs lazily + obj._known_consolidated = False + obj._blknos = None + obj._blklocs = None + return obj + @classmethod def from_blocks(cls, blocks: List[Block], axes: List[Index]): """ @@ -799,8 +814,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: new_axes = list(self.axes) new_axes[axis] = new_axes[axis][slobj] - bm = type(self)(new_blocks, new_axes, verify_integrity=False) - return bm + return type(self)._simple_new(tuple(new_blocks), new_axes) @property def nblocks(self) -> int: From 1dfedc4441ab1ae867b0b45203d3a42cc9d4f672 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 10:49:00 -0800 Subject: [PATCH 6/6] mypy fixup --- pandas/core/internals/managers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 22c0f4fb74125..b656c9e83e1a8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -177,7 +177,7 @@ def __init__( self._blklocs = None @classmethod - def _simple_new(cls, blocks: Tuple[Block], axes: List[Index]): + def _simple_new(cls, blocks: Tuple[Block, ...], axes: List[Index]): """ Fastpath constructor; does NO validation. """ @@ -1335,7 +1335,7 @@ def reindex_indexer( def _slice_take_blocks_ax0( self, slice_or_indexer, fill_value=lib.no_default, only_slice: bool = False - ): + ) -> List[Block]: """ Slice/take blocks along axis=0.