From 3d84ddb05ad5f25aaea4d6400b8d6502fb98843e Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 2 Sep 2021 21:48:09 -0700 Subject: [PATCH 1/3] REF: reindex_indexer use np.void to avoid JoinUnit --- pandas/core/internals/concat.py | 44 +++++++++++++++++++++++++++---- pandas/core/internals/managers.py | 31 +++++++++++++++++++--- 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 6b41d7a26080d..224641a6ea40c 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -198,6 +198,27 @@ def concatenate_managers( if isinstance(mgrs_indexers[0][0], ArrayManager): return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) + new_mgrs_indexers = [] + for mgr, indexers in mgrs_indexers: + # We only reindex for axis=0 (i.e. columns), as this can be done cheaply + if 0 in indexers: + new_mgr = mgr.reindex_indexer( + axes[0], + indexers[0], + axis=0, + copy=False, + only_slice=True, + allow_dups=True, + use_na_proxy=True, + ) + new_indexers = indexers.copy() + del new_indexers[0] + new_mgrs_indexers.append((new_mgr, new_indexers)) + else: + new_mgrs_indexers.append((mgr, indexers)) + + mgrs_indexers = new_mgrs_indexers + concat_plans = [ _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers ] @@ -375,6 +396,8 @@ def _is_valid_na_for(self, dtype: DtypeObj) -> bool: return False if self.block is None: return True + if self.block.dtype.kind == "V": + return True if self.dtype == object: values = self.block.values @@ -401,6 +424,8 @@ def is_na(self) -> bool: blk = self.block if blk is None: return True + if blk.dtype.kind == "V": + return True if not blk._can_hold_na: return False @@ -426,7 +451,7 @@ def is_na(self) -> bool: return all(isna_all(row) for row in values) def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: - if upcasted_na is None: + if upcasted_na is None and self.block.dtype.kind != "V": # No upcasting is necessary fill_value = self.block.fill_value values = self.block.get_values() @@ -435,6 +460,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if self._is_valid_na_for(empty_dtype): # note: always holds when self.block is None + # or self.block.dtype.kind == "V" blk_dtype = getattr(self.block, "dtype", None) if blk_dtype == np.dtype("object"): @@ -512,7 +538,9 @@ def _concatenate_join_units( empty_dtype = _get_empty_dtype(join_units) - has_none_blocks = any(unit.block is None for unit in join_units) + has_none_blocks = any( + unit.block is None or unit.block.dtype.kind == "V" for unit in join_units + ) upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks) to_concat = [ @@ -597,13 +625,19 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: empty_dtype = join_units[0].block.dtype return empty_dtype - has_none_blocks = any(unit.block is None for unit in join_units) + has_none_blocks = any( + unit.block is None or unit.block.dtype.kind == "V" for unit in join_units + ) dtypes = [ unit.dtype for unit in join_units if unit.block is not None and not unit.is_na ] if not len(dtypes): - dtypes = [unit.dtype for unit in join_units if unit.block is not None] + dtypes = [ + unit.dtype + for unit in join_units + if unit.block is not None and unit.block.dtype.kind != "V" + ] dtype = find_common_type(dtypes) if has_none_blocks: @@ -619,7 +653,7 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: """ first = join_units[0].block - if first is None: + if first is None or first.dtype.kind == "V": return False return ( # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64 diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 874065b50037f..a9894ab5acf23 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -66,6 +66,7 @@ from pandas.core.internals.blocks import ( Block, DatetimeTZBlock, + NumpyBlock, ensure_block_shape, extend_blocks, get_block_type, @@ -613,6 +614,8 @@ def reindex_indexer( copy: bool = True, consolidate: bool = True, only_slice: bool = False, + *, + use_na_proxy: bool = False, ) -> T: """ Parameters @@ -627,6 +630,8 @@ def reindex_indexer( Whether to consolidate inplace before reindexing. only_slice : bool, default False Whether to take views, not copies, along columns. + use_na_proxy : bool, default False + Whether to use a np.void ndarray for newly introduced columns. pandas-indexer with -1's only. """ @@ -651,7 +656,10 @@ def reindex_indexer( if axis == 0: new_blocks = self._slice_take_blocks_ax0( - indexer, fill_value=fill_value, only_slice=only_slice + indexer, + fill_value=fill_value, + only_slice=only_slice, + use_na_proxy=use_na_proxy, ) else: new_blocks = [ @@ -675,6 +683,8 @@ def _slice_take_blocks_ax0( slice_or_indexer: slice | np.ndarray, fill_value=lib.no_default, only_slice: bool = False, + *, + use_na_proxy: bool = False, ) -> list[Block]: """ Slice/take blocks along axis=0. @@ -688,6 +698,8 @@ def _slice_take_blocks_ax0( only_slice : bool, default False If True, we always return views on existing arrays, never copies. This is used when called from ops.blockwise.operate_blockwise. + use_na_proxy : bool, default False + Whether to use a np.void ndarray for newly introduced columns. Returns ------- @@ -756,7 +768,11 @@ def _slice_take_blocks_ax0( # If we've got here, fill_value was not lib.no_default blocks.append( - self._make_na_block(placement=mgr_locs, fill_value=fill_value) + self._make_na_block( + placement=mgr_locs, + fill_value=fill_value, + use_na_proxy=use_na_proxy, + ) ) else: blk = self.blocks[blkno] @@ -798,7 +814,16 @@ def _slice_take_blocks_ax0( return blocks - def _make_na_block(self, placement: BlockPlacement, fill_value=None) -> Block: + def _make_na_block( + self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False + ) -> Block: + + if use_na_proxy: + assert fill_value is None + shape = (len(placement), self.shape[1]) + vals = np.empty(shape, dtype=np.void) + nb = NumpyBlock(vals, placement, ndim=2) + return nb if fill_value is None: fill_value = np.nan From 7b7002f15f3b0aa1f6a7bb66e8bd4627190b04b2 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 4 Sep 2021 17:59:55 -0700 Subject: [PATCH 2/3] refactor out _reindex_columns_void --- pandas/core/internals/concat.py | 53 ++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 224641a6ea40c..ee6722ae6ee85 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -198,26 +198,7 @@ def concatenate_managers( if isinstance(mgrs_indexers[0][0], ArrayManager): return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) - new_mgrs_indexers = [] - for mgr, indexers in mgrs_indexers: - # We only reindex for axis=0 (i.e. columns), as this can be done cheaply - if 0 in indexers: - new_mgr = mgr.reindex_indexer( - axes[0], - indexers[0], - axis=0, - copy=False, - only_slice=True, - allow_dups=True, - use_na_proxy=True, - ) - new_indexers = indexers.copy() - del new_indexers[0] - new_mgrs_indexers.append((new_mgr, new_indexers)) - else: - new_mgrs_indexers.append((mgr, indexers)) - - mgrs_indexers = new_mgrs_indexers + mgrs_indexers = _reindex_columns_void(axes, mgrs_indexers) concat_plans = [ _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers @@ -266,6 +247,38 @@ def concatenate_managers( return BlockManager(tuple(blocks), axes) +def _reindex_columns_void( + axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] +) -> list[tuple[BlockManager, dict[int, np.ndarray]]]: + """ + Reindex along columns so that all of the BlockManagers being concatenated + have matching columns. + + Columns added in this reindexing have dtype=np.void, indicating they + should be ignored when choosing a column's final dtype. + """ + new_mgrs_indexers = [] + for mgr, indexers in mgrs_indexers: + # We only reindex for axis=0 (i.e. columns), as this can be done cheaply + if 0 in indexers: + new_mgr = mgr.reindex_indexer( + axes[0], + indexers[0], + axis=0, + copy=False, + only_slice=True, + allow_dups=True, + use_na_proxy=True, + ) + new_indexers = indexers.copy() + del new_indexers[0] + new_mgrs_indexers.append((new_mgr, new_indexers)) + else: + new_mgrs_indexers.append((mgr, indexers)) + + return new_mgrs_indexers + + def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]): """ Construct concatenation plan for given block manager and indexers. From 32d9d86f919fbbabe6c8d907e29aa05682e8cec0 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 6 Sep 2021 12:21:30 -0700 Subject: [PATCH 3/3] _reindex_columns_void -> _maybe_reindex_columns_na_proxy --- pandas/core/internals/concat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index ee6722ae6ee85..25bdc71fe7d12 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -198,7 +198,7 @@ def concatenate_managers( if isinstance(mgrs_indexers[0][0], ArrayManager): return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) - mgrs_indexers = _reindex_columns_void(axes, mgrs_indexers) + mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers) concat_plans = [ _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers @@ -247,7 +247,7 @@ def concatenate_managers( return BlockManager(tuple(blocks), axes) -def _reindex_columns_void( +def _maybe_reindex_columns_na_proxy( axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] ) -> list[tuple[BlockManager, dict[int, np.ndarray]]]: """