diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index c3be914aa095d..d7b899cc192fc 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -3,6 +3,7 @@ """ from __future__ import annotations +import itertools from typing import ( TYPE_CHECKING, Any, @@ -20,9 +21,13 @@ ) from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.astype import astype_array_safe +from pandas.core.dtypes.astype import ( + astype_array, + astype_array_safe, +) from pandas.core.dtypes.cast import ( ensure_dtype_can_hold_na, + find_common_type, infer_dtype_from_scalar, ) from pandas.core.dtypes.common import ( @@ -1136,6 +1141,30 @@ def as_array( return result + @classmethod + def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed ArrayManagers horizontally. + """ + # concatting along the columns -> combine reindexed arrays in a single manager + arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) + new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False) + return new_mgr + + @classmethod + def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed ArrayManagers vertically. + """ + # concatting along the rows -> concat the reindexed arrays + # TODO(ArrayManager) doesn't yet preserve the correct dtype + arrays = [ + concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))]) + for j in range(len(mgrs[0].arrays)) + ] + new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False) + return new_mgr + class SingleArrayManager(BaseArrayManager, SingleDataManager): __slots__ = [ @@ -1354,3 +1383,59 @@ def to_array(self, dtype: DtypeObj) -> ArrayLike: arr = np.empty(self.n, dtype=dtype) arr.fill(fill_value) return ensure_wrapped_if_datetimelike(arr) + + +def concat_arrays(to_concat: list) -> ArrayLike: + """ + Alternative for concat_compat but specialized for use in the ArrayManager. + + Differences: only deals with 1D arrays (no axis keyword), assumes + ensure_wrapped_if_datetimelike and does not skip empty arrays to determine + the dtype. + In addition ensures that all NullArrayProxies get replaced with actual + arrays. + + Parameters + ---------- + to_concat : list of arrays + + Returns + ------- + np.ndarray or ExtensionArray + """ + # ignore the all-NA proxies to determine the resulting dtype + to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] + + dtypes = {x.dtype for x in to_concat_no_proxy} + single_dtype = len(dtypes) == 1 + + if single_dtype: + target_dtype = to_concat_no_proxy[0].dtype + elif all(x.kind in "iub" and isinstance(x, np.dtype) for x in dtypes): + # GH#42092 + target_dtype = np.find_common_type(list(dtypes), []) + else: + target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy]) + + to_concat = [ + arr.to_array(target_dtype) + if isinstance(arr, NullArrayProxy) + else astype_array(arr, target_dtype, copy=False) + for arr in to_concat + ] + + if isinstance(to_concat[0], ExtensionArray): + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat) + + result = np.concatenate(to_concat) + + # TODO decide on exact behaviour (we shouldn't do this only for empty result) + # see https://github.com/pandas-dev/pandas/issues/39817 + if len(result) == 0: + # all empties -> check for bool to not coerce to float + kinds = {obj.dtype.kind for obj in to_concat_no_proxy} + if len(kinds) != 1: + if "b" in kinds: + result = result.astype(object) + return result diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index d2abf3dcb628d..e34e7fbf1035e 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,6 +1,5 @@ from __future__ import annotations -import itertools from typing import ( TYPE_CHECKING, Sequence, @@ -20,7 +19,6 @@ from pandas.util._decorators import cache_readonly from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import ( ensure_dtype_can_hold_na, find_common_type, @@ -38,13 +36,9 @@ isna_all, ) -from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import ensure_wrapped_if_datetimelike -from pandas.core.internals.array_manager import ( - ArrayManager, - NullArrayProxy, -) +from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.blocks import ( ensure_block_shape, new_block_2d, @@ -59,7 +53,7 @@ ArrayLike, AxisInt, DtypeObj, - Manager, + Manager2D, Shape, ) @@ -71,8 +65,8 @@ def _concatenate_array_managers( - mgrs: list[Manager], axes: list[Index], concat_axis: AxisInt -) -> Manager: + mgrs: list[ArrayManager], axes: list[Index], concat_axis: AxisInt +) -> Manager2D: """ Concatenate array managers into one. @@ -87,80 +81,16 @@ def _concatenate_array_managers( ArrayManager """ if concat_axis == 1: - # concatting along the rows -> concat the reindexed arrays - # TODO(ArrayManager) doesn't yet preserve the correct dtype - arrays = [ - concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))]) - for j in range(len(mgrs[0].arrays)) - ] + return mgrs[0].concat_vertical(mgrs, axes) else: # concatting along the columns -> combine reindexed arrays in a single manager assert concat_axis == 0 - arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) - - new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) - return new_mgr - - -def concat_arrays(to_concat: list) -> ArrayLike: - """ - Alternative for concat_compat but specialized for use in the ArrayManager. - - Differences: only deals with 1D arrays (no axis keyword), assumes - ensure_wrapped_if_datetimelike and does not skip empty arrays to determine - the dtype. - In addition ensures that all NullArrayProxies get replaced with actual - arrays. - - Parameters - ---------- - to_concat : list of arrays - - Returns - ------- - np.ndarray or ExtensionArray - """ - # ignore the all-NA proxies to determine the resulting dtype - to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] - - dtypes = {x.dtype for x in to_concat_no_proxy} - single_dtype = len(dtypes) == 1 - - if single_dtype: - target_dtype = to_concat_no_proxy[0].dtype - elif all(x.kind in "iub" and isinstance(x, np.dtype) for x in dtypes): - # GH#42092 - target_dtype = np.find_common_type(list(dtypes), []) - else: - target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy]) - - to_concat = [ - arr.to_array(target_dtype) - if isinstance(arr, NullArrayProxy) - else astype_array(arr, target_dtype, copy=False) - for arr in to_concat - ] - - if isinstance(to_concat[0], ExtensionArray): - cls = type(to_concat[0]) - return cls._concat_same_type(to_concat) - - result = np.concatenate(to_concat) - - # TODO decide on exact behaviour (we shouldn't do this only for empty result) - # see https://github.com/pandas-dev/pandas/issues/39817 - if len(result) == 0: - # all empties -> check for bool to not coerce to float - kinds = {obj.dtype.kind for obj in to_concat_no_proxy} - if len(kinds) != 1: - if "b" in kinds: - result = result.astype(object) - return result + return mgrs[0].concat_horizontal(mgrs, axes) def concatenate_managers( mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool -) -> Manager: +) -> Manager2D: """ Concatenate block managers into one. @@ -196,7 +126,7 @@ def concatenate_managers( if concat_axis == 0: mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy) - return _concat_managers_axis0(mgrs, axes) + return mgrs[0].concat_horizontal(mgrs, axes) if len(mgrs_indexers) > 0 and mgrs_indexers[0][0].nblocks > 0: first_dtype = mgrs_indexers[0][0].blocks[0].dtype @@ -266,29 +196,6 @@ def concatenate_managers( return BlockManager(tuple(blocks), axes) -def _concat_managers_axis0(mgrs: list[BlockManager], axes: list[Index]) -> BlockManager: - """ - concat_managers specialized to concat_axis=0, with reindexing already - having been done in _maybe_reindex_columns_na_proxy. - """ - - offset = 0 - blocks: list[Block] = [] - for i, mgr in enumerate(mgrs): - for blk in mgr.blocks: - # We need to do getitem_block here otherwise we would be altering - # blk.mgr_locs in place, which would render it invalid. This is only - # relevant in the copy=False case. - nb = blk.getitem_block(slice(None)) - nb._mgr_locs = nb._mgr_locs.add(offset) - blocks.append(nb) - - offset += len(mgr.items) - - result = BlockManager(tuple(blocks), axes) - return result - - def _maybe_reindex_columns_na_proxy( axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]], diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 36dd0cece0f20..329b98da6a4d0 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1841,6 +1841,37 @@ def _consolidate_inplace(self) -> None: self._known_consolidated = True self._rebuild_blknos_and_blklocs() + # ---------------------------------------------------------------- + # Concatenation + + @classmethod + def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed BlockManagers horizontally. + """ + offset = 0 + blocks: list[Block] = [] + for mgr in mgrs: + for blk in mgr.blocks: + # We need to do getitem_block here otherwise we would be altering + # blk.mgr_locs in place, which would render it invalid. This is only + # relevant in the copy=False case. + nb = blk.getitem_block(slice(None)) + nb._mgr_locs = nb._mgr_locs.add(offset) + blocks.append(nb) + + offset += len(mgr.items) + + new_mgr = cls(tuple(blocks), axes) + return new_mgr + + @classmethod + def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self: + """ + Concatenate uniformly-indexed BlockManagers vertically. + """ + raise NotImplementedError("This logic lives (for now) in internals.concat") + class SingleBlockManager(BaseBlockManager, SingleDataManager): """manage a single block with"""