Skip to content

REF: implement Manager.concat_vertical, concat_horizontal #53066

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 4, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 86 additions & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
from __future__ import annotations

import itertools
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -20,9 +21,13 @@
)
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.astype import astype_array_safe
from pandas.core.dtypes.astype import (
astype_array,
astype_array_safe,
)
from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
find_common_type,
infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -1136,6 +1141,30 @@ def as_array(

return result

@classmethod
def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
    """
    Concatenate uniformly-indexed ArrayManagers horizontally.

    Parameters
    ----------
    mgrs : list of ArrayManager
        Managers whose ``arrays`` are gathered, in order, into the result.
    axes : list of Index
        Axes of the result; handed to the constructor in swapped order,
        i.e. ``[axes[1], axes[0]]``.

    Returns
    -------
    ArrayManager
        New instance of ``cls`` holding every array of every manager.
    """
    # Concatenating along the columns: the (already reindexed) arrays are
    # only collected into one flat list, none of them is modified.
    combined = []
    for mgr in mgrs:
        combined.extend(mgr.arrays)
    return cls(combined, [axes[1], axes[0]], verify_integrity=False)

@classmethod
def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self:
    """
    Concatenate uniformly-indexed ArrayManagers vertically.

    Parameters
    ----------
    mgrs : list of ArrayManager
        Managers to stack; the number of columns is taken from ``mgrs[0]``.
    axes : list of Index
        Axes of the result; handed to the constructor in swapped order,
        i.e. ``[axes[1], axes[0]]``.

    Returns
    -------
    ArrayManager
        New instance of ``cls`` with one concatenated array per column.
    """
    # Concatenating along the rows: for each column position, concatenate
    # the (already reindexed) array found at that position in every manager.
    # TODO(ArrayManager) doesn't yet preserve the correct dtype
    n_columns = len(mgrs[0].arrays)
    arrays = []
    for col in range(n_columns):
        pieces = [mgr.arrays[col] for mgr in mgrs]
        arrays.append(concat_arrays(pieces))
    return cls(arrays, [axes[1], axes[0]], verify_integrity=False)


class SingleArrayManager(BaseArrayManager, SingleDataManager):
__slots__ = [
Expand Down Expand Up @@ -1354,3 +1383,59 @@ def to_array(self, dtype: DtypeObj) -> ArrayLike:
arr = np.empty(self.n, dtype=dtype)
arr.fill(fill_value)
return ensure_wrapped_if_datetimelike(arr)


def concat_arrays(to_concat: list) -> ArrayLike:
    """
    Alternative for concat_compat but specialized for use in the ArrayManager.

    Differences: only deals with 1D arrays (no axis keyword), assumes
    ensure_wrapped_if_datetimelike and does not skip empty arrays to determine
    the dtype.
    In addition ensures that all NullArrayProxies get replaced with actual
    arrays.

    Parameters
    ----------
    to_concat : list of arrays
        1D arrays and/or NullArrayProxy placeholders.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    # ignore the all-NA proxies to determine the resulting dtype
    to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]

    dtypes = {x.dtype for x in to_concat_no_proxy}

    if len(dtypes) == 1:
        target_dtype = to_concat_no_proxy[0].dtype
    elif all(isinstance(x, np.dtype) and x.kind in "iub" for x in dtypes):
        # GH#42092
        # np.find_common_type is deprecated (NumPy 1.25) and removed in
        # NumPy 2.0; np.result_type is the documented replacement when only
        # dtypes (no scalars) are involved.
        # With no non-proxy input at all, keep the previous outcome (None),
        # since np.result_type() requires at least one argument.
        target_dtype = np.result_type(*dtypes) if dtypes else None
    else:
        target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])

    # materialize the proxies with the target dtype and cast everything else
    to_concat = [
        arr.to_array(target_dtype)
        if isinstance(arr, NullArrayProxy)
        else astype_array(arr, target_dtype, copy=False)
        for arr in to_concat
    ]

    if isinstance(to_concat[0], ExtensionArray):
        cls = type(to_concat[0])
        return cls._concat_same_type(to_concat)

    result = np.concatenate(to_concat)

    # TODO decide on exact behaviour (we shouldn't do this only for empty result)
    # see https://github.com/pandas-dev/pandas/issues/39817
    if len(result) == 0:
        # all empties -> check for bool to not coerce to float
        kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
        if len(kinds) != 1 and "b" in kinds:
            result = result.astype(object)
    return result
109 changes: 8 additions & 101 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import itertools
from typing import (
TYPE_CHECKING,
Sequence,
Expand All @@ -20,7 +19,6 @@
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
find_common_type,
Expand All @@ -38,13 +36,9 @@
isna_all,
)

from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.internals.array_manager import (
ArrayManager,
NullArrayProxy,
)
from pandas.core.internals.array_manager import ArrayManager
from pandas.core.internals.blocks import (
ensure_block_shape,
new_block_2d,
Expand All @@ -59,7 +53,7 @@
ArrayLike,
AxisInt,
DtypeObj,
Manager,
Manager2D,
Shape,
)

Expand All @@ -71,8 +65,8 @@


def _concatenate_array_managers(
mgrs: list[Manager], axes: list[Index], concat_axis: AxisInt
) -> Manager:
mgrs: list[ArrayManager], axes: list[Index], concat_axis: AxisInt
) -> Manager2D:
"""
Concatenate array managers into one.

Expand All @@ -87,80 +81,16 @@ def _concatenate_array_managers(
ArrayManager
"""
if concat_axis == 1:
# concatting along the rows -> concat the reindexed arrays
# TODO(ArrayManager) doesn't yet preserve the correct dtype
arrays = [
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
for j in range(len(mgrs[0].arrays))
]
return mgrs[0].concat_vertical(mgrs, axes)
else:
# concatting along the columns -> combine reindexed arrays in a single manager
assert concat_axis == 0
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))

new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
return new_mgr


def concat_arrays(to_concat: list) -> ArrayLike:
    """
    Alternative for concat_compat but specialized for use in the ArrayManager.

    Differences: only deals with 1D arrays (no axis keyword), assumes
    ensure_wrapped_if_datetimelike and does not skip empty arrays to determine
    the dtype.
    In addition ensures that all NullArrayProxies get replaced with actual
    arrays.

    Parameters
    ----------
    to_concat : list of arrays
        1D arrays and/or NullArrayProxy placeholders.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    # ignore the all-NA proxies to determine the resulting dtype
    to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]

    dtypes = {x.dtype for x in to_concat_no_proxy}

    if len(dtypes) == 1:
        target_dtype = to_concat_no_proxy[0].dtype
    elif all(isinstance(x, np.dtype) and x.kind in "iub" for x in dtypes):
        # GH#42092
        # np.find_common_type is deprecated (NumPy 1.25) and removed in
        # NumPy 2.0; np.result_type is the documented replacement when only
        # dtypes (no scalars) are involved.
        # With no non-proxy input at all, keep the previous outcome (None),
        # since np.result_type() requires at least one argument.
        target_dtype = np.result_type(*dtypes) if dtypes else None
    else:
        target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])

    # materialize the proxies with the target dtype and cast everything else
    to_concat = [
        arr.to_array(target_dtype)
        if isinstance(arr, NullArrayProxy)
        else astype_array(arr, target_dtype, copy=False)
        for arr in to_concat
    ]

    if isinstance(to_concat[0], ExtensionArray):
        cls = type(to_concat[0])
        return cls._concat_same_type(to_concat)

    result = np.concatenate(to_concat)

    # TODO decide on exact behaviour (we shouldn't do this only for empty result)
    # see https://github.com/pandas-dev/pandas/issues/39817
    if len(result) == 0:
        # all empties -> check for bool to not coerce to float
        kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
        if len(kinds) != 1 and "b" in kinds:
            result = result.astype(object)
    return result
return mgrs[0].concat_horizontal(mgrs, axes)


def concatenate_managers(
mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
) -> Manager:
) -> Manager2D:
"""
Concatenate block managers into one.

Expand Down Expand Up @@ -196,7 +126,7 @@ def concatenate_managers(

if concat_axis == 0:
mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
return _concat_managers_axis0(mgrs, axes)
return mgrs[0].concat_horizontal(mgrs, axes)

if len(mgrs_indexers) > 0 and mgrs_indexers[0][0].nblocks > 0:
first_dtype = mgrs_indexers[0][0].blocks[0].dtype
Expand Down Expand Up @@ -266,29 +196,6 @@ def concatenate_managers(
return BlockManager(tuple(blocks), axes)


def _concat_managers_axis0(mgrs: list[BlockManager], axes: list[Index]) -> BlockManager:
    """
    concat_managers specialized to concat_axis=0, with reindexing already
    having been done in _maybe_reindex_columns_na_proxy.

    Every block of every manager is re-sliced and its manager locations are
    shifted by the number of items held by the managers that precede it.
    """
    shifted: list[Block] = []
    offset = 0

    for mgr in mgrs:
        for blk in mgr.blocks:
            # Take a full slice first: changing blk's own mgr_locs in place
            # would render it invalid. This only matters in the copy=False
            # case.
            nb = blk.getitem_block(slice(None))
            nb._mgr_locs = nb._mgr_locs.add(offset)
            shifted.append(nb)

        # blocks of the next manager are placed after this manager's items
        offset += len(mgr.items)

    return BlockManager(tuple(shifted), axes)


def _maybe_reindex_columns_na_proxy(
axes: list[Index],
mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]],
Expand Down
31 changes: 31 additions & 0 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1841,6 +1841,37 @@ def _consolidate_inplace(self) -> None:
self._known_consolidated = True
self._rebuild_blknos_and_blklocs()

# ----------------------------------------------------------------
# Concatenation

@classmethod
def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
    """
    Concatenate uniformly-indexed BlockManagers horizontally.

    Every block of every manager is re-sliced and its manager locations are
    shifted by the number of items held by the managers that precede it.

    Parameters
    ----------
    mgrs : list of BlockManager
    axes : list of Index
        Axes passed unchanged to the constructor of the result.

    Returns
    -------
    BlockManager
        New instance of ``cls`` built from the shifted blocks.
    """
    shifted: list[Block] = []
    offset = 0

    for mgr in mgrs:
        for blk in mgr.blocks:
            # Take a full slice first: changing blk's own mgr_locs in place
            # would render it invalid. This only matters in the copy=False
            # case.
            nb = blk.getitem_block(slice(None))
            nb._mgr_locs = nb._mgr_locs.add(offset)
            shifted.append(nb)

        # blocks of the next manager are placed after this manager's items
        offset += len(mgr.items)

    return cls(tuple(shifted), axes)

@classmethod
def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self:
    """
    Concatenate uniformly-indexed BlockManagers vertically.

    Not implemented on this class: calling it always raises.

    Raises
    ------
    NotImplementedError
        Always; the message points to internals.concat as the current home
        of this logic.
    """
    msg = "This logic lives (for now) in internals.concat"
    raise NotImplementedError(msg)


class SingleBlockManager(BaseBlockManager, SingleDataManager):
"""manage a single block with"""
Expand Down