Skip to content

Commit 86a4ee0

Browse files
REF: implement Manager.concat_vertical, concat_horizontal (#53066)
* REF: implement Manager.concat_vertical, concat_horizontal
* Update pandas/core/internals/managers.py

Co-authored-by: Matthew Roeschke <[email protected]>

---------

Co-authored-by: Matthew Roeschke <[email protected]>
1 parent d3bc372 commit 86a4ee0

File tree

3 files changed

+125
-102
lines changed

3 files changed

+125
-102
lines changed

pandas/core/internals/array_manager.py

+86 -1
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
"""
44
from __future__ import annotations
55

6+
import itertools
67
from typing import (
78
TYPE_CHECKING,
89
Any,
@@ -20,9 +21,13 @@
2021
)
2122
from pandas.util._validators import validate_bool_kwarg
2223

23-
from pandas.core.dtypes.astype import astype_array_safe
24+
from pandas.core.dtypes.astype import (
25+
astype_array,
26+
astype_array_safe,
27+
)
2428
from pandas.core.dtypes.cast import (
2529
ensure_dtype_can_hold_na,
30+
find_common_type,
2631
infer_dtype_from_scalar,
2732
)
2833
from pandas.core.dtypes.common import (
@@ -1132,6 +1137,30 @@ def as_array(
11321137

11331138
return result
11341139

1140+
@classmethod
1141+
def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
1142+
"""
1143+
Concatenate uniformly-indexed ArrayManagers horizontally.
1144+
"""
1145+
# concatting along the columns -> combine reindexed arrays in a single manager
1146+
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
1147+
new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False)
1148+
return new_mgr
1149+
1150+
@classmethod
1151+
def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self:
1152+
"""
1153+
Concatenate uniformly-indexed ArrayManagers vertically.
1154+
"""
1155+
# concatting along the rows -> concat the reindexed arrays
1156+
# TODO(ArrayManager) doesn't yet preserve the correct dtype
1157+
arrays = [
1158+
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
1159+
for j in range(len(mgrs[0].arrays))
1160+
]
1161+
new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False)
1162+
return new_mgr
1163+
11351164

11361165
class SingleArrayManager(BaseArrayManager, SingleDataManager):
11371166
__slots__ = [
@@ -1350,3 +1379,59 @@ def to_array(self, dtype: DtypeObj) -> ArrayLike:
13501379
arr = np.empty(self.n, dtype=dtype)
13511380
arr.fill(fill_value)
13521381
return ensure_wrapped_if_datetimelike(arr)
1382+
1383+
1384+
def concat_arrays(to_concat: list) -> ArrayLike:
1385+
"""
1386+
Alternative for concat_compat but specialized for use in the ArrayManager.
1387+
1388+
Differences: only deals with 1D arrays (no axis keyword), assumes
1389+
ensure_wrapped_if_datetimelike and does not skip empty arrays to determine
1390+
the dtype.
1391+
In addition ensures that all NullArrayProxies get replaced with actual
1392+
arrays.
1393+
1394+
Parameters
1395+
----------
1396+
to_concat : list of arrays
1397+
1398+
Returns
1399+
-------
1400+
np.ndarray or ExtensionArray
1401+
"""
1402+
# ignore the all-NA proxies to determine the resulting dtype
1403+
to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]
1404+
1405+
dtypes = {x.dtype for x in to_concat_no_proxy}
1406+
single_dtype = len(dtypes) == 1
1407+
1408+
if single_dtype:
1409+
target_dtype = to_concat_no_proxy[0].dtype
1410+
elif all(x.kind in "iub" and isinstance(x, np.dtype) for x in dtypes):
1411+
# GH#42092
1412+
target_dtype = np.find_common_type(list(dtypes), [])
1413+
else:
1414+
target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
1415+
1416+
to_concat = [
1417+
arr.to_array(target_dtype)
1418+
if isinstance(arr, NullArrayProxy)
1419+
else astype_array(arr, target_dtype, copy=False)
1420+
for arr in to_concat
1421+
]
1422+
1423+
if isinstance(to_concat[0], ExtensionArray):
1424+
cls = type(to_concat[0])
1425+
return cls._concat_same_type(to_concat)
1426+
1427+
result = np.concatenate(to_concat)
1428+
1429+
# TODO decide on exact behaviour (we shouldn't do this only for empty result)
1430+
# see https://github.com/pandas-dev/pandas/issues/39817
1431+
if len(result) == 0:
1432+
# all empties -> check for bool to not coerce to float
1433+
kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
1434+
if len(kinds) != 1:
1435+
if "b" in kinds:
1436+
result = result.astype(object)
1437+
return result

pandas/core/internals/concat.py

+8 -101
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
import itertools
43
from typing import (
54
TYPE_CHECKING,
65
Sequence,
@@ -20,7 +19,6 @@
2019
from pandas.util._decorators import cache_readonly
2120
from pandas.util._exceptions import find_stack_level
2221

23-
from pandas.core.dtypes.astype import astype_array
2422
from pandas.core.dtypes.cast import (
2523
ensure_dtype_can_hold_na,
2624
find_common_type,
@@ -38,13 +36,9 @@
3836
isna_all,
3937
)
4038

41-
from pandas.core.arrays import ExtensionArray
4239
from pandas.core.arrays.sparse import SparseDtype
4340
from pandas.core.construction import ensure_wrapped_if_datetimelike
44-
from pandas.core.internals.array_manager import (
45-
ArrayManager,
46-
NullArrayProxy,
47-
)
41+
from pandas.core.internals.array_manager import ArrayManager
4842
from pandas.core.internals.blocks import (
4943
ensure_block_shape,
5044
new_block_2d,
@@ -59,7 +53,7 @@
5953
ArrayLike,
6054
AxisInt,
6155
DtypeObj,
62-
Manager,
56+
Manager2D,
6357
Shape,
6458
)
6559

@@ -71,8 +65,8 @@
7165

7266

7367
def _concatenate_array_managers(
74-
mgrs: list[Manager], axes: list[Index], concat_axis: AxisInt
75-
) -> Manager:
68+
mgrs: list[ArrayManager], axes: list[Index], concat_axis: AxisInt
69+
) -> Manager2D:
7670
"""
7771
Concatenate array managers into one.
7872
@@ -87,80 +81,16 @@ def _concatenate_array_managers(
8781
ArrayManager
8882
"""
8983
if concat_axis == 1:
90-
# concatting along the rows -> concat the reindexed arrays
91-
# TODO(ArrayManager) doesn't yet preserve the correct dtype
92-
arrays = [
93-
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
94-
for j in range(len(mgrs[0].arrays))
95-
]
84+
return mgrs[0].concat_vertical(mgrs, axes)
9685
else:
9786
# concatting along the columns -> combine reindexed arrays in a single manager
9887
assert concat_axis == 0
99-
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
100-
101-
new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
102-
return new_mgr
103-
104-
105-
def concat_arrays(to_concat: list) -> ArrayLike:
106-
"""
107-
Alternative for concat_compat but specialized for use in the ArrayManager.
108-
109-
Differences: only deals with 1D arrays (no axis keyword), assumes
110-
ensure_wrapped_if_datetimelike and does not skip empty arrays to determine
111-
the dtype.
112-
In addition ensures that all NullArrayProxies get replaced with actual
113-
arrays.
114-
115-
Parameters
116-
----------
117-
to_concat : list of arrays
118-
119-
Returns
120-
-------
121-
np.ndarray or ExtensionArray
122-
"""
123-
# ignore the all-NA proxies to determine the resulting dtype
124-
to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]
125-
126-
dtypes = {x.dtype for x in to_concat_no_proxy}
127-
single_dtype = len(dtypes) == 1
128-
129-
if single_dtype:
130-
target_dtype = to_concat_no_proxy[0].dtype
131-
elif all(x.kind in "iub" and isinstance(x, np.dtype) for x in dtypes):
132-
# GH#42092
133-
target_dtype = np.find_common_type(list(dtypes), [])
134-
else:
135-
target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
136-
137-
to_concat = [
138-
arr.to_array(target_dtype)
139-
if isinstance(arr, NullArrayProxy)
140-
else astype_array(arr, target_dtype, copy=False)
141-
for arr in to_concat
142-
]
143-
144-
if isinstance(to_concat[0], ExtensionArray):
145-
cls = type(to_concat[0])
146-
return cls._concat_same_type(to_concat)
147-
148-
result = np.concatenate(to_concat)
149-
150-
# TODO decide on exact behaviour (we shouldn't do this only for empty result)
151-
# see https://github.com/pandas-dev/pandas/issues/39817
152-
if len(result) == 0:
153-
# all empties -> check for bool to not coerce to float
154-
kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
155-
if len(kinds) != 1:
156-
if "b" in kinds:
157-
result = result.astype(object)
158-
return result
88+
return mgrs[0].concat_horizontal(mgrs, axes)
15989

16090

16191
def concatenate_managers(
16292
mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
163-
) -> Manager:
93+
) -> Manager2D:
16494
"""
16595
Concatenate block managers into one.
16696
@@ -196,7 +126,7 @@ def concatenate_managers(
196126

197127
if concat_axis == 0:
198128
mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
199-
return _concat_managers_axis0(mgrs, axes)
129+
return mgrs[0].concat_horizontal(mgrs, axes)
200130

201131
if len(mgrs_indexers) > 0 and mgrs_indexers[0][0].nblocks > 0:
202132
first_dtype = mgrs_indexers[0][0].blocks[0].dtype
@@ -266,29 +196,6 @@ def concatenate_managers(
266196
return BlockManager(tuple(blocks), axes)
267197

268198

269-
def _concat_managers_axis0(mgrs: list[BlockManager], axes: list[Index]) -> BlockManager:
270-
"""
271-
concat_managers specialized to concat_axis=0, with reindexing already
272-
having been done in _maybe_reindex_columns_na_proxy.
273-
"""
274-
275-
offset = 0
276-
blocks: list[Block] = []
277-
for i, mgr in enumerate(mgrs):
278-
for blk in mgr.blocks:
279-
# We need to do getitem_block here otherwise we would be altering
280-
# blk.mgr_locs in place, which would render it invalid. This is only
281-
# relevant in the copy=False case.
282-
nb = blk.getitem_block(slice(None))
283-
nb._mgr_locs = nb._mgr_locs.add(offset)
284-
blocks.append(nb)
285-
286-
offset += len(mgr.items)
287-
288-
result = BlockManager(tuple(blocks), axes)
289-
return result
290-
291-
292199
def _maybe_reindex_columns_na_proxy(
293200
axes: list[Index],
294201
mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]],

pandas/core/internals/managers.py

+31
Original file line number | Diff line number | Diff line change
@@ -1841,6 +1841,37 @@ def _consolidate_inplace(self) -> None:
18411841
self._known_consolidated = True
18421842
self._rebuild_blknos_and_blklocs()
18431843

1844+
# ----------------------------------------------------------------
1845+
# Concatenation
1846+
1847+
@classmethod
1848+
def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
1849+
"""
1850+
Concatenate uniformly-indexed BlockManagers horizontally.
1851+
"""
1852+
offset = 0
1853+
blocks: list[Block] = []
1854+
for mgr in mgrs:
1855+
for blk in mgr.blocks:
1856+
# We need to do getitem_block here otherwise we would be altering
1857+
# blk.mgr_locs in place, which would render it invalid. This is only
1858+
# relevant in the copy=False case.
1859+
nb = blk.getitem_block(slice(None))
1860+
nb._mgr_locs = nb._mgr_locs.add(offset)
1861+
blocks.append(nb)
1862+
1863+
offset += len(mgr.items)
1864+
1865+
new_mgr = cls(tuple(blocks), axes)
1866+
return new_mgr
1867+
1868+
@classmethod
1869+
def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self:
1870+
"""
1871+
Concatenate uniformly-indexed BlockManagers vertically.
1872+
"""
1873+
raise NotImplementedError("This logic lives (for now) in internals.concat")
1874+
18441875

18451876
class SingleBlockManager(BaseBlockManager, SingleDataManager):
18461877
"""manage a single block with"""

0 commit comments

Comments
 (0)