Skip to content

Commit 0de6f8b

Browse files
authored
REF: concat on bm_axis==0 (pandas-dev#43626)
1 parent e406626 commit 0de6f8b

File tree

1 file changed

+77
-100
lines changed

1 file changed

+77
-100
lines changed

pandas/core/internals/concat.py

+77-100
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@
3737
)
3838
from pandas.core.dtypes.dtypes import ExtensionDtype
3939

40-
import pandas.core.algorithms as algos
4140
from pandas.core.arrays import (
4241
DatetimeArray,
4342
ExtensionArray,
@@ -189,19 +188,29 @@ def concatenate_managers(
189188
if isinstance(mgrs_indexers[0][0], ArrayManager):
190189
return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)
191190

191+
# Assertions disabled for performance
192+
# for tup in mgrs_indexers:
193+
# # caller is responsible for ensuring this
194+
# indexers = tup[1]
195+
# assert concat_axis not in indexers
196+
197+
if concat_axis == 0:
198+
return _concat_managers_axis0(mgrs_indexers, axes, copy)
199+
192200
mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
193201

194-
concat_plans = [
195-
_get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
196-
]
197-
concat_plan = _combine_concat_plans(concat_plans, concat_axis)
202+
# Assertion disabled for performance
203+
# assert all(not x[1] for x in mgrs_indexers)
204+
205+
concat_plans = [_get_mgr_concatenation_plan(mgr) for mgr, _ in mgrs_indexers]
206+
concat_plan = _combine_concat_plans(concat_plans)
198207
blocks = []
199208

200209
for placement, join_units in concat_plan:
201210
unit = join_units[0]
202211
blk = unit.block
203212

204-
if len(join_units) == 1 and not join_units[0].indexers:
213+
if len(join_units) == 1:
205214
values = blk.values
206215
if copy:
207216
values = values.copy()
@@ -225,7 +234,7 @@ def concatenate_managers(
225234

226235
fastpath = blk.values.dtype == values.dtype
227236
else:
228-
values = _concatenate_join_units(join_units, concat_axis, copy=copy)
237+
values = _concatenate_join_units(join_units, copy=copy)
229238
fastpath = False
230239

231240
if fastpath:
@@ -238,6 +247,42 @@ def concatenate_managers(
238247
return BlockManager(tuple(blocks), axes)
239248

240249

250+
def _concat_managers_axis0(
    mgrs_indexers, axes: list[Index], copy: bool
) -> BlockManager:
    """
    concat_managers specialized to concat_axis=0, with reindexing already
    having been done in _maybe_reindex_columns_na_proxy.
    """
    # Record, per manager, whether any reindexing was requested *before*
    # _maybe_reindex_columns_na_proxy empties the indexer dicts.
    needed_reindex = [bool(indexers) for _, indexers in mgrs_indexers]
    mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)

    blocks = []
    offset = 0
    for was_reindexed, (mgr, _) in zip(needed_reindex, mgrs_indexers):
        for blk in mgr.blocks:
            if was_reindexed:
                # If we already reindexed, then we definitely don't need
                # another copy — a shallow copy suffices.
                nb = blk.copy(deep=False)
            elif copy:
                nb = blk.copy()
            else:
                # by slicing instead of copy(deep=False), we get a new array
                # object, see test_concat_copy
                nb = blk.getitem_block(slice(None))
            # Shift the block's placement past the rows contributed by
            # earlier managers.
            nb._mgr_locs = nb._mgr_locs.add(offset)
            blocks.append(nb)

        offset += len(mgr.items)

    return BlockManager(tuple(blocks), axes)
241286
def _maybe_reindex_columns_na_proxy(
242287
axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]]
243288
) -> list[tuple[BlockManager, dict[int, np.ndarray]]]:
@@ -248,36 +293,33 @@ def _maybe_reindex_columns_na_proxy(
248293
Columns added in this reindexing have dtype=np.void, indicating they
249294
should be ignored when choosing a column's final dtype.
250295
"""
251-
new_mgrs_indexers = []
296+
new_mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] = []
297+
252298
for mgr, indexers in mgrs_indexers:
253-
# We only reindex for axis=0 (i.e. columns), as this can be done cheaply
254-
if 0 in indexers:
255-
new_mgr = mgr.reindex_indexer(
256-
axes[0],
257-
indexers[0],
258-
axis=0,
299+
# For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
300+
# is a cheap reindexing.
301+
for i, indexer in indexers.items():
302+
mgr = mgr.reindex_indexer(
303+
axes[i],
304+
indexers[i],
305+
axis=i,
259306
copy=False,
260-
only_slice=True,
307+
only_slice=True, # only relevant for i==0
261308
allow_dups=True,
262-
use_na_proxy=True,
309+
use_na_proxy=True, # only relevant for i==0
263310
)
264-
new_indexers = indexers.copy()
265-
del new_indexers[0]
266-
new_mgrs_indexers.append((new_mgr, new_indexers))
267-
else:
268-
new_mgrs_indexers.append((mgr, indexers))
311+
new_mgrs_indexers.append((mgr, {}))
269312

270313
return new_mgrs_indexers
271314

272315

273-
def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]):
316+
def _get_mgr_concatenation_plan(mgr: BlockManager):
274317
"""
275-
Construct concatenation plan for given block manager and indexers.
318+
Construct concatenation plan for given block manager.
276319
277320
Parameters
278321
----------
279322
mgr : BlockManager
280-
indexers : dict of {axis: indexer}
281323
282324
Returns
283325
-------
@@ -287,27 +329,11 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
287329
# Calculate post-reindex shape, save for item axis which will be separate
288330
# for each block anyway.
289331
mgr_shape_list = list(mgr.shape)
290-
for ax, indexer in indexers.items():
291-
mgr_shape_list[ax] = len(indexer)
292332
mgr_shape = tuple(mgr_shape_list)
293333

294-
assert 0 not in indexers
295-
296-
needs_filling = False
297-
if 1 in indexers:
298-
# indexers[1] is shared by all the JoinUnits, so we can save time
299-
# by only doing this check once
300-
if (indexers[1] == -1).any():
301-
needs_filling = True
302-
303334
if mgr.is_single_block:
304335
blk = mgr.blocks[0]
305-
return [
306-
(
307-
blk.mgr_locs,
308-
JoinUnit(blk, mgr_shape, indexers, needs_filling=needs_filling),
309-
)
310-
]
336+
return [(blk.mgr_locs, JoinUnit(blk, mgr_shape))]
311337

312338
blknos = mgr.blknos
313339
blklocs = mgr.blklocs
@@ -318,8 +344,6 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
318344
assert placements.is_slice_like
319345
assert blkno != -1
320346

321-
join_unit_indexers = indexers.copy()
322-
323347
shape_list = list(mgr_shape)
324348
shape_list[0] = len(placements)
325349
shape = tuple(shape_list)
@@ -351,30 +375,21 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
351375
# Assertions disabled for performance
352376
# assert blk._mgr_locs.as_slice == placements.as_slice
353377
# assert blk.shape[0] == shape[0]
354-
unit = JoinUnit(blk, shape, join_unit_indexers, needs_filling=needs_filling)
378+
unit = JoinUnit(blk, shape)
355379

356380
plan.append((placements, unit))
357381

358382
return plan
359383

360384

361385
class JoinUnit:
362-
def __init__(
363-
self, block: Block, shape: Shape, indexers=None, *, needs_filling: bool = False
364-
):
386+
def __init__(self, block: Block, shape: Shape):
365387
# Passing shape explicitly is required for cases when block is None.
366-
# Note: block is None implies indexers is None, but not vice-versa
367-
if indexers is None:
368-
indexers = {}
369-
# we should *never* have `0 in indexers`
370388
self.block = block
371-
self.indexers = indexers
372389
self.shape = shape
373390

374-
self.needs_filling = needs_filling
375-
376391
def __repr__(self) -> str:
377-
return f"{type(self).__name__}({repr(self.block)}, {self.indexers})"
392+
return f"{type(self).__name__}({repr(self.block)})"
378393

379394
@cache_readonly
380395
def is_na(self) -> bool:
@@ -391,24 +406,14 @@ def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike:
391406

392407
else:
393408

394-
if (not self.indexers) and (not self.block._can_consolidate):
409+
if not self.block._can_consolidate:
395410
# preserve these for validation in concat_compat
396411
return self.block.values
397412

398413
# No dtype upcasting is done here, it will be performed during
399414
# concatenation itself.
400415
values = self.block.values
401416

402-
if not self.indexers:
403-
# If there's no indexing to be done, we want to signal outside
404-
# code that this array must be copied explicitly. This is done
405-
# by returning a view and checking `retval.base`.
406-
values = values.view()
407-
408-
else:
409-
for ax, indexer in self.indexers.items():
410-
values = algos.take_nd(values, indexer, axis=ax)
411-
412417
return values
413418

414419

@@ -446,15 +451,10 @@ def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
446451
return missing_arr
447452

448453

449-
def _concatenate_join_units(
450-
join_units: list[JoinUnit], concat_axis: int, copy: bool
451-
) -> ArrayLike:
454+
def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike:
452455
"""
453-
Concatenate values from several join units along selected axis.
456+
Concatenate values from several join units along axis=1.
454457
"""
455-
if concat_axis == 0 and len(join_units) > 1:
456-
# Concatenating join units along ax0 is handled in _merge_blocks.
457-
raise AssertionError("Concatenating join units along axis0")
458458

459459
empty_dtype = _get_empty_dtype(join_units)
460460

@@ -488,7 +488,7 @@ def _concatenate_join_units(
488488
concat_values = ensure_block_shape(concat_values, 2)
489489

490490
else:
491-
concat_values = concat_compat(to_concat, axis=concat_axis)
491+
concat_values = concat_compat(to_concat, axis=1)
492492

493493
return concat_values
494494

@@ -532,7 +532,7 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
532532
empty_dtype = join_units[0].block.dtype
533533
return empty_dtype
534534

535-
needs_can_hold_na = any(unit.is_na or unit.needs_filling for unit in join_units)
535+
needs_can_hold_na = any(unit.is_na for unit in join_units)
536536

537537
dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
538538

@@ -569,9 +569,6 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
569569
# unless we're an extension dtype.
570570
all(not ju.is_na or ju.block.is_extension for ju in join_units)
571571
and
572-
# no blocks with indexers (as then the dimensions do not fit)
573-
all(not ju.indexers for ju in join_units)
574-
and
575572
# only use this path when there is something to concatenate
576573
len(join_units) > 1
577574
)
@@ -591,25 +588,17 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
591588
592589
Extra items that didn't fit are returned as a separate block.
593590
"""
594-
assert 0 not in join_unit.indexers
595-
extra_indexers = join_unit.indexers
596591

597592
extra_block = join_unit.block.getitem_block(slice(length, None))
598593
join_unit.block = join_unit.block.getitem_block(slice(length))
599594

600595
extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
601596
join_unit.shape = (length,) + join_unit.shape[1:]
602597

603-
# extra_indexers does not introduce any -1s, so we can inherit needs_filling
604-
return JoinUnit(
605-
block=extra_block,
606-
indexers=extra_indexers,
607-
shape=extra_shape,
608-
needs_filling=join_unit.needs_filling,
609-
)
598+
return JoinUnit(block=extra_block, shape=extra_shape)
610599

611600

612-
def _combine_concat_plans(plans, concat_axis: int):
601+
def _combine_concat_plans(plans):
613602
"""
614603
Combine multiple concatenation plans into one.
615604
@@ -619,18 +608,6 @@ def _combine_concat_plans(plans, concat_axis: int):
619608
for p in plans[0]:
620609
yield p[0], [p[1]]
621610

622-
elif concat_axis == 0:
623-
offset = 0
624-
for plan in plans:
625-
last_plc = None
626-
627-
for plc, unit in plan:
628-
yield plc.add(offset), [unit]
629-
last_plc = plc
630-
631-
if last_plc is not None:
632-
offset += last_plc.as_slice.stop
633-
634611
else:
635612
# singleton list so we can modify it as a side-effect within _next_or_none
636613
num_ended = [0]

0 commit comments

Comments
 (0)