37
37
)
38
38
from pandas .core .dtypes .dtypes import ExtensionDtype
39
39
40
- import pandas .core .algorithms as algos
41
40
from pandas .core .arrays import (
42
41
DatetimeArray ,
43
42
ExtensionArray ,
@@ -189,19 +188,29 @@ def concatenate_managers(
189
188
if isinstance (mgrs_indexers [0 ][0 ], ArrayManager ):
190
189
return _concatenate_array_managers (mgrs_indexers , axes , concat_axis , copy )
191
190
191
+ # Assertions disabled for performance
192
+ # for tup in mgrs_indexers:
193
+ # # caller is responsible for ensuring this
194
+ # indexers = tup[1]
195
+ # assert concat_axis not in indexers
196
+
197
+ if concat_axis == 0 :
198
+ return _concat_managers_axis0 (mgrs_indexers , axes , copy )
199
+
192
200
mgrs_indexers = _maybe_reindex_columns_na_proxy (axes , mgrs_indexers )
193
201
194
- concat_plans = [
195
- _get_mgr_concatenation_plan (mgr , indexers ) for mgr , indexers in mgrs_indexers
196
- ]
197
- concat_plan = _combine_concat_plans (concat_plans , concat_axis )
202
+ # Assertion disabled for performance
203
+ # assert all(not x[1] for x in mgrs_indexers)
204
+
205
+ concat_plans = [_get_mgr_concatenation_plan (mgr ) for mgr , _ in mgrs_indexers ]
206
+ concat_plan = _combine_concat_plans (concat_plans )
198
207
blocks = []
199
208
200
209
for placement , join_units in concat_plan :
201
210
unit = join_units [0 ]
202
211
blk = unit .block
203
212
204
- if len (join_units ) == 1 and not join_units [ 0 ]. indexers :
213
+ if len (join_units ) == 1 :
205
214
values = blk .values
206
215
if copy :
207
216
values = values .copy ()
@@ -225,7 +234,7 @@ def concatenate_managers(
225
234
226
235
fastpath = blk .values .dtype == values .dtype
227
236
else :
228
- values = _concatenate_join_units (join_units , concat_axis , copy = copy )
237
+ values = _concatenate_join_units (join_units , copy = copy )
229
238
fastpath = False
230
239
231
240
if fastpath :
@@ -238,6 +247,42 @@ def concatenate_managers(
238
247
return BlockManager (tuple (blocks ), axes )
239
248
240
249
250
def _concat_managers_axis0(
    mgrs_indexers, axes: list[Index], copy: bool
) -> BlockManager:
    """
    concat_managers specialized to concat_axis=0.

    Pending indexers are applied here through
    _maybe_reindex_columns_na_proxy; the blocks of every resulting manager
    are then collected into one BlockManager, with each block's placement
    shifted past the items of the managers that precede it.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, dict of {axis: indexer}) tuples
    axes : list of Index
        Axes handed to the resulting BlockManager.
    copy : bool
        If True, blocks of managers that came without indexers are copied
        with ``blk.copy()``; if False they are sliced instead.

    Returns
    -------
    BlockManager
    """
    # Record which managers arrive with indexers *before* reindexing:
    # _maybe_reindex_columns_na_proxy hands every manager back paired with
    # an empty dict, so the information is gone afterwards.
    had_reindexers = [len(indexers) > 0 for _, indexers in mgrs_indexers]
    mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)

    offset = 0
    blocks = []
    for (mgr, _), made_copy in zip(mgrs_indexers, had_reindexers):
        # made_copy: if we already reindexed, then we definitely don't need
        # another copy

        for blk in mgr.blocks:
            if made_copy:
                nb = blk.copy(deep=False)
            elif copy:
                nb = blk.copy()
            else:
                # by slicing instead of copy(deep=False), we get a new array
                #  object, see test_concat_copy
                nb = blk.getitem_block(slice(None))
            # Shift this block's placement past all earlier managers' items.
            nb._mgr_locs = nb._mgr_locs.add(offset)
            blocks.append(nb)

        offset += len(mgr.items)
    return BlockManager(tuple(blocks), axes)
284
+
285
+
241
286
def _maybe_reindex_columns_na_proxy(
    axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]]
) -> list[tuple[BlockManager, dict[int, np.ndarray]]]:
    """
    Apply every pending indexer to its manager, so that each returned
    manager is paired with an empty indexers dict.

    Columns added in this reindexing have dtype=np.void, indicating they
    should be ignored when choosing a column's final dtype.

    Parameters
    ----------
    axes : list of Index
        Target axes; ``axes[i]`` is passed as the new axis when reindexing
        along axis ``i``.
    mgrs_indexers : list of (BlockManager, dict of {axis: indexer}) tuples

    Returns
    -------
    list of (BlockManager, dict) tuples
        One entry per input entry, in the same order; the dict is always
        empty. A manager that had no indexers is returned as-is.
    """
    new_mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] = []

    for mgr, indexers in mgrs_indexers:
        # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
        #  is a cheap reindexing.
        for i, indexer in indexers.items():
            mgr = mgr.reindex_indexer(
                axes[i],
                indexer,
                axis=i,
                copy=False,
                only_slice=True,  # only relevant for i==0
                allow_dups=True,
                use_na_proxy=True,  # only relevant for i==0
            )
        new_mgrs_indexers.append((mgr, {}))

    return new_mgrs_indexers
271
314
272
315
273
- def _get_mgr_concatenation_plan (mgr : BlockManager , indexers : dict [ int , np . ndarray ] ):
316
+ def _get_mgr_concatenation_plan (mgr : BlockManager ):
274
317
"""
275
- Construct concatenation plan for given block manager and indexers .
318
+ Construct concatenation plan for given block manager.
276
319
277
320
Parameters
278
321
----------
279
322
mgr : BlockManager
280
- indexers : dict of {axis: indexer}
281
323
282
324
Returns
283
325
-------
@@ -287,27 +329,11 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
287
329
# Calculate post-reindex shape , save for item axis which will be separate
288
330
# for each block anyway.
289
331
mgr_shape_list = list (mgr .shape )
290
- for ax , indexer in indexers .items ():
291
- mgr_shape_list [ax ] = len (indexer )
292
332
mgr_shape = tuple (mgr_shape_list )
293
333
294
- assert 0 not in indexers
295
-
296
- needs_filling = False
297
- if 1 in indexers :
298
- # indexers[1] is shared by all the JoinUnits, so we can save time
299
- # by only doing this check once
300
- if (indexers [1 ] == - 1 ).any ():
301
- needs_filling = True
302
-
303
334
if mgr .is_single_block :
304
335
blk = mgr .blocks [0 ]
305
- return [
306
- (
307
- blk .mgr_locs ,
308
- JoinUnit (blk , mgr_shape , indexers , needs_filling = needs_filling ),
309
- )
310
- ]
336
+ return [(blk .mgr_locs , JoinUnit (blk , mgr_shape ))]
311
337
312
338
blknos = mgr .blknos
313
339
blklocs = mgr .blklocs
@@ -318,8 +344,6 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
318
344
assert placements .is_slice_like
319
345
assert blkno != - 1
320
346
321
- join_unit_indexers = indexers .copy ()
322
-
323
347
shape_list = list (mgr_shape )
324
348
shape_list [0 ] = len (placements )
325
349
shape = tuple (shape_list )
@@ -351,30 +375,21 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
351
375
# Assertions disabled for performance
352
376
# assert blk._mgr_locs.as_slice == placements.as_slice
353
377
# assert blk.shape[0] == shape[0]
354
- unit = JoinUnit (blk , shape , join_unit_indexers , needs_filling = needs_filling )
378
+ unit = JoinUnit (blk , shape )
355
379
356
380
plan .append ((placements , unit ))
357
381
358
382
return plan
359
383
360
384
361
385
class JoinUnit :
362
- def __init__ (
363
- self , block : Block , shape : Shape , indexers = None , * , needs_filling : bool = False
364
- ):
386
+ def __init__ (self , block : Block , shape : Shape ):
365
387
# Passing shape explicitly is required for cases when block is None.
366
- # Note: block is None implies indexers is None, but not vice-versa
367
- if indexers is None :
368
- indexers = {}
369
- # we should *never* have `0 in indexers`
370
388
self .block = block
371
- self .indexers = indexers
372
389
self .shape = shape
373
390
374
- self .needs_filling = needs_filling
375
-
376
391
def __repr__ (self ) -> str :
377
- return f"{ type (self ).__name__ } ({ repr (self .block )} , { self . indexers } )"
392
+ return f"{ type (self ).__name__ } ({ repr (self .block )} )"
378
393
379
394
@cache_readonly
380
395
def is_na (self ) -> bool :
@@ -391,24 +406,14 @@ def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike:
391
406
392
407
else :
393
408
394
- if ( not self .indexers ) and ( not self . block ._can_consolidate ) :
409
+ if not self .block ._can_consolidate :
395
410
# preserve these for validation in concat_compat
396
411
return self .block .values
397
412
398
413
# No dtype upcasting is done here, it will be performed during
399
414
# concatenation itself.
400
415
values = self .block .values
401
416
402
- if not self .indexers :
403
- # If there's no indexing to be done, we want to signal outside
404
- # code that this array must be copied explicitly. This is done
405
- # by returning a view and checking `retval.base`.
406
- values = values .view ()
407
-
408
- else :
409
- for ax , indexer in self .indexers .items ():
410
- values = algos .take_nd (values , indexer , axis = ax )
411
-
412
417
return values
413
418
414
419
@@ -446,15 +451,10 @@ def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
446
451
return missing_arr
447
452
448
453
449
- def _concatenate_join_units (
450
- join_units : list [JoinUnit ], concat_axis : int , copy : bool
451
- ) -> ArrayLike :
454
+ def _concatenate_join_units (join_units : list [JoinUnit ], copy : bool ) -> ArrayLike :
452
455
"""
453
- Concatenate values from several join units along selected axis.
456
+ Concatenate values from several join units along axis=1 .
454
457
"""
455
- if concat_axis == 0 and len (join_units ) > 1 :
456
- # Concatenating join units along ax0 is handled in _merge_blocks.
457
- raise AssertionError ("Concatenating join units along axis0" )
458
458
459
459
empty_dtype = _get_empty_dtype (join_units )
460
460
@@ -488,7 +488,7 @@ def _concatenate_join_units(
488
488
concat_values = ensure_block_shape (concat_values , 2 )
489
489
490
490
else :
491
- concat_values = concat_compat (to_concat , axis = concat_axis )
491
+ concat_values = concat_compat (to_concat , axis = 1 )
492
492
493
493
return concat_values
494
494
@@ -532,7 +532,7 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
532
532
empty_dtype = join_units [0 ].block .dtype
533
533
return empty_dtype
534
534
535
- needs_can_hold_na = any (unit .is_na or unit . needs_filling for unit in join_units )
535
+ needs_can_hold_na = any (unit .is_na for unit in join_units )
536
536
537
537
dtypes = [unit .block .dtype for unit in join_units if not unit .is_na ]
538
538
@@ -569,9 +569,6 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
569
569
# unless we're an extension dtype.
570
570
all (not ju .is_na or ju .block .is_extension for ju in join_units )
571
571
and
572
- # no blocks with indexers (as then the dimensions do not fit)
573
- all (not ju .indexers for ju in join_units )
574
- and
575
572
# only use this path when there is something to concatenate
576
573
len (join_units ) > 1
577
574
)
@@ -591,25 +588,17 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
591
588
592
589
Extra items that didn't fit are returned as a separate block.
593
590
"""
594
- assert 0 not in join_unit .indexers
595
- extra_indexers = join_unit .indexers
596
591
597
592
extra_block = join_unit .block .getitem_block (slice (length , None ))
598
593
join_unit .block = join_unit .block .getitem_block (slice (length ))
599
594
600
595
extra_shape = (join_unit .shape [0 ] - length ,) + join_unit .shape [1 :]
601
596
join_unit .shape = (length ,) + join_unit .shape [1 :]
602
597
603
- # extra_indexers does not introduce any -1s, so we can inherit needs_filling
604
- return JoinUnit (
605
- block = extra_block ,
606
- indexers = extra_indexers ,
607
- shape = extra_shape ,
608
- needs_filling = join_unit .needs_filling ,
609
- )
598
+ return JoinUnit (block = extra_block , shape = extra_shape )
610
599
611
600
612
- def _combine_concat_plans (plans , concat_axis : int ):
601
+ def _combine_concat_plans (plans ):
613
602
"""
614
603
Combine multiple concatenation plans into one.
615
604
@@ -619,18 +608,6 @@ def _combine_concat_plans(plans, concat_axis: int):
619
608
for p in plans [0 ]:
620
609
yield p [0 ], [p [1 ]]
621
610
622
- elif concat_axis == 0 :
623
- offset = 0
624
- for plan in plans :
625
- last_plc = None
626
-
627
- for plc , unit in plan :
628
- yield plc .add (offset ), [unit ]
629
- last_plc = plc
630
-
631
- if last_plc is not None :
632
- offset += last_plc .as_slice .stop
633
-
634
611
else :
635
612
# singleton list so we can modify it as a side-effect within _next_or_none
636
613
num_ended = [0 ]
0 commit comments