36
36
)
37
37
from pandas .core .dtypes .dtypes import ExtensionDtype
38
38
39
+ import pandas .core .algorithms as algos
39
40
from pandas .core .arrays import (
40
41
DatetimeArray ,
41
42
ExtensionArray ,
@@ -191,29 +192,19 @@ def concatenate_managers(
191
192
if isinstance (mgrs_indexers [0 ][0 ], ArrayManager ):
192
193
return _concatenate_array_managers (mgrs_indexers , axes , concat_axis , copy )
193
194
194
- # Assertions disabled for performance
195
- # for tup in mgrs_indexers:
196
- # # caller is responsible for ensuring this
197
- # indexers = tup[1]
198
- # assert concat_axis not in indexers
199
-
200
- if concat_axis == 0 :
201
- return _concat_managers_axis0 (mgrs_indexers , axes , copy )
202
-
203
195
mgrs_indexers = _maybe_reindex_columns_na_proxy (axes , mgrs_indexers )
204
196
205
- # Assertion disabled for performance
206
- # assert all(not x[1] for x in mgrs_indexers)
207
-
208
- concat_plans = [_get_mgr_concatenation_plan (mgr ) for mgr , _ in mgrs_indexers ]
209
- concat_plan = _combine_concat_plans (concat_plans )
197
+ concat_plans = [
198
+ _get_mgr_concatenation_plan (mgr , indexers ) for mgr , indexers in mgrs_indexers
199
+ ]
200
+ concat_plan = _combine_concat_plans (concat_plans , concat_axis )
210
201
blocks = []
211
202
212
203
for placement , join_units in concat_plan :
213
204
unit = join_units [0 ]
214
205
blk = unit .block
215
206
216
- if len (join_units ) == 1 :
207
+ if len (join_units ) == 1 and not join_units [ 0 ]. indexers :
217
208
values = blk .values
218
209
if copy :
219
210
values = values .copy ()
@@ -237,7 +228,7 @@ def concatenate_managers(
237
228
238
229
fastpath = blk .values .dtype == values .dtype
239
230
else :
240
- values = _concatenate_join_units (join_units , copy = copy )
231
+ values = _concatenate_join_units (join_units , concat_axis , copy = copy )
241
232
fastpath = False
242
233
243
234
if fastpath :
@@ -250,42 +241,6 @@ def concatenate_managers(
250
241
return BlockManager (tuple (blocks ), axes )
251
242
252
243
253
- def _concat_managers_axis0 (
254
- mgrs_indexers , axes : list [Index ], copy : bool
255
- ) -> BlockManager :
256
- """
257
- concat_managers specialized to concat_axis=0, with reindexing already
258
- having been done in _maybe_reindex_columns_na_proxy.
259
- """
260
- had_reindexers = {
261
- i : len (mgrs_indexers [i ][1 ]) > 0 for i in range (len (mgrs_indexers ))
262
- }
263
- mgrs_indexers = _maybe_reindex_columns_na_proxy (axes , mgrs_indexers )
264
-
265
- mgrs = [x [0 ] for x in mgrs_indexers ]
266
-
267
- offset = 0
268
- blocks = []
269
- for i , mgr in enumerate (mgrs ):
270
- # If we already reindexed, then we definitely don't need another copy
271
- made_copy = had_reindexers [i ]
272
-
273
- for blk in mgr .blocks :
274
- if made_copy :
275
- nb = blk .copy (deep = False )
276
- elif copy :
277
- nb = blk .copy ()
278
- else :
279
- # by slicing instead of copy(deep=False), we get a new array
280
- # object, see test_concat_copy
281
- nb = blk .getitem_block (slice (None ))
282
- nb ._mgr_locs = nb ._mgr_locs .add (offset )
283
- blocks .append (nb )
284
-
285
- offset += len (mgr .items )
286
- return BlockManager (tuple (blocks ), axes )
287
-
288
-
289
244
def _maybe_reindex_columns_na_proxy (
290
245
axes : list [Index ], mgrs_indexers : list [tuple [BlockManager , dict [int , np .ndarray ]]]
291
246
) -> list [tuple [BlockManager , dict [int , np .ndarray ]]]:
@@ -296,33 +251,36 @@ def _maybe_reindex_columns_na_proxy(
296
251
Columns added in this reindexing have dtype=np.void, indicating they
297
252
should be ignored when choosing a column's final dtype.
298
253
"""
299
- new_mgrs_indexers : list [tuple [BlockManager , dict [int , np .ndarray ]]] = []
300
-
254
+ new_mgrs_indexers = []
301
255
for mgr , indexers in mgrs_indexers :
302
- # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
303
- # is a cheap reindexing.
304
- for i , indexer in indexers .items ():
305
- mgr = mgr .reindex_indexer (
306
- axes [i ],
307
- indexers [i ],
308
- axis = i ,
256
+ # We only reindex for axis=0 (i.e. columns), as this can be done cheaply
257
+ if 0 in indexers :
258
+ new_mgr = mgr .reindex_indexer (
259
+ axes [0 ],
260
+ indexers [0 ],
261
+ axis = 0 ,
309
262
copy = False ,
310
- only_slice = True , # only relevant for i==0
263
+ only_slice = True ,
311
264
allow_dups = True ,
312
- use_na_proxy = True , # only relevant for i==0
265
+ use_na_proxy = True ,
313
266
)
314
- new_mgrs_indexers .append ((mgr , {}))
267
+ new_indexers = indexers .copy ()
268
+ del new_indexers [0 ]
269
+ new_mgrs_indexers .append ((new_mgr , new_indexers ))
270
+ else :
271
+ new_mgrs_indexers .append ((mgr , indexers ))
315
272
316
273
return new_mgrs_indexers
317
274
318
275
319
- def _get_mgr_concatenation_plan (mgr : BlockManager ):
276
+ def _get_mgr_concatenation_plan (mgr : BlockManager , indexers : dict [ int , np . ndarray ] ):
320
277
"""
321
- Construct concatenation plan for given block manager.
278
+ Construct concatenation plan for given block manager and indexers .
322
279
323
280
Parameters
324
281
----------
325
282
mgr : BlockManager
283
+ indexers : dict of {axis: indexer}
326
284
327
285
Returns
328
286
-------
@@ -332,11 +290,27 @@ def _get_mgr_concatenation_plan(mgr: BlockManager):
332
290
# Calculate post-reindex shape , save for item axis which will be separate
333
291
# for each block anyway.
334
292
mgr_shape_list = list (mgr .shape )
293
+ for ax , indexer in indexers .items ():
294
+ mgr_shape_list [ax ] = len (indexer )
335
295
mgr_shape = tuple (mgr_shape_list )
336
296
297
+ assert 0 not in indexers
298
+
299
+ needs_filling = False
300
+ if 1 in indexers :
301
+ # indexers[1] is shared by all the JoinUnits, so we can save time
302
+ # by only doing this check once
303
+ if (indexers [1 ] == - 1 ).any ():
304
+ needs_filling = True
305
+
337
306
if mgr .is_single_block :
338
307
blk = mgr .blocks [0 ]
339
- return [(blk .mgr_locs , JoinUnit (blk , mgr_shape ))]
308
+ return [
309
+ (
310
+ blk .mgr_locs ,
311
+ JoinUnit (blk , mgr_shape , indexers , needs_filling = needs_filling ),
312
+ )
313
+ ]
340
314
341
315
blknos = mgr .blknos
342
316
blklocs = mgr .blklocs
@@ -347,6 +321,8 @@ def _get_mgr_concatenation_plan(mgr: BlockManager):
347
321
assert placements .is_slice_like
348
322
assert blkno != - 1
349
323
324
+ join_unit_indexers = indexers .copy ()
325
+
350
326
shape_list = list (mgr_shape )
351
327
shape_list [0 ] = len (placements )
352
328
shape = tuple (shape_list )
@@ -380,21 +356,30 @@ def _get_mgr_concatenation_plan(mgr: BlockManager):
380
356
# Assertions disabled for performance
381
357
# assert blk._mgr_locs.as_slice == placements.as_slice
382
358
# assert blk.shape[0] == shape[0]
383
- unit = JoinUnit (blk , shape )
359
+ unit = JoinUnit (blk , shape , join_unit_indexers , needs_filling = needs_filling )
384
360
385
361
plan .append ((placements , unit ))
386
362
387
363
return plan
388
364
389
365
390
366
class JoinUnit :
391
- def __init__ (self , block : Block , shape : Shape ):
367
+ def __init__ (
368
+ self , block : Block , shape : Shape , indexers = None , * , needs_filling : bool = False
369
+ ):
392
370
# Passing shape explicitly is required for cases when block is None.
371
+ # Note: block is None implies indexers is None, but not vice-versa
372
+ if indexers is None :
373
+ indexers = {}
374
+ # we should *never* have `0 in indexers`
393
375
self .block = block
376
+ self .indexers = indexers
394
377
self .shape = shape
395
378
379
+ self .needs_filling = needs_filling
380
+
396
381
def __repr__ (self ) -> str :
397
- return f"{ type (self ).__name__ } ({ repr (self .block )} )"
382
+ return f"{ type (self ).__name__ } ({ repr (self .block )} , { self . indexers } )"
398
383
399
384
@cache_readonly
400
385
def is_na (self ) -> bool :
@@ -411,14 +396,24 @@ def get_reindexed_values(self, empty_dtype: DtypeObj) -> ArrayLike:
411
396
412
397
else :
413
398
414
- if not self .block ._can_consolidate :
399
+ if ( not self .indexers ) and ( not self . block ._can_consolidate ) :
415
400
# preserve these for validation in concat_compat
416
401
return self .block .values
417
402
418
403
# No dtype upcasting is done here, it will be performed during
419
404
# concatenation itself.
420
405
values = self .block .values
421
406
407
+ if not self .indexers :
408
+ # If there's no indexing to be done, we want to signal outside
409
+ # code that this array must be copied explicitly. This is done
410
+ # by returning a view and checking `retval.base`.
411
+ values = values .view ()
412
+
413
+ else :
414
+ for ax , indexer in self .indexers .items ():
415
+ values = algos .take_nd (values , indexer , axis = ax )
416
+
422
417
return values
423
418
424
419
@@ -456,10 +451,15 @@ def make_na_array(dtype: DtypeObj, shape: Shape) -> ArrayLike:
456
451
return missing_arr
457
452
458
453
459
- def _concatenate_join_units (join_units : list [JoinUnit ], copy : bool ) -> ArrayLike :
454
+ def _concatenate_join_units (
455
+ join_units : list [JoinUnit ], concat_axis : int , copy : bool
456
+ ) -> ArrayLike :
460
457
"""
461
- Concatenate values from several join units along axis=1 .
458
+ Concatenate values from several join units along selected axis.
462
459
"""
460
+ if concat_axis == 0 and len (join_units ) > 1 :
461
+ # Concatenating join units along ax0 is handled in _merge_blocks.
462
+ raise AssertionError ("Concatenating join units along axis0" )
463
463
464
464
empty_dtype = _get_empty_dtype (join_units )
465
465
@@ -495,7 +495,7 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike
495
495
concat_values = ensure_block_shape (concat_values , 2 )
496
496
497
497
else :
498
- concat_values = concat_compat (to_concat , axis = 1 )
498
+ concat_values = concat_compat (to_concat , axis = concat_axis )
499
499
500
500
return concat_values
501
501
@@ -538,7 +538,7 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
538
538
empty_dtype = join_units [0 ].block .dtype
539
539
return empty_dtype
540
540
541
- needs_can_hold_na = any (unit .is_na for unit in join_units )
541
+ needs_can_hold_na = any (unit .is_na or unit . needs_filling for unit in join_units )
542
542
543
543
dtypes = [unit .block .dtype for unit in join_units if not unit .is_na ]
544
544
@@ -575,6 +575,9 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
575
575
# unless we're an extension dtype.
576
576
all (not ju .is_na or ju .block .is_extension for ju in join_units )
577
577
and
578
+ # no blocks with indexers (as then the dimensions do not fit)
579
+ all (not ju .indexers for ju in join_units )
580
+ and
578
581
# only use this path when there is something to concatenate
579
582
len (join_units ) > 1
580
583
)
@@ -594,17 +597,25 @@ def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
594
597
595
598
Extra items that didn't fit are returned as a separate block.
596
599
"""
600
+ assert 0 not in join_unit .indexers
601
+ extra_indexers = join_unit .indexers
597
602
598
603
extra_block = join_unit .block .getitem_block (slice (length , None ))
599
604
join_unit .block = join_unit .block .getitem_block (slice (length ))
600
605
601
606
extra_shape = (join_unit .shape [0 ] - length ,) + join_unit .shape [1 :]
602
607
join_unit .shape = (length ,) + join_unit .shape [1 :]
603
608
604
- return JoinUnit (block = extra_block , shape = extra_shape )
609
+ # extra_indexers does not introduce any -1s, so we can inherit needs_filling
610
+ return JoinUnit (
611
+ block = extra_block ,
612
+ indexers = extra_indexers ,
613
+ shape = extra_shape ,
614
+ needs_filling = join_unit .needs_filling ,
615
+ )
605
616
606
617
607
- def _combine_concat_plans (plans ):
618
+ def _combine_concat_plans (plans , concat_axis : int ):
608
619
"""
609
620
Combine multiple concatenation plans into one.
610
621
@@ -614,6 +625,18 @@ def _combine_concat_plans(plans):
614
625
for p in plans [0 ]:
615
626
yield p [0 ], [p [1 ]]
616
627
628
+ elif concat_axis == 0 :
629
+ offset = 0
630
+ for plan in plans :
631
+ last_plc = None
632
+
633
+ for plc , unit in plan :
634
+ yield plc .add (offset ), [unit ]
635
+ last_plc = plc
636
+
637
+ if last_plc is not None :
638
+ offset += last_plc .as_slice .stop
639
+
617
640
else :
618
641
# singleton list so we can modify it as a side-effect within _next_or_none
619
642
num_ended = [0 ]
0 commit comments