Skip to content

Commit 4fef063

Browse files
authored
PERF: homogeneous concat (#52685)
* PERF: homogeneous concat * Handle float32, whatsnew * more specific whatsnew
1 parent 68b2fa6 commit 4fef063

File tree

2 files changed

+69
-0
lines changed

2 files changed

+69
-0
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ Other enhancements
9494
- Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`)
9595
- Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`)
9696
- Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"``
97+
- Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
9798
-
9899

99100
.. ---------------------------------------------------------------------------

pandas/core/internals/concat.py

+68
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from pandas._libs import (
1313
NaT,
14+
algos as libalgos,
1415
internals as libinternals,
1516
lib,
1617
)
@@ -59,6 +60,7 @@
5960
AxisInt,
6061
DtypeObj,
6162
Manager,
63+
Shape,
6264
)
6365

6466
from pandas import Index
@@ -202,6 +204,21 @@ def concatenate_managers(
202204
if concat_axis == 0:
203205
return _concat_managers_axis0(mgrs_indexers, axes, copy)
204206

207+
if len(mgrs_indexers) > 0 and mgrs_indexers[0][0].nblocks > 0:
208+
first_dtype = mgrs_indexers[0][0].blocks[0].dtype
209+
if first_dtype in [np.float64, np.float32]:
210+
# TODO: support more dtypes here. This will be simpler once
211+
# JoinUnit.is_na behavior is deprecated.
212+
if (
213+
all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)
214+
and len(mgrs_indexers) > 1
215+
):
216+
# Fastpath!
217+
# Length restriction is just to avoid having to worry about 'copy'
218+
shape = tuple(len(x) for x in axes)
219+
nb = _concat_homogeneous_fastpath(mgrs_indexers, shape, first_dtype)
220+
return BlockManager((nb,), axes)
221+
205222
mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
206223

207224
concat_plan = _get_combined_plan([mgr for mgr, _ in mgrs_indexers])
@@ -322,6 +339,57 @@ def _maybe_reindex_columns_na_proxy(
322339
return new_mgrs_indexers
323340

324341

342+
def _is_homogeneous_mgr(mgr: BlockManager, first_dtype: DtypeObj) -> bool:
    """
    Tell whether ``mgr`` can be handled as one plain ndarray of ``first_dtype``.

    The answer is True only when all of the following hold:

    * the manager has exactly one Block,
    * that Block's placement is slice-like and the slice has step 1,
    * the Block's dtype compares equal to ``first_dtype``.
    """
    if mgr.nblocks == 1:
        only_block = mgr.blocks[0]
        placement = only_block.mgr_locs
        # A step other than 1 would require reordering on the way in,
        # so such managers are rejected.
        if placement.is_slice_like and placement.as_slice.step == 1:
            return only_block.dtype == first_dtype
    return False
353+
354+
355+
def _concat_homogeneous_fastpath(
    mgrs_indexers, shape: Shape, first_dtype: np.dtype
) -> Block:
    """
    Concatenate single-Block managers that share one dtype (a dtype that can
    already hold nan) by writing each manager's values straight into one
    preallocated array.

    Parameters
    ----------
    mgrs_indexers : iterable of (manager, indexers) pairs
        If ``0 in indexers``, ``indexers[0]`` is handed to the take function
        for that manager; otherwise the manager's values are copied as-is.
    shape : Shape
        Shape of the preallocated result array.
    first_dtype : np.dtype
        dtype of the result; ``np.float64`` selects the float64 take function,
        anything else selects the float32 one.

    Returns
    -------
    Block
        A new Block wrapping the filled array, placed at ``slice(shape[0])``.
    """
    # Precondition (not checked here):
    #  all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)
    out = np.empty(shape, dtype=first_dtype)

    take_func = (
        libalgos.take_2d_axis0_float64_float64
        if first_dtype == np.float64
        else libalgos.take_2d_axis0_float32_float32
    )

    offset = 0
    for mgr, indexers in mgrs_indexers:
        # Each manager fills the next ``mgr.shape[1]`` columns of ``out``.
        stop = offset + mgr.shape[1]

        if 0 not in indexers:
            # Nothing to reindex for this manager: plain copy.
            out[:, offset:stop] = mgr.blocks[0].values
        else:
            # Reindex while writing directly into the output window.
            take_func(
                mgr.blocks[0].values,
                indexers[0],
                out[:, offset:stop],
            )

        offset = stop

    placement = libinternals.BlockPlacement(slice(shape[0]))
    return new_block_2d(out, placement)
391+
392+
325393
def _get_combined_plan(
326394
mgrs: list[BlockManager],
327395
) -> list[tuple[BlockPlacement, list[JoinUnit]]]:

0 commit comments

Comments
 (0)