Skip to content

Commit 1bd88d7

Browse files
authored
PERF: internals.concat (pandas-dev#43021)
1 parent 0799773 commit 1bd88d7

File tree

3 files changed

+20
-15
lines changed

3 files changed

+20
-15
lines changed

pandas/_libs/internals.pyi

+2
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,8 @@ class BlockPlacement:
3333
@property
3434
def as_array(self) -> np.ndarray: ...
3535
@property
36+
def as_slice(self) -> slice: ...
37+
@property
3638
def is_slice_like(self) -> bool: ...
3739
@overload
3840
def __getitem__(self, loc: slice | Sequence[int]) -> BlockPlacement: ...

pandas/_libs/internals.pyx

+1-1
Original file line number | Diff line number | Diff line change
@@ -395,7 +395,7 @@ def get_blkno_indexers(
395395
cdef:
396396
int64_t cur_blkno
397397
Py_ssize_t i, start, stop, n, diff, tot_len
398-
object blkno
398+
int64_t blkno
399399
object group_dict = defaultdict(list)
400400

401401
n = blknos.shape[0]

pandas/core/internals/concat.py

+17-14
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,6 @@
2929
is_datetime64tz_dtype,
3030
is_dtype_equal,
3131
is_extension_array_dtype,
32-
is_sparse,
3332
)
3433
from pandas.core.dtypes.concat import (
3534
cast_to_common_type,
@@ -46,6 +45,7 @@
4645
DatetimeArray,
4746
ExtensionArray,
4847
)
48+
from pandas.core.arrays.sparse import SparseDtype
4949
from pandas.core.construction import ensure_wrapped_if_datetimelike
5050
from pandas.core.internals.array_manager import (
5151
ArrayManager,
@@ -260,7 +260,10 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
260260
mgr_shape_list[ax] = len(indexer)
261261
mgr_shape = tuple(mgr_shape_list)
262262

263+
has_column_indexer = False
264+
263265
if 0 in indexers:
266+
has_column_indexer = True
264267
ax0_indexer = indexers.pop(0)
265268
blknos = algos.take_nd(mgr.blknos, ax0_indexer, fill_value=-1)
266269
blklocs = algos.take_nd(mgr.blklocs, ax0_indexer, fill_value=-1)
@@ -270,9 +273,6 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
270273
blk = mgr.blocks[0]
271274
return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]
272275

273-
# error: Incompatible types in assignment (expression has type "None", variable
274-
# has type "ndarray")
275-
ax0_indexer = None # type: ignore[assignment]
276276
blknos = mgr.blknos
277277
blklocs = mgr.blklocs
278278

@@ -288,6 +288,7 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
288288
shape = tuple(shape_list)
289289

290290
if blkno == -1:
291+
# only reachable in the `0 in indexers` case
291292
unit = JoinUnit(None, shape)
292293
else:
293294
blk = mgr.blocks[blkno]
@@ -302,7 +303,7 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
302303
# placement was sequential before.
303304
(
304305
(
305-
ax0_indexer is None
306+
not has_column_indexer
306307
and blk.mgr_locs.is_slice_like
307308
and blk.mgr_locs.as_slice.step == 1
308309
)
@@ -330,6 +331,7 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra
330331
class JoinUnit:
331332
def __init__(self, block, shape: Shape, indexers=None):
332333
# Passing shape explicitly is required for cases when block is None.
334+
# Note: block is None implies indexers is None, but not vice-versa
333335
if indexers is None:
334336
indexers = {}
335337
self.block = block
@@ -358,7 +360,7 @@ def dtype(self):
358360
return blk.dtype
359361
return ensure_dtype_can_hold_na(blk.dtype)
360362

361-
def is_valid_na_for(self, dtype: DtypeObj) -> bool:
363+
def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
362364
"""
363365
Check that we are all-NA of a type/dtype that is compatible with this dtype.
364366
Augments `self.is_na` with an additional check of the type of NA values.
@@ -389,11 +391,8 @@ def is_na(self) -> bool:
389391
if not self.block._can_hold_na:
390392
return False
391393

392-
# Usually it's enough to check but a small fraction of values to see if
393-
# a block is NOT null, chunks should help in such cases. 1000 value
394-
# was chosen rather arbitrarily.
395394
values = self.block.values
396-
if is_sparse(self.block.values.dtype):
395+
if isinstance(self.block.values.dtype, SparseDtype):
397396
return False
398397
elif self.block.is_extension:
399398
# TODO(EA2D): no need for special case with 2D EAs
@@ -411,7 +410,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
411410
else:
412411
fill_value = upcasted_na
413412

414-
if self.is_valid_na_for(empty_dtype):
413+
if self._is_valid_na_for(empty_dtype):
414+
# note: always holds when self.block is None
415415
blk_dtype = getattr(self.block, "dtype", None)
416416

417417
if blk_dtype == np.dtype("object"):
@@ -592,13 +592,16 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
592592
_concatenate_join_units (which uses `concat_compat`).
593593
594594
"""
595+
first = join_units[0].block
596+
if first is None:
597+
return False
595598
return (
596-
# all blocks need to have the same type
597-
all(type(ju.block) is type(join_units[0].block) for ju in join_units) # noqa
599+
# exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
600+
all(type(ju.block) is type(first) for ju in join_units)
598601
and
599602
# e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform
600603
all(
601-
is_dtype_equal(ju.block.dtype, join_units[0].block.dtype)
604+
is_dtype_equal(ju.block.dtype, first.dtype)
602605
# GH#42092 we only want the dtype_equal check for non-numeric blocks
603606
# (for now, may change but that would need a deprecation)
604607
or ju.block.dtype.kind in ["b", "i", "u"]

0 commit comments

Comments (0)