Skip to content

Commit 978c1eb

Browse files
Revert "CLN: remove unused concat code (pandas-dev#43577)"
This reverts commit 95eb153.
1 parent 88fb277 commit 978c1eb

File tree

2 files changed

+93
-6
lines changed

2 files changed

+93
-6
lines changed

pandas/core/dtypes/missing.py

+38
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import pandas._libs.missing as libmissing
1919
from pandas._libs.tslibs import (
2020
NaT,
21+
Period,
2122
iNaT,
2223
)
2324

@@ -739,3 +740,40 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool:
739740

740741
# fallback, default to allowing NaN, None, NA, NaT
741742
return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))
743+
744+
745+
def isna_all(arr: ArrayLike) -> bool:
    """
    Optimized equivalent to isna(arr).all()

    The array is examined one slice at a time so that the scan can stop at
    the first slice containing a non-null value.
    """

    def _i8_is_nat(chunk):
        # compare the "i8" view of the chunk against iNaT
        return np.asarray(chunk.view("i8")) == iNaT

    def _generic_isna(chunk):
        # INF_AS_NA is read at call time, not when the helper is defined
        return _isna_array(chunk, inf_as_na=INF_AS_NA)

    n_values = len(arr)

    # A block that is NOT all-null can usually be recognised from a small
    # fraction of its values, so working in slices lets us bail out early.
    # The parameters 1000 and 40 were chosen arbitrarily.
    step = max(n_values // 40, 1000)

    arr_dtype = arr.dtype
    if arr_dtype.kind == "f":
        is_null = nan_checker
    elif arr_dtype.kind in ["m", "M"] or arr_dtype.type is Period:
        is_null = _i8_is_nat
    else:
        is_null = _generic_isna

    for start in range(0, n_values, step):
        if not is_null(arr[start : start + step]).all():
            # found a slice with at least one non-null value
            return False
    return True

pandas/core/internals/concat.py

+55-6
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@
1010

1111
import numpy as np
1212

13-
from pandas._libs import internals as libinternals
13+
from pandas._libs import (
14+
NaT,
15+
internals as libinternals,
16+
)
17+
from pandas._libs.missing import NA
1418
from pandas._typing import (
1519
ArrayLike,
1620
DtypeObj,
@@ -27,12 +31,14 @@
2731
is_1d_only_ea_dtype,
2832
is_datetime64tz_dtype,
2933
is_dtype_equal,
34+
needs_i8_conversion,
3035
)
3136
from pandas.core.dtypes.concat import (
3237
cast_to_common_type,
3338
concat_compat,
3439
)
3540
from pandas.core.dtypes.dtypes import ExtensionDtype
41+
from pandas.core.dtypes.missing import is_valid_na_for_dtype
3642

3743
import pandas.core.algorithms as algos
3844
from pandas.core.arrays import (
@@ -378,6 +384,36 @@ def dtype(self):
378384
return blk.dtype
379385
return ensure_dtype_can_hold_na(blk.dtype)
380386

387+
def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
    """
    Check that we are all-NA of a type/dtype that is compatible with this dtype.

    Augments `self.is_na` with an additional check of the type of NA values.
    """
    if not self.is_na:
        return False

    blk = self.block
    if blk.dtype.kind == "V":
        return True

    if self.dtype == object:
        # every individual element has to be an acceptable NA for ``dtype``
        for item in blk.values.ravel(order="K"):
            if not is_valid_na_for_dtype(item, dtype):
                return False
        return True

    fill = blk.fill_value

    if fill is NaT and not is_dtype_equal(self.dtype, dtype):
        # e.g. we are dt64 and other is td64
        # fill_values match but we should not cast self.block.values to dtype
        # TODO: this will need updating if we ever have non-nano dt64/td64
        return False

    if fill is NA and needs_i8_conversion(dtype):
        # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat
        # e.g. self.dtype == "Int64" and dtype is td64, we don't want
        # to consider these as matching
        return False

    # TODO: better to use can_hold_element?
    return is_valid_na_for_dtype(fill, dtype)
416+
381417
@cache_readonly
382418
def is_na(self) -> bool:
383419
blk = self.block
@@ -388,14 +424,24 @@ def is_na(self) -> bool:
388424
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
389425
values: ArrayLike
390426

391-
if upcasted_na is None and not self.is_na:
427+
if upcasted_na is None and self.block.dtype.kind != "V":
392428
# No upcasting is necessary
393429
fill_value = self.block.fill_value
394430
values = self.block.get_values()
395431
else:
396432
fill_value = upcasted_na
397433

398-
if self.is_na:
434+
if self._is_valid_na_for(empty_dtype):
435+
# note: always holds when self.block.dtype.kind == "V"
436+
blk_dtype = self.block.dtype
437+
438+
if blk_dtype == np.dtype("object"):
439+
# we want to avoid filling with np.nan if we are
440+
# using None; we already know that we are all
441+
# nulls
442+
values = self.block.values.ravel(order="K")
443+
if len(values) and values[0] is None:
444+
fill_value = None
399445

400446
if is_datetime64tz_dtype(empty_dtype):
401447
i8values = np.full(self.shape, fill_value.value)
@@ -464,7 +510,8 @@ def _concatenate_join_units(
464510

465511
empty_dtype = _get_empty_dtype(join_units)
466512

467-
upcasted_na = _dtype_to_na_value(empty_dtype)
513+
has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
514+
upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
468515

469516
to_concat = [
470517
ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
@@ -506,7 +553,7 @@ def _concatenate_join_units(
506553
return concat_values
507554

508555

509-
def _dtype_to_na_value(dtype: DtypeObj):
556+
def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
510557
"""
511558
Find the NA value to go with this dtype.
512559
"""
@@ -544,9 +591,11 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
544591
empty_dtype = join_units[0].block.dtype
545592
return empty_dtype
546593

547-
has_none_blocks = any(unit.is_na for unit in join_units)
594+
has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
548595

549596
dtypes = [unit.dtype for unit in join_units if not unit.is_na]
597+
if not len(dtypes):
598+
dtypes = [unit.dtype for unit in join_units if unit.block.dtype.kind != "V"]
550599

551600
dtype = find_common_type(dtypes)
552601
if has_none_blocks:

0 commit comments

Comments
 (0)