# Predicate applied to float chunks by ``isna_all``.  ``_use_inf_as_na``
# rebinds it through ``globals()`` so that non-finite values are flagged as
# well while the inf-as-NA option is enabled.
nan_checker = np.isnan
# Module-level copy of the inf-as-NA option, also updated by ``_use_inf_as_na``.
INF_AS_NA = False


def isna_all(arr: ArrayLike) -> bool:
    """
    Optimized equivalent to isna(arr).all()

    Parameters
    ----------
    arr : ArrayLike
        Array to test.  It must support ``len``, expose ``dtype`` and be
        sliceable along its first axis.

    Returns
    -------
    bool
        True if every element is missing, False as soon as one chunk holds a
        non-missing value.  An empty array yields True.
    """
    total_len = len(arr)

    # Checking only a small fraction of the values is usually enough to see
    # that a block is NOT all-null, so the array is scanned chunk by chunk and
    # the scan stops at the first chunk containing a valid value.
    # The parameters 1000 and 40 were chosen arbitrarily.
    chunk_len = max(total_len // 40, 1000)

    dtype = arr.dtype
    if dtype.kind == "f":
        # Bound once per call; reflects the current inf-as-NA setting.
        checker = nan_checker

    elif dtype.kind in ["m", "M"] or dtype.type is Period:
        # Compare the int64 view against the NaT sentinel.
        checker = lambda x: np.asarray(x.view("i8")) == iNaT

    else:
        checker = lambda x: _isna_ndarraylike(x, inf_as_na=INF_AS_NA)

    # all() short-circuits on the first chunk that is not entirely missing.
    return all(
        checker(arr[start : start + chunk_len]).all()
        for start in range(0, total_len, chunk_len)
    )