Skip to content

Commit 22f6dc2

Browse files
jbrockmendel authored and Kevin D Smith committed
PERF: JoinUnit.is_na (pandas-dev#36312)
1 parent 3cf1a18 commit 22f6dc2

File tree

2 files changed

+42
-10
lines changed

2 files changed

+42
-10
lines changed

pandas/core/dtypes/missing.py

+38-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from pandas._libs import lib
1111
import pandas._libs.missing as libmissing
12-
from pandas._libs.tslibs import NaT, iNaT
12+
from pandas._libs.tslibs import NaT, Period, iNaT
1313
from pandas._typing import ArrayLike, DtypeObj
1414

1515
from pandas.core.dtypes.common import (
@@ -43,6 +43,9 @@
4343
isposinf_scalar = libmissing.isposinf_scalar
4444
isneginf_scalar = libmissing.isneginf_scalar
4545

46+
# Checker that isna_all applies to float-dtype arrays.  _use_inf_as_na
# rebinds it to ``~np.isfinite`` when the inf-as-NA option is enabled and
# back to np.isnan when it is disabled.
nan_checker = np.isnan
47+
# Module-level mirror of the inf-as-NA option, kept in sync by
# _use_inf_as_na and forwarded by isna_all to _isna_ndarraylike.
INF_AS_NA = False
48+
4649

4750
def isna(obj):
4851
"""
@@ -188,6 +191,12 @@ def _use_inf_as_na(key):
188191
"""
189192
inf_as_na = get_option(key)
190193
globals()["_isna"] = partial(_isna, inf_as_na=inf_as_na)
194+
if inf_as_na:
195+
globals()["nan_checker"] = lambda x: ~np.isfinite(x)
196+
globals()["INF_AS_NA"] = True
197+
else:
198+
globals()["nan_checker"] = np.isnan
199+
globals()["INF_AS_NA"] = False
191200

192201

193202
def _isna_ndarraylike(obj, inf_as_na: bool = False):
@@ -602,3 +611,31 @@ def is_valid_nat_for_dtype(obj, dtype: DtypeObj) -> bool:
602611

603612
# must be PeriodDtype
604613
return not isinstance(obj, (np.datetime64, np.timedelta64))
614+
615+
616+
def isna_all(arr: ArrayLike) -> bool:
    """
    Optimized equivalent to isna(arr).all()

    The array is scanned chunk by chunk so the check can stop at the first
    chunk that contains a non-missing value, instead of computing a mask
    for the whole array up front.

    Parameters
    ----------
    arr : ArrayLike
        Array to inspect. It must support ``len``, expose ``dtype`` and
        be sliceable.

    Returns
    -------
    bool
        True if every element is missing (also True for an empty array),
        False as soon as one chunk contains a non-missing value.
    """
    total_len = len(arr)

    # Usually it is enough to check only a small fraction of the values to
    # see that a block is NOT all-null, so chunking allows an early exit.
    # The parameters 1000 and 40 were chosen arbitrarily.
    chunk_len = max(total_len // 40, 1000)

    def _is_inat(values):
        # Compare the int64 view of the values against the iNaT sentinel.
        return np.asarray(values.view("i8")) == iNaT

    def _is_na_generic(values):
        # Read INF_AS_NA at call time so the current setting is honoured.
        return _isna_ndarraylike(values, inf_as_na=INF_AS_NA)

    dtype = arr.dtype
    if dtype.kind == "f":
        # Module-level float checker; _use_inf_as_na may have rebound it.
        checker = nan_checker
    elif dtype.kind in ("m", "M") or dtype.type is Period:
        checker = _is_inat
    else:
        checker = _is_na_generic

    # all() short-circuits on the first chunk that is not entirely missing.
    return all(
        checker(arr[start : start + chunk_len]).all()
        for start in range(0, total_len, chunk_len)
    )

pandas/core/internals/concat.py

+4-9
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
is_timedelta64_dtype,
2222
)
2323
from pandas.core.dtypes.concat import concat_compat
24-
from pandas.core.dtypes.missing import isna
24+
from pandas.core.dtypes.missing import isna_all
2525

2626
import pandas.core.algorithms as algos
2727
from pandas.core.arrays import DatetimeArray, ExtensionArray
@@ -223,13 +223,8 @@ def is_na(self):
223223
values_flat = values
224224
else:
225225
values_flat = values.ravel(order="K")
226-
total_len = values_flat.shape[0]
227-
chunk_len = max(total_len // 40, 1000)
228-
for i in range(0, total_len, chunk_len):
229-
if not isna(values_flat[i : i + chunk_len]).all():
230-
return False
231226

232-
return True
227+
return isna_all(values_flat)
233228

234229
def get_reindexed_values(self, empty_dtype, upcasted_na):
235230
if upcasted_na is None:
@@ -474,8 +469,8 @@ def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:
474469
# cannot necessarily join
475470
return (
476471
# all blocks need to have the same type
477-
all(type(ju.block) is type(join_units[0].block) for ju in join_units)
478-
and # noqa
472+
all(type(ju.block) is type(join_units[0].block) for ju in join_units) # noqa
473+
and
479474
# no blocks that would get missing values (can lead to type upcasts)
480475
# unless we're an extension dtype.
481476
all(not ju.is_na or ju.block.is_extension for ju in join_units)

0 commit comments

Comments
 (0)