Skip to content

Commit 9097263

Browse files
authored
PERF/CLN: Use Numpy C Iterator API to unify isnaobj 1D/2D (pandas-dev#50658)
* PERF: Faster isna * CLN: Unify 1-D and 2-D implementations of isnaobj * remove commented code * Remove all isnaobj2d usages * Address code comments + whatsnew * Remove type stub * placate pylint
1 parent ac3c010 commit 9097263

File tree

6 files changed

+27
-54
lines changed

6 files changed

+27
-54
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -872,6 +872,7 @@ Performance improvements
872872
- Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`)
873873
- Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`)
874874
- Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsets (:issue:`35296`)
875+
- Performance improvement in :func:`isna` and :func:`isnull` (:issue:`50658`)
875876
- Performance improvement in :meth:`.SeriesGroupBy.value_counts` with categorical dtype (:issue:`46202`)
876877
- Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
877878
- Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)

pandas/_libs/algos.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -1366,7 +1366,7 @@ def rank_2d(
13661366
nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, <numeric_object_t>0)
13671367

13681368
if numeric_object_t is object:
1369-
mask = missing.isnaobj2d(values).view(np.uint8)
1369+
mask = missing.isnaobj(values).view(np.uint8)
13701370
elif numeric_object_t is float64_t or numeric_object_t is float32_t:
13711371
mask = np.isnan(values).view(np.uint8)
13721372
else:

pandas/_libs/missing.pyi

-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,5 @@ def isposinf_scalar(val: object) -> bool: ...
1313
def isneginf_scalar(val: object) -> bool: ...
1414
def checknull(val: object, inf_as_na: bool = ...) -> bool: ...
1515
def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ...
16-
def isnaobj2d(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ...
1716
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
1817
def is_float_nan(values: np.ndarray) -> npt.NDArray[np.bool_]: ...

pandas/_libs/missing.pyx

+15-47
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@ from sys import maxsize
44

55
cimport cython
66
from cython cimport Py_ssize_t
7+
78
import numpy as np
89

910
cimport numpy as cnp
1011
from numpy cimport (
12+
flatiter,
1113
float64_t,
1214
int64_t,
1315
ndarray,
@@ -197,56 +199,22 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False):
197199
result : ndarray (dtype=np.bool_)
198200
"""
199201
cdef:
200-
Py_ssize_t i, n
202+
Py_ssize_t i, n = arr.size
201203
object val
202-
ndarray[uint8_t] result
203-
204-
assert arr.ndim == 1, "'arr' must be 1-D."
205-
206-
n = len(arr)
207-
result = np.empty(n, dtype=np.uint8)
208-
for i in range(n):
209-
val = arr[i]
210-
result[i] = checknull(val, inf_as_na=inf_as_na)
211-
return result.view(np.bool_)
212-
213-
214-
@cython.wraparound(False)
215-
@cython.boundscheck(False)
216-
def isnaobj2d(arr: ndarray, inf_as_na: bool = False) -> ndarray:
217-
"""
218-
Return boolean mask denoting which elements of a 2-D array are na-like,
219-
according to the criteria defined in `checknull`:
220-
- None
221-
- nan
222-
- NaT
223-
- np.datetime64 representation of NaT
224-
- np.timedelta64 representation of NaT
225-
- NA
226-
- Decimal("NaN")
227-
228-
Parameters
229-
----------
230-
arr : ndarray
231-
232-
Returns
233-
-------
234-
result : ndarray (dtype=np.bool_)
235-
"""
236-
cdef:
237-
Py_ssize_t i, j, n, m
238-
object val
239-
ndarray[uint8_t, ndim=2] result
240-
241-
assert arr.ndim == 2, "'arr' must be 2-D."
204+
bint is_null
205+
ndarray result = np.empty((<object>arr).shape, dtype=np.uint8)
206+
flatiter it = cnp.PyArray_IterNew(arr)
207+
flatiter it2 = cnp.PyArray_IterNew(result)
242208

243-
n, m = (<object>arr).shape
244-
result = np.zeros((n, m), dtype=np.uint8)
245209
for i in range(n):
246-
for j in range(m):
247-
val = arr[i, j]
248-
if checknull(val, inf_as_na=inf_as_na):
249-
result[i, j] = 1
210+
# The PyArray_GETITEM and PyArray_ITER_NEXT are faster
211+
# equivalents to `val = arr[i]`
212+
val = cnp.PyArray_GETITEM(arr, cnp.PyArray_ITER_DATA(it))
213+
cnp.PyArray_ITER_NEXT(it)
214+
is_null = checknull(val, inf_as_na=inf_as_na)
215+
# Dereference pointer (set value)
216+
(<uint8_t *>(cnp.PyArray_ITER_DATA(it2)))[0] = <uint8_t>is_null
217+
cnp.PyArray_ITER_NEXT(it2)
250218
return result.view(np.bool_)
251219

252220

pandas/core/dtypes/missing.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -313,10 +313,8 @@ def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bo
313313
result = np.zeros(values.shape, dtype=bool)
314314
else:
315315

316-
if values.ndim == 1:
316+
if values.ndim in {1, 2}:
317317
result = libmissing.isnaobj(values, inf_as_na=inf_as_na)
318-
elif values.ndim == 2:
319-
result = libmissing.isnaobj2d(values, inf_as_na=inf_as_na)
320318
else:
321319
# 0-D, reached via e.g. mask_missing
322320
result = libmissing.isnaobj(values.ravel(), inf_as_na=inf_as_na)

pandas/tests/dtypes/test_missing.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -726,9 +726,16 @@ def _check_behavior(self, arr, expected):
726726
arr = np.atleast_2d(arr)
727727
expected = np.atleast_2d(expected)
728728

729-
result = libmissing.isnaobj2d(arr)
729+
result = libmissing.isnaobj(arr)
730+
tm.assert_numpy_array_equal(result, expected)
731+
result = libmissing.isnaobj(arr, inf_as_na=True)
730732
tm.assert_numpy_array_equal(result, expected)
731-
result = libmissing.isnaobj2d(arr, inf_as_na=True)
733+
734+
# Test fortran order
735+
arr = arr.copy(order="F")
736+
result = libmissing.isnaobj(arr)
737+
tm.assert_numpy_array_equal(result, expected)
738+
result = libmissing.isnaobj(arr, inf_as_na=True)
732739
tm.assert_numpy_array_equal(result, expected)
733740

734741
def test_basic(self):

0 commit comments

Comments
 (0)