diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
index 2f44128cda822..43cdc34eefe53 100644
--- a/pandas/_libs/missing.pyx
+++ b/pandas/_libs/missing.pyx
@@ -220,6 +220,56 @@ cpdef ndarray[uint8_t] isnaobj(ndarray arr):
     return result.view(np.bool_)
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef ndarray[uint8_t] isna_string(ndarray arr):
+    """
+    Return boolean mask denoting which elements of a 1-D array are na-like,
+    assuming the array only contains pd.NA or strings (guaranteed by the
+    StringDtype).
+
+    This is a special case version of `isnaobj` above, specialized to a subset
+    of possible values.
+
+    Parameters
+    ----------
+    arr : ndarray
+
+    Returns
+    -------
+    result : ndarray (dtype=np.bool_)
+
+    Raises
+    ------
+    ValueError
+        If an element is neither pd.NA nor a str.
+    """
+    cdef:
+        Py_ssize_t i, n = arr.size
+        object val
+        ndarray result = np.zeros((<object>arr).shape, dtype=np.uint8)
+        flatiter it = cnp.PyArray_IterNew(arr)
+        flatiter it2 = cnp.PyArray_IterNew(result)
+
+    for i in range(n):
+        # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
+        # equivalents to `val = arr[i]`
+        val = cnp.PyArray_GETITEM(arr, cnp.PyArray_ITER_DATA(it))
+        cnp.PyArray_ITER_NEXT(it)
+        if val is C_NA:
+            # Dereference pointer (set value); ITER_DATA yields an untyped
+            # pointer, so cast to the uint8 element type before writing
+            (<uint8_t *>(cnp.PyArray_ITER_DATA(it2)))[0] = 1
+        elif not isinstance(val, str):
+            # this should never be reached, but for safety
+            # (and doesn't give much overhead)
+            raise ValueError(
+                f"Array should only contain strings or pd.NA, got {type(val)}"
+            )
+        cnp.PyArray_ITER_NEXT(it2)
+    return result.view(np.bool_)
+
+
 def isposinf_scalar(val: object) -> bool:
     return util.is_float_object(val) and val == INF
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 06c54303187eb..4a96374baf422 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -439,6 +439,9 @@ def __arrow_array__(self, type=None):
         values[self.isna()] = None
         return pa.array(values, type=type, from_pandas=True)
 
+    def isna(self) -> np.ndarray:
+        return libmissing.isna_string(self._ndarray)
+
     def _values_for_factorize(self) -> tuple[np.ndarray, None]:
         arr = self._ndarray.copy()
         mask = self.isna()