diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index b021af4694d7d..85d34cac5a7bf 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -148,10 +148,14 @@ def time_searchsorted(self, dtype): class Map: - params = (["dict", "Series", "lambda"], ["object", "category", "int"]) - param_names = "mapper" - - def setup(self, mapper, dtype): + params = ( + ["dict", "Series", "lambda"], + ["object", "category", "int"], + [None, "ignore"], + ) + param_names = ["mapper", "dtype", "na_action"] + + def setup(self, mapper, dtype, na_action): map_size = 1000 map_data = Series(map_size - np.arange(map_size), dtype=dtype) @@ -168,8 +172,8 @@ def setup(self, mapper, dtype): self.s = Series(np.random.randint(0, map_size, 10000), dtype=dtype) - def time_map(self, mapper, *args, **kwargs): - self.s.map(self.map_data) + def time_map(self, mapper, dtype, na_action): + self.s.map(self.map_data, na_action=na_action) class Clip: diff --git a/pandas/_libs/dtypes.pxd b/pandas/_libs/dtypes.pxd index de4b70d387b5f..ccfb2d2ef4a23 100644 --- a/pandas/_libs/dtypes.pxd +++ b/pandas/_libs/dtypes.pxd @@ -34,8 +34,3 @@ ctypedef fused numeric_t: ctypedef fused numeric_object_t: numeric_t object - -ctypedef fused uint8_int64_object_t: - uint8_t - int64_t - object diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4fd68a1593e49..6a31ce84ed418 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -53,6 +53,7 @@ from numpy cimport ( PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, + PyArray_SETITEM, complex128_t, flatiter, float64_t, @@ -75,7 +76,6 @@ cdef extern from "pandas/parser/pd_parser.h": PandasParser_IMPORT from pandas._libs cimport util -from pandas._libs.dtypes cimport uint8_int64_object_t from pandas._libs.util cimport ( INT64_MAX, INT64_MIN, @@ -2845,14 +2845,16 @@ no_default = _NoDefault.no_default # Sentinel indicating the default value. NoDefault = Literal[_NoDefault.no_default] +@cython.boundscheck(False) +@cython.wraparound(False) def map_infer_mask( - ndarray[object] arr, - object f, - const uint8_t[:] mask, - *, - bint convert=True, - object na_value=no_default, - cnp.dtype dtype=np.dtype(object) + ndarray arr, + object f, + const uint8_t[:] mask, + *, + bint convert=True, + object na_value=no_default, + cnp.dtype dtype=np.dtype(object) ) -> "ArrayLike": """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2875,53 +2877,39 @@ def map_infer_mask( ------- np.ndarray or an ExtensionArray """ - cdef Py_ssize_t n = len(arr) - result = np.empty(n, dtype=dtype) - - _map_infer_mask( - result, - arr, - f, - mask, - na_value, - ) - if convert: - return maybe_convert_objects(result) - else: - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def _map_infer_mask( - ndarray[uint8_int64_object_t] out, - ndarray[object] arr, - object f, - const uint8_t[:] mask, - object na_value=no_default, -) -> None: - """ - Helper for map_infer_mask, split off to use fused types based on the result. - """ cdef: - Py_ssize_t i, n + Py_ssize_t i + Py_ssize_t n = len(arr) object val - n = len(arr) + ndarray result = np.empty(n, dtype=dtype) + + flatiter arr_it = PyArray_IterNew(arr) + flatiter result_it = PyArray_IterNew(result) + for i in range(n): if mask[i]: if na_value is no_default: - val = arr[i] + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) else: val = na_value else: - val = f(arr[i]) + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) + val = f(val) if cnp.PyArray_IsZeroDim(val): # unbox 0-dim arrays, GH#690 val = val.item() - out[i] = val + PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val) + + PyArray_ITER_NEXT(arr_it) + PyArray_ITER_NEXT(result_it) + + if convert: + return maybe_convert_objects(result) + else: + return result @cython.boundscheck(False) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ec2534ce174ac..7f19c6e668409 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -627,28 +627,25 @@ def _str_map( na_value = np.nan else: na_value = False - try: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(cast(type, dtype)), - ) - return result - - except ValueError: - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - ) - if convert and result.dtype == object: - result = lib.maybe_convert_objects(result) - return result + + dtype = np.dtype(cast(type, dtype)) + if mask.any(): + # numpy int/bool dtypes cannot hold NaNs so we must convert to + # float64 for int (to match maybe_convert_objects) or + # object for bool (again to match maybe_convert_objects) + if is_integer_dtype(dtype): + dtype = np.dtype("float64") + else: + dtype = np.dtype(object) + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=dtype, + ) + return result elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. StringDtype