Skip to content

Commit 24182c2

Browse files
authored
CLN: Simplify map_infer_mask (#58483)
* CLN: Simplify map_infer_mask
* fix some tests
* fix tests?
* Fix types?
* fixup annotations
1 parent e67241b commit 24182c2

File tree

4 files changed

+58
-74
lines changed

4 files changed

+58
-74
lines changed

asv_bench/benchmarks/series_methods.py

+10-6
Original file line number | Diff line number | Diff line change
@@ -148,10 +148,14 @@ def time_searchsorted(self, dtype):
148148

149149

150150
class Map:
151-
params = (["dict", "Series", "lambda"], ["object", "category", "int"])
152-
param_names = "mapper"
153-
154-
def setup(self, mapper, dtype):
151+
params = (
152+
["dict", "Series", "lambda"],
153+
["object", "category", "int"],
154+
[None, "ignore"],
155+
)
156+
param_names = ["mapper", "dtype", "na_action"]
157+
158+
def setup(self, mapper, dtype, na_action):
155159
map_size = 1000
156160
map_data = Series(map_size - np.arange(map_size), dtype=dtype)
157161

@@ -168,8 +172,8 @@ def setup(self, mapper, dtype):
168172

169173
self.s = Series(np.random.randint(0, map_size, 10000), dtype=dtype)
170174

171-
def time_map(self, mapper, *args, **kwargs):
172-
self.s.map(self.map_data)
175+
def time_map(self, mapper, dtype, na_action):
176+
self.s.map(self.map_data, na_action=na_action)
173177

174178

175179
class Clip:

pandas/_libs/dtypes.pxd

-5
Original file line number | Diff line number | Diff line change
@@ -34,8 +34,3 @@ ctypedef fused numeric_t:
3434
ctypedef fused numeric_object_t:
3535
numeric_t
3636
object
37-
38-
ctypedef fused uint8_int64_object_t:
39-
uint8_t
40-
int64_t
41-
object

pandas/_libs/lib.pyx

+29-41
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@ from numpy cimport (
5353
PyArray_ITER_DATA,
5454
PyArray_ITER_NEXT,
5555
PyArray_IterNew,
56+
PyArray_SETITEM,
5657
complex128_t,
5758
flatiter,
5859
float64_t,
@@ -75,7 +76,6 @@ cdef extern from "pandas/parser/pd_parser.h":
7576
PandasParser_IMPORT
7677

7778
from pandas._libs cimport util
78-
from pandas._libs.dtypes cimport uint8_int64_object_t
7979
from pandas._libs.util cimport (
8080
INT64_MAX,
8181
INT64_MIN,
@@ -2845,14 +2845,16 @@ no_default = _NoDefault.no_default # Sentinel indicating the default value.
28452845
NoDefault = Literal[_NoDefault.no_default]
28462846

28472847

2848+
@cython.boundscheck(False)
2849+
@cython.wraparound(False)
28482850
def map_infer_mask(
2849-
ndarray[object] arr,
2850-
object f,
2851-
const uint8_t[:] mask,
2852-
*,
2853-
bint convert=True,
2854-
object na_value=no_default,
2855-
cnp.dtype dtype=np.dtype(object)
2851+
ndarray arr,
2852+
object f,
2853+
const uint8_t[:] mask,
2854+
*,
2855+
bint convert=True,
2856+
object na_value=no_default,
2857+
cnp.dtype dtype=np.dtype(object)
28562858
) -> "ArrayLike":
28572859
"""
28582860
Substitute for np.vectorize with pandas-friendly dtype inference.
@@ -2875,53 +2877,39 @@ def map_infer_mask(
28752877
-------
28762878
np.ndarray or an ExtensionArray
28772879
"""
2878-
cdef Py_ssize_t n = len(arr)
2879-
result = np.empty(n, dtype=dtype)
2880-
2881-
_map_infer_mask(
2882-
result,
2883-
arr,
2884-
f,
2885-
mask,
2886-
na_value,
2887-
)
2888-
if convert:
2889-
return maybe_convert_objects(result)
2890-
else:
2891-
return result
2892-
2893-
2894-
@cython.boundscheck(False)
2895-
@cython.wraparound(False)
2896-
def _map_infer_mask(
2897-
ndarray[uint8_int64_object_t] out,
2898-
ndarray[object] arr,
2899-
object f,
2900-
const uint8_t[:] mask,
2901-
object na_value=no_default,
2902-
) -> None:
2903-
"""
2904-
Helper for map_infer_mask, split off to use fused types based on the result.
2905-
"""
29062880
cdef:
2907-
Py_ssize_t i, n
2881+
Py_ssize_t i
2882+
Py_ssize_t n = len(arr)
29082883
object val
29092884

2910-
n = len(arr)
2885+
ndarray result = np.empty(n, dtype=dtype)
2886+
2887+
flatiter arr_it = PyArray_IterNew(arr)
2888+
flatiter result_it = PyArray_IterNew(result)
2889+
29112890
for i in range(n):
29122891
if mask[i]:
29132892
if na_value is no_default:
2914-
val = arr[i]
2893+
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it))
29152894
else:
29162895
val = na_value
29172896
else:
2918-
val = f(arr[i])
2897+
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it))
2898+
val = f(val)
29192899

29202900
if cnp.PyArray_IsZeroDim(val):
29212901
# unbox 0-dim arrays, GH#690
29222902
val = val.item()
29232903

2924-
out[i] = val
2904+
PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val)
2905+
2906+
PyArray_ITER_NEXT(arr_it)
2907+
PyArray_ITER_NEXT(result_it)
2908+
2909+
if convert:
2910+
return maybe_convert_objects(result)
2911+
else:
2912+
return result
29252913

29262914

29272915
@cython.boundscheck(False)

pandas/core/arrays/string_arrow.py

+19-22
Original file line number | Diff line number | Diff line change
@@ -629,28 +629,25 @@ def _str_map(
629629
na_value = np.nan
630630
else:
631631
na_value = False
632-
try:
633-
result = lib.map_infer_mask(
634-
arr,
635-
f,
636-
mask.view("uint8"),
637-
convert=False,
638-
na_value=na_value,
639-
dtype=np.dtype(cast(type, dtype)),
640-
)
641-
return result
642-
643-
except ValueError:
644-
result = lib.map_infer_mask(
645-
arr,
646-
f,
647-
mask.view("uint8"),
648-
convert=False,
649-
na_value=na_value,
650-
)
651-
if convert and result.dtype == object:
652-
result = lib.maybe_convert_objects(result)
653-
return result
632+
633+
dtype = np.dtype(cast(type, dtype))
634+
if mask.any():
635+
# numpy int/bool dtypes cannot hold NaNs so we must convert to
636+
# float64 for int (to match maybe_convert_objects) or
637+
# object for bool (again to match maybe_convert_objects)
638+
if is_integer_dtype(dtype):
639+
dtype = np.dtype("float64")
640+
else:
641+
dtype = np.dtype(object)
642+
result = lib.map_infer_mask(
643+
arr,
644+
f,
645+
mask.view("uint8"),
646+
convert=False,
647+
na_value=na_value,
648+
dtype=dtype,
649+
)
650+
return result
654651

655652
elif is_string_dtype(dtype) and not is_object_dtype(dtype):
656653
# i.e. StringDtype

0 commit comments

Comments
 (0)