-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
CLN: Simplify map_infer_mask #58483
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CLN: Simplify map_infer_mask #58483
Changes from 4 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,6 +53,7 @@ from numpy cimport ( | |
PyArray_ITER_DATA, | ||
PyArray_ITER_NEXT, | ||
PyArray_IterNew, | ||
PyArray_SETITEM, | ||
complex128_t, | ||
flatiter, | ||
float64_t, | ||
|
@@ -75,7 +76,6 @@ cdef extern from "pandas/parser/pd_parser.h": | |
PandasParser_IMPORT | ||
|
||
from pandas._libs cimport util | ||
from pandas._libs.dtypes cimport uint8_int64_object_t | ||
from pandas._libs.util cimport ( | ||
INT64_MAX, | ||
INT64_MIN, | ||
|
@@ -2845,15 +2845,17 @@ no_default = _NoDefault.no_default # Sentinel indicating the default value. | |
NoDefault = Literal[_NoDefault.no_default] | ||
|
||
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def map_infer_mask( | ||
ndarray[object] arr, | ||
object f, | ||
const uint8_t[:] mask, | ||
*, | ||
bint convert=True, | ||
object na_value=no_default, | ||
cnp.dtype dtype=np.dtype(object) | ||
) -> "ArrayLike": | ||
ndarray arr, | ||
object f, | ||
const uint8_t[:] mask, | ||
*, | ||
bint convert=True, | ||
object na_value=no_default, | ||
object dtype=np.dtype(object) | ||
) -> None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. returning None looks inaccurate. am i reading this wrong? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sorry i mixed up the annotations from the old |
||
""" | ||
Substitute for np.vectorize with pandas-friendly dtype inference. | ||
|
||
|
@@ -2875,53 +2877,39 @@ def map_infer_mask( | |
------- | ||
np.ndarray or an ExtensionArray | ||
""" | ||
cdef Py_ssize_t n = len(arr) | ||
result = np.empty(n, dtype=dtype) | ||
|
||
_map_infer_mask( | ||
result, | ||
arr, | ||
f, | ||
mask, | ||
na_value, | ||
) | ||
if convert: | ||
return maybe_convert_objects(result) | ||
else: | ||
return result | ||
|
||
|
||
@cython.boundscheck(False) | ||
@cython.wraparound(False) | ||
def _map_infer_mask( | ||
ndarray[uint8_int64_object_t] out, | ||
ndarray[object] arr, | ||
object f, | ||
const uint8_t[:] mask, | ||
object na_value=no_default, | ||
) -> None: | ||
""" | ||
Helper for map_infer_mask, split off to use fused types based on the result. | ||
""" | ||
cdef: | ||
Py_ssize_t i, n | ||
Py_ssize_t i | ||
Py_ssize_t n = len(arr) | ||
object val | ||
|
||
n = len(arr) | ||
ndarray result = np.empty(n, dtype=dtype) | ||
|
||
flatiter arr_it = PyArray_IterNew(arr) | ||
flatiter result_it = PyArray_IterNew(result) | ||
|
||
for i in range(n): | ||
if mask[i]: | ||
if na_value is no_default: | ||
val = arr[i] | ||
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else: | ||
val = na_value | ||
else: | ||
val = f(arr[i]) | ||
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it)) | ||
val = f(val) | ||
|
||
if cnp.PyArray_IsZeroDim(val): | ||
# unbox 0-dim arrays, GH#690 | ||
val = val.item() | ||
|
||
out[i] = val | ||
PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val) | ||
|
||
PyArray_ITER_NEXT(arr_it) | ||
PyArray_ITER_NEXT(result_it) | ||
|
||
if convert: | ||
return maybe_convert_objects(result) | ||
else: | ||
return result | ||
|
||
|
||
@cython.boundscheck(False) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -627,28 +627,25 @@ def _str_map( | |
na_value = np.nan | ||
else: | ||
na_value = False | ||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you're free, could you review the changes to the Arrow strings here? I think what I have here is correct, but not too familiar with the arrow strings. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
result = lib.map_infer_mask( | ||
arr, | ||
f, | ||
mask.view("uint8"), | ||
convert=False, | ||
na_value=na_value, | ||
dtype=np.dtype(cast(type, dtype)), | ||
) | ||
return result | ||
|
||
except ValueError: | ||
result = lib.map_infer_mask( | ||
arr, | ||
f, | ||
mask.view("uint8"), | ||
convert=False, | ||
na_value=na_value, | ||
) | ||
if convert and result.dtype == object: | ||
result = lib.maybe_convert_objects(result) | ||
return result | ||
|
||
dtype = np.dtype(cast(type, dtype)) | ||
if mask.any(): | ||
# numpy int/bool dtypes cannot hold NaNs so we must convert to | ||
# float64 for int (to match maybe_convert_objects) or | ||
# object for bool (again to match maybe_convert_objects) | ||
if is_integer_dtype(dtype): | ||
dtype = np.dtype("float64") | ||
else: | ||
dtype = np.dtype(object) | ||
result = lib.map_infer_mask( | ||
arr, | ||
f, | ||
mask.view("uint8"), | ||
convert=False, | ||
na_value=na_value, | ||
dtype=dtype, | ||
) | ||
return result | ||
|
||
elif is_string_dtype(dtype) and not is_object_dtype(dtype): | ||
# i.e. StringDtype | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why this change? is the new numpy string dtype not an instance of cnp.dtype?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should be an accidental change I forgot to roll back.
In general, is there a reason we prefer
cnp.dtype
?With
cnp.dtype
I find it harder to do stuff like kind checks on the dtype object.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just the usual "explicit is better than implicit" reasons; if it is
object
here then in 6 months I'll have to check whether it can be an ExtensionDtype