-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
REGR: isin with nullable types with missing values raising #42473
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
d9132d7
1902d7a
d2b988d
e60c0d4
e902218
814f978
0a9b4e6
6a68490
44bcd67
821c6b6
893343f
e7b66b7
3cfe98e
ec61ab1
17a50b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,9 @@ | ||
import numpy as np | ||
|
||
try: | ||
from pandas.compat import np_version_under1p20 | ||
except ImportError: | ||
from pandas.compat.numpy import _np_version_under1p20 as np_version_under1p20 | ||
|
||
from pandas import ( | ||
Categorical, | ||
Float64Dtype, | ||
Int64Dtype, | ||
NaT, | ||
Series, | ||
date_range, | ||
|
@@ -274,7 +271,15 @@ def time_isin(self, series_type, vals_type): | |
|
||
class IsInLongSeriesLookUpDominates: | ||
params = [ | ||
["int64", "int32", "float64", "float32", "object", "Int64", "Float64"], | ||
[ | ||
"int64", | ||
"int32", | ||
"float64", | ||
"float32", | ||
"object", | ||
Int64Dtype(), | ||
Float64Dtype(), | ||
], | ||
[5, 1000], | ||
["random_hits", "random_misses", "monotone_hits", "monotone_misses"], | ||
] | ||
|
@@ -283,10 +288,6 @@ class IsInLongSeriesLookUpDominates: | |
def setup(self, dtype, MaxNumber, series_type): | ||
N = 10 ** 7 | ||
|
||
# https://github.com/pandas-dev/pandas/issues/39844 | ||
if not np_version_under1p20 and dtype in ("Int64", "Float64"): | ||
raise NotImplementedError | ||
|
||
if series_type == "random_hits": | ||
array = np.random.randint(0, MaxNumber, N) | ||
if series_type == "random_misses": | ||
|
@@ -297,6 +298,10 @@ def setup(self, dtype, MaxNumber, series_type): | |
array = np.arange(N) + MaxNumber | ||
|
||
self.series = Series(array).astype(dtype) | ||
|
||
if isinstance(dtype, (Int64Dtype, Float64Dtype)): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. see above, this is a problem if you need to do this |
||
dtype = dtype.type | ||
|
||
self.values = np.arange(MaxNumber).astype(dtype) | ||
|
||
def time_isin(self, dtypes, MaxNumber, series_type): | ||
|
@@ -305,24 +310,32 @@ def time_isin(self, dtypes, MaxNumber, series_type): | |
|
||
class IsInLongSeriesValuesDominate: | ||
params = [ | ||
["int64", "int32", "float64", "float32", "object", "Int64", "Float64"], | ||
[ | ||
"int64", | ||
"int32", | ||
"float64", | ||
"float32", | ||
"object", | ||
Int64Dtype(), | ||
Float64Dtype(), | ||
], | ||
["random", "monotone"], | ||
] | ||
param_names = ["dtype", "series_type"] | ||
|
||
def setup(self, dtype, series_type): | ||
N = 10 ** 7 | ||
|
||
# https://github.com/pandas-dev/pandas/issues/39844 | ||
if not np_version_under1p20 and dtype in ("Int64", "Float64"): | ||
raise NotImplementedError | ||
|
||
if series_type == "random": | ||
vals = np.random.randint(0, 10 * N, N) | ||
if series_type == "monotone": | ||
vals = np.arange(N) | ||
|
||
if isinstance(dtype, (Int64Dtype, Float64Dtype)): | ||
dtype = dtype.type | ||
|
||
self.values = vals.astype(dtype) | ||
|
||
M = 10 ** 6 + 1 | ||
self.series = Series(np.arange(M)).astype(dtype) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -334,6 +334,22 @@ def isnaobj2d_old(arr: ndarray) -> ndarray: | |
return result.view(np.bool_) | ||
|
||
|
||
@cython.wraparound(False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is the same as: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it might be a very tiny bit faster but likely not worth it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The issue here was that we need to be able to explicitly check for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we actually test this? I think we accept multiple types of Nulls here (e.g. None is accepted as well as np.nan for strings). I would just use our existing routine unless you have tests that show this is not sufficient. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not tested (hence the regression :) On 1.2.x the behavior is
This
So this matches existing behavior and how There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The added test has parameterizations which hit this explicitly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
If we're uncertain about this behavior, could also just turn the added cython routine into something similar done in python space just to fix the regression. Then that could easily be removed for 1.4 if we decide to change the behavior. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I haven't looked at this closely, but the canonical behavior belongs in is_valid_na_for_dtype There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we're going to at some point need a vectorized is_matching_na, which this would be a special case of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Not sure if along the lines of what you were thinking, but pushed a different solution just checking for presence of a valid missing value using |
||
@cython.boundscheck(False) | ||
def has_NA(ndarray[object, ndim=1] arr) -> bool: | ||
""" | ||
Return True if NA present in arr, False otherwise | ||
""" | ||
cdef: | ||
Py_ssize_t i | ||
|
||
for i in range(len(arr)): | ||
if arr[i] is C_NA: | ||
return True | ||
|
||
return False | ||
|
||
|
||
def isposinf_scalar(val: object) -> bool: | ||
return util.is_float_object(val) and val == INF | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -403,12 +403,20 @@ def isin(self, values) -> BooleanArray: # type: ignore[override] | |
|
||
from pandas.core.arrays import BooleanArray | ||
|
||
result = isin(self._data, values) | ||
# algorithms.isin will eventually convert values to an ndarray, so no extra | ||
# cost to doing it here first | ||
values_arr = np.asarray(values) | ||
result = isin(self._data, values_arr) | ||
|
||
if self._hasna: | ||
if libmissing.NA in values: | ||
result += self._mask | ||
else: | ||
result *= np.invert(self._mask) | ||
values_have_NA = is_object_dtype(values_arr.dtype) and libmissing.has_NA( | ||
values_arr | ||
) | ||
|
||
# For now, NA does not propagate so set result according to presence of NA, | ||
# see https://github.com/pandas-dev/pandas/pull/38379 for some discussion | ||
result[self._mask] = values_have_NA | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This line should be equivalent to the deleted if/else AFAICT, but I think it makes the logic easier to follow |
||
|
||
mask = np.zeros_like(self, dtype=bool) | ||
return BooleanArray(result, mask, copy=False) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is very odd, why doesn't `Int64` work here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was so the `type` attribute could be accessed to get the underlying numpy-compatible type. I'll try to reorganize to avoid this.