Skip to content

REGR: isin with nullable types with missing values raising #42473

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jul 14, 2021
Merged
50 changes: 34 additions & 16 deletions asv_bench/benchmarks/algos/isin.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import numpy as np

try:
from pandas.compat import np_version_under1p20
except ImportError:
from pandas.compat.numpy import _np_version_under1p20 as np_version_under1p20
from pandas.core.dtypes.cast import is_extension_array_dtype
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get this from dtypes.common


from pandas import (
Categorical,
Float64Dtype,
Int64Dtype,
NaT,
Series,
date_range,
Expand Down Expand Up @@ -274,7 +273,15 @@ def time_isin(self, series_type, vals_type):

class IsInLongSeriesLookUpDominates:
params = [
["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
[
"int64",
"int32",
"float64",
"float32",
"object",
Int64Dtype(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is very odd, why doesn't Int64 not work here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was so the type attribute could be accessed to get the underlying numpy compatible type. I'll try to reorganize to avoid this

Float64Dtype(),
],
[5, 1000],
["random_hits", "random_misses", "monotone_hits", "monotone_misses"],
]
Expand All @@ -283,10 +290,6 @@ class IsInLongSeriesLookUpDominates:
def setup(self, dtype, MaxNumber, series_type):
N = 10 ** 7

# https://github.com/pandas-dev/pandas/issues/39844
if not np_version_under1p20 and dtype in ("Int64", "Float64"):
raise NotImplementedError

if series_type == "random_hits":
array = np.random.randint(0, MaxNumber, N)
if series_type == "random_misses":
Expand All @@ -297,32 +300,47 @@ def setup(self, dtype, MaxNumber, series_type):
array = np.arange(N) + MaxNumber

self.series = Series(array).astype(dtype)
self.values = np.arange(MaxNumber).astype(dtype)

if is_extension_array_dtype(dtype):
vals_dtype = dtype.type
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be more specific to Int64Dtype/Float64Dtype? we we had e.g. PeriodDtype then the astype on L308 would break

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, have limited to those to make clearer what can pass through here in current form

else:
vals_dtype = dtype
self.values = np.arange(MaxNumber).astype(vals_dtype)

def time_isin(self, dtypes, MaxNumber, series_type):
self.series.isin(self.values)


class IsInLongSeriesValuesDominate:
params = [
["int64", "int32", "float64", "float32", "object", "Int64", "Float64"],
[
"int64",
"int32",
"float64",
"float32",
"object",
Int64Dtype(),
Float64Dtype(),
],
["random", "monotone"],
]
param_names = ["dtype", "series_type"]

def setup(self, dtype, series_type):
N = 10 ** 7

# https://github.com/pandas-dev/pandas/issues/39844
if not np_version_under1p20 and dtype in ("Int64", "Float64"):
raise NotImplementedError

if series_type == "random":
vals = np.random.randint(0, 10 * N, N)
if series_type == "monotone":
vals = np.arange(N)

self.values = vals.astype(dtype)
if is_extension_array_dtype(dtype):
vals_dtype = dtype.type
else:
vals_dtype = dtype

self.values = vals.astype(vals_dtype)

M = 10 ** 6 + 1
self.series = Series(np.arange(M)).astype(dtype)

Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Fixed regressions
- :class:`DataFrame` constructed with with an older version of pandas could not be unpickled (:issue:`42345`)
- Performance regression in constructing a :class:`DataFrame` from a dictionary of dictionaries (:issue:`42338`)
- Fixed regression in :meth:`DataFrame.agg` dropping values when the DataFrame had an Extension Array dtype, a duplicate index, and ``axis=1`` (:issue:`42380`)
-
- Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`)

.. ---------------------------------------------------------------------------

Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ def array_equivalent_object(
right: np.ndarray, # object[:]
) -> bool: ...
def has_infs(arr: np.ndarray) -> bool: ... # const floating[:]
def has_NA(arr: np.ndarray) -> bool: ...
def get_reverse_indexer(
indexer: np.ndarray, # const intp_t[:]
length: int,
Expand Down
18 changes: 18 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,24 @@ def has_infs(floating[:] arr) -> bool:
return ret


@cython.wraparound(False)
@cython.boundscheck(False)
def has_NA(ndarray arr) -> bool:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ndarray[object]?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I try that (or object[:]), end up getting errors like ValueError: Buffer dtype mismatch, expected 'Python object' but got 'double'

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yah, this should only be called on object-dtype, in other cases we already know the result is False

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah got it thanks, have changed

"""
Return True if NA present in arr, False otherwise
"""
cdef:
Py_ssize_t i

assert arr.ndim == 1, "'arr' must be 1-D."

for i in range(len(arr)):
if arr[i] is C_NA:
return True

return False
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may be overkill, but fits with some other cython methods like has_infs and would hopefully be useful in other locations

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could live in _libs.missing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

have moved



def maybe_indices_to_slice(ndarray[intp_t] indices, int max_len):
cdef:
Py_ssize_t i, n = len(indices)
Expand Down
16 changes: 11 additions & 5 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,12 +403,18 @@ def isin(self, values) -> BooleanArray: # type: ignore[override]

from pandas.core.arrays import BooleanArray

result = isin(self._data, values)
# algorithms.isin will eventually convert values to an ndarray, so no extra
# cost to doing it here first
values_arr = np.asarray(values)
result = isin(self._data, values_arr)

if self._hasna:
if libmissing.NA in values:
result += self._mask
else:
result *= np.invert(self._mask)
values_have_NA = lib.has_NA(values_arr)

# For now, NA does not propagate so set result according to presence of NA,
# see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
result[self._mask] = values_have_NA
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line should be equivalent to the deleted if/else AFAICT, but I think makes logic easier to follow


mask = np.zeros_like(self, dtype=bool)
return BooleanArray(result, mask, copy=False)

Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/series/methods/test_isin.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,27 @@ def test_isin_float_in_int_series(self, values):
expected = Series([True, False])
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"])
@pytest.mark.parametrize(
"data,values,expected",
[
([0, 1, 0], [1], [False, True, False]),
([0, 1, 0], [1, pd.NA], [False, True, False]),
([0, pd.NA, 0], [1, 0], [True, False, True]),
([0, 1, pd.NA], [1, pd.NA], [False, True, True]),
([0, 1, pd.NA], [1, np.nan], [False, True, False]),
([0, pd.NA, pd.NA], [np.nan, pd.NaT, None], [False, False, False]),
],
)
def test_isin_masked_types(self, dtype, data, values, expected):
# GH#42405
ser = Series(data, dtype=dtype)

result = ser.isin(values)
expected = Series(expected, dtype="boolean")

tm.assert_series_equal(result, expected)


@pytest.mark.slow
def test_isin_large_series_mixed_dtypes_and_nan():
Expand Down