Skip to content

Commit fc4df5d

Browse files
Hanspagh and simonjayhawkins
authored and committed
Backport PR pandas-dev#36266: BUG: fix isin with nans and large arrays
1 parent 711f923 commit fc4df5d

File tree

3 files changed

+24
-2
lines changed

3 files changed

+24
-2
lines changed

doc/source/whatsnew/v1.1.3.rst

+1
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ Bug fixes
4545
- Bug in :func:`read_spss` where passing a ``pathlib.Path`` as ``path`` would raise a ``TypeError`` (:issue:`33666`)
4646
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating ``na`` parameter (:issue:`36241`)
4747
- Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`)
48+
- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`)
4849

4950
.. ---------------------------------------------------------------------------
5051

pandas/core/algorithms.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,12 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
438438
# GH16012
439439
# Ensure np.in1d doesn't get object types or it *may* throw an exception
440440
if len(comps) > 1_000_000 and not is_object_dtype(comps):
441-
f = np.in1d
441+
# If the values include nan we need to check for nan explicitly
442+
# since np.nan is not equal to np.nan
443+
if np.isnan(values).any():
444+
f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
445+
else:
446+
f = np.in1d
442447
elif is_integer_dtype(comps):
443448
try:
444449
values = values.astype("int64", copy=False)

pandas/tests/test_algos.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -787,7 +787,6 @@ def test_i8(self):
787787
tm.assert_numpy_array_equal(result, expected)
788788

789789
def test_large(self):
790-
791790
s = pd.date_range("20000101", periods=2000000, freq="s").values
792791
result = algos.isin(s, s[0:2])
793792
expected = np.zeros(len(s), dtype=bool)
@@ -827,6 +826,23 @@ def test_same_nan_is_in(self):
827826
result = algos.isin(comps, values)
828827
tm.assert_numpy_array_equal(expected, result)
829828

829+
def test_same_nan_is_in_large(self):
830+
# https://github.com/pandas-dev/pandas/issues/22205
831+
s = np.tile(1.0, 1_000_001)
832+
s[0] = np.nan
833+
result = algos.isin(s, [np.nan, 1])
834+
expected = np.ones(len(s), dtype=bool)
835+
tm.assert_numpy_array_equal(result, expected)
836+
837+
def test_same_nan_is_in_large_series(self):
838+
# https://github.com/pandas-dev/pandas/issues/22205
839+
s = np.tile(1.0, 1_000_001)
840+
series = pd.Series(s)
841+
s[0] = np.nan
842+
result = series.isin([np.nan, 1])
843+
expected = pd.Series(np.ones(len(s), dtype=bool))
844+
tm.assert_series_equal(result, expected)
845+
830846
def test_same_object_is_in(self):
831847
# GH 22160
832848
# there could be special treatment for nans

0 commit comments

Comments
 (0)