Skip to content

Commit 9b18811

Browse files
realead and jreback
authored and committed
BUG: treat nan-objects the same way float64-nans are treated - all na… (#22207)
1 parent 020e948 commit 9b18811

File tree

5 files changed

+64
-4
lines changed

5 files changed

+64
-4
lines changed

asv_bench/benchmarks/series_methods.py

+37
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,43 @@ def time_isin(self, dtypes):
3838
self.s.isin(self.values)
3939

4040

41+
class IsInForObjects(object):
    """Benchmarks for ``Series.isin`` on object-dtype data.

    Each ``time_*`` method pairs one of the series built in ``setup`` with
    one of the value arrays, so the cost of preparing the values and the
    cost of the per-element look-up can be observed separately.
    """

    def setup(self):
        """Build the object-dtype series and value arrays used below."""
        # The builtin ``object``/``float`` are used instead of the
        # ``np.object``/``np.float`` aliases: the aliases were plain synonyms
        # for the builtins and have been removed from recent NumPy releases.
        self.s_nans = Series(np.full(10**4, np.nan)).astype(object)
        self.vals_nans = np.full(10**4, np.nan).astype(object)
        self.s_short = Series(np.arange(2)).astype(object)
        self.s_long = Series(np.arange(10**5)).astype(object)
        self.vals_short = np.arange(2).astype(object)
        self.vals_long = np.arange(10**5).astype(object)
        # because of nans floats are special:
        self.s_long_floats = Series(np.arange(10**5,
                                              dtype=float)).astype(object)
        self.vals_long_floats = np.arange(10**5,
                                          dtype=float).astype(object)

    def time_isin_nans(self):
        """All-NaN series against all-NaN values."""
        # if nan-objects are different objects,
        # this has the potential to trigger O(n^2) running time
        self.s_nans.isin(self.vals_nans)

    def time_isin_short_series_long_values(self):
        """Two-element series against 10**5 values."""
        # running time dominated by the preprocessing
        self.s_short.isin(self.vals_long)

    def time_isin_long_series_short_values(self):
        """10**5-element series against two values."""
        # running time dominated by look-up
        self.s_long.isin(self.vals_short)

    def time_isin_long_series_long_values(self):
        """10**5-element integer series against 10**5 integer values."""
        # no dominating part
        self.s_long.isin(self.vals_long)

    def time_isin_long_series_long_values_floats(self):
        """10**5-element float series against 10**5 float values."""
        # no dominating part
        self.s_long_floats.isin(self.vals_long_floats)
76+
77+
4178
class NSort(object):
4279

4380
goal_time = 0.2

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,7 @@ Missing
618618

619619
- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`)
620620
- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`)
621+
- :func:`Series.isin` now treats all NaN values as equal for ``np.object`` dtype as well. This behavior is consistent with the behavior for float64 (:issue:`22119`)
621622

622623
MultiIndex
623624
^^^^^^^^^^

pandas/_libs/src/klib/khash_python.h

+10-1
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,19 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
4747
PyErr_Clear();
4848
return 0;
4949
}
50+
if (result == 0) { // still could be two NaNs
51+
return PyFloat_CheckExact(a) &&
52+
PyFloat_CheckExact(b) &&
53+
Py_IS_NAN(PyFloat_AS_DOUBLE(a)) &&
54+
Py_IS_NAN(PyFloat_AS_DOUBLE(b));
55+
}
5056
return result;
5157
}
5258

53-
59+
// For PyObject_Hash holds:
60+
// hash(0.0) == 0 == hash(-0.0)
61+
// hash(X) == 0 if X is a NaN-value
62+
// so it is OK to use it directly
5463
#define kh_python_hash_func(key) (PyObject_Hash(key))
5564
#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))
5665

pandas/tests/indexes/test_base.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
from pandas.tests.indexes.common import Base
1414

1515
from pandas.compat import (range, lrange, lzip, u,
16-
text_type, zip, PY3, PY35, PY36, PYPY, StringIO)
16+
text_type, zip, PY3, PY35, PY36, StringIO)
17+
import math
1718
import operator
1819
import numpy as np
1920

@@ -1663,9 +1664,13 @@ def test_isin_nan_common_object(self, nulls_fixture, nulls_fixture2):
16631664
# Test cartesian product of null fixtures and ensure that we don't
16641665
# mangle the various types (save a corner case with PyPy)
16651666

1666-
if PYPY and nulls_fixture is np.nan: # np.nan is float('nan') on PyPy
1667+
# all nans are the same
1668+
if (isinstance(nulls_fixture, float) and
1669+
isinstance(nulls_fixture2, float) and
1670+
math.isnan(nulls_fixture) and
1671+
math.isnan(nulls_fixture2)):
16671672
tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin(
1668-
[float('nan')]), np.array([False, True]))
1673+
[nulls_fixture2]), np.array([False, True]))
16691674

16701675
elif nulls_fixture is nulls_fixture2: # should preserve NA type
16711676
tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin(

pandas/tests/test_algos.py

+8
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,14 @@ def test_empty(self, empty):
624624
result = algos.isin(vals, empty)
625625
tm.assert_numpy_array_equal(expected, result)
626626

627+
def test_different_nan_objects(self):
    """Check that ``isin`` matches a float NaN against a distinct float NaN."""
    # GH 22119
    # comps holds the string 'nan', a complex NaN and a float NaN; only the
    # last one is expected to match. The builtin ``object`` replaces the
    # ``np.object`` alias, which is no longer available in recent NumPy.
    comps = np.array(['nan', np.nan * 1j, float('nan')], dtype=object)
    # A separately created NaN, i.e. not the same object as the one in comps.
    vals = np.array([float('nan')], dtype=object)
    expected = np.array([False, False, True])
    result = algos.isin(comps, vals)
    tm.assert_numpy_array_equal(expected, result)
634+
627635

628636
class TestValueCounts(object):
629637

0 commit comments

Comments
 (0)