From 5d60c035a7a059ae66c8b689ecdd7857dd7b2aad Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 8 Jun 2021 23:05:40 +0200 Subject: [PATCH 01/11] adding breaking example with complex --- pandas/tests/libs/test_hashtable.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index aeff591e3f0dc..8da479cf249d4 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -178,6 +178,24 @@ def test_no_reallocation(self, table_type, dtype): assert n_buckets_start == clean_table.get_state()["n_buckets"] +class TestPyObjectHashTableWithNans: + def test_nan_float(self): + nan1 = float("nan") + nan2 = float("nan") + assert nan1 is not nan2 + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + + def test_nan_complex(self): + nan1 = 1j * float("nan") + nan2 = 1j * float("nan") + assert nan1 is not nan2 + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64) From dfa9f83ef4462bae8b53c388597a80cfadba43e3 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 8 Jun 2021 23:26:51 +0200 Subject: [PATCH 02/11] fix complex case --- pandas/_libs/src/klib/khash_python.h | 43 ++++++++++++++++++++++++---- pandas/tests/libs/test_hashtable.py | 30 +++++++++++++++++-- 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index aee018262e3a6..9d0ee7698af6c 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -163,17 +163,50 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t) #define kh_exist_complex128(h, k) (kh_exist(h, k)) +int PANDAS_INLINE floatobject_cmp(PyObject* a, PyObject* b){ + return Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && + Py_IS_NAN(PyFloat_AS_DOUBLE(b)); +} + + +int PANDAS_INLINE complexobject_cmp(PyObject* a, PyObject* b){ + return ( + Py_IS_NAN(PyComplex_RealAsDouble(a)) && + Py_IS_NAN(PyComplex_RealAsDouble(b)) && + Py_IS_NAN(PyComplex_ImagAsDouble(a)) && + Py_IS_NAN(PyComplex_ImagAsDouble(b)) + ) + || + ( + Py_IS_NAN(PyComplex_RealAsDouble(a)) && + Py_IS_NAN(PyComplex_RealAsDouble(b)) && + PyComplex_ImagAsDouble(a) == PyComplex_ImagAsDouble(b) + ) + || + ( + PyComplex_RealAsDouble(a) == PyComplex_RealAsDouble(b) && + Py_IS_NAN(PyComplex_ImagAsDouble(a)) && + Py_IS_NAN(PyComplex_ImagAsDouble(b)) + ); +} + + int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); if (result < 0) { PyErr_Clear(); return 0; } - if (result == 0) { // still could be two NaNs - return PyFloat_CheckExact(a) && - PyFloat_CheckExact(b) && - Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && - Py_IS_NAN(PyFloat_AS_DOUBLE(b)); + if (result == 0) { // still could be built-ins with NaNs + if (Py_TYPE(a) != Py_TYPE(b)) { + return 0; + } + if (PyFloat_CheckExact(a)) { + return floatobject_cmp(a, b); + } + if (PyComplex_CheckExact(a)) { + return complexobject_cmp(a, b); + } } return result; } diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 8da479cf249d4..81419891bf00e 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -187,14 +187,38 @@ def test_nan_float(self): table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - def test_nan_complex(self): - nan1 = 1j * float("nan") - nan2 = 1j * float("nan") + def test_nan_complex_both(self): + nan1 = complex(float("nan"), float("nan")) + nan2 = complex(float("nan"), float("nan")) assert nan1 is not nan2 table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 + def test_nan_complex_real(self): + nan1 = complex(float("nan"), 1) + nan2 = complex(float("nan"), 1) + other = complex(float("nan"), 2) + assert nan1 is not nan2 + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + with pytest.raises(KeyError, match=None) as error: + table.get_item(other) + assert str(error.value) == str(other) + + def test_nan_complex_imag(self): + nan1 = complex(1, float("nan")) + nan2 = complex(1, float("nan")) + other = complex(2, float("nan")) + assert nan1 is not nan2 + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + with pytest.raises(KeyError, match=None) as error: + table.get_item(other) + assert str(error.value) == str(other) + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() From 6de86483ace5b9025d2902c1af04eb3c66d54799 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 8 Jun 2021 23:44:04 +0200 Subject: [PATCH 03/11] add failing example for tuple --- pandas/tests/libs/test_hashtable.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 81419891bf00e..353e074b324c2 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -219,6 +219,25 @@ def test_nan_complex_imag(self): table.get_item(other) assert str(error.value) == str(other) + def test_nan_in_tuple(self): + nan1 = (float("nan"),) + nan2 = (float("nan"),) + assert nan1[0] is not nan2[0] + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + + def test_nan_in_nested_tuple(self): + nan1 = (1, (2, (float("nan"),))) + nan2 = (1, (2, (float("nan"),))) + other = (1, 2) + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + with pytest.raises(KeyError, match=None) as error: + table.get_item(other) + assert str(error.value) == str(other) + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() From de6f55857f85a2359ea2bb00b8270f204f4fe9aa Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 9 Jun 2021 00:15:00 +0200 Subject: [PATCH 04/11] fix tuple-case --- pandas/_libs/src/klib/khash_python.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 9d0ee7698af6c..464a2f60f3dbd 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -190,6 +190,24 @@ int PANDAS_INLINE complexobject_cmp(PyObject* a, PyObject* b){ ); } +int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b); + + +int PANDAS_INLINE tupleobject_cmp(PyObject* a, PyObject* b){ + Py_ssize_t i; + + if (Py_SIZE(a) != Py_SIZE(b)) { + return 0; + } + + for (i = 0; i < Py_SIZE(a); ++i) { + if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) { + return 0; + } + } + return 1; +} + int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); @@ -207,6 +225,9 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { if (PyComplex_CheckExact(a)) { return complexobject_cmp(a, b); } + if (PyTuple_CheckExact(a)) { + return tupleobject_cmp(a, b); + } } return result; } From 81ce1bced3750eba7ca3cd3c1235e5ee02a015ca Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 11 Jun 2021 16:38:02 +0200 Subject: [PATCH 05/11] make functions type-safe --- pandas/_libs/src/klib/khash_python.h | 32 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 464a2f60f3dbd..d6a5c593df26b 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -163,37 +163,37 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t) #define kh_exist_complex128(h, k) (kh_exist(h, k)) -int PANDAS_INLINE floatobject_cmp(PyObject* a, PyObject* b){ +int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){ return Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && Py_IS_NAN(PyFloat_AS_DOUBLE(b)); } -int PANDAS_INLINE complexobject_cmp(PyObject* a, PyObject* b){ +int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){ return ( - Py_IS_NAN(PyComplex_RealAsDouble(a)) && - Py_IS_NAN(PyComplex_RealAsDouble(b)) && - Py_IS_NAN(PyComplex_ImagAsDouble(a)) && - Py_IS_NAN(PyComplex_ImagAsDouble(b)) + Py_IS_NAN(a->cval.real) && + Py_IS_NAN(b->cval.real) && + Py_IS_NAN(a->cval.imag) && + Py_IS_NAN(b->cval.imag) ) || ( - Py_IS_NAN(PyComplex_RealAsDouble(a)) && - Py_IS_NAN(PyComplex_RealAsDouble(b)) && - PyComplex_ImagAsDouble(a) == PyComplex_ImagAsDouble(b) + Py_IS_NAN(a->cval.real) && + Py_IS_NAN(b->cval.real) && + a->cval.imag == b->cval.imag ) || ( - PyComplex_RealAsDouble(a) == PyComplex_RealAsDouble(b) && - Py_IS_NAN(PyComplex_ImagAsDouble(a)) && - Py_IS_NAN(PyComplex_ImagAsDouble(b)) + a->cval.real == b->cval.real && + Py_IS_NAN(a->cval.imag) && + Py_IS_NAN(b->cval.imag) ); } int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b); -int PANDAS_INLINE tupleobject_cmp(PyObject* a, PyObject* b){ +int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){ Py_ssize_t i; if (Py_SIZE(a) != Py_SIZE(b)) { @@ -220,13 +220,13 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { return 0; } if (PyFloat_CheckExact(a)) { - return floatobject_cmp(a, b); + return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b); } if (PyComplex_CheckExact(a)) { - return complexobject_cmp(a, b); + return complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b); } if (PyTuple_CheckExact(a)) { - return tupleobject_cmp(a, b); + return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b); } } return result; From 9f6fc0360ae0de6c88a483b7e0f88d3b53c99966 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 11 Jun 2021 16:49:11 +0200 Subject: [PATCH 06/11] performance: first special cases, than all others (otherwise tuple will be traversed twice) --- pandas/_libs/src/klib/khash_python.h | 31 ++++++++++++++++++---------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index d6a5c593df26b..3fcf7b7e1c0ee 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -164,8 +164,12 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t) int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){ - return Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && - Py_IS_NAN(PyFloat_AS_DOUBLE(b)); + return ( + Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && + Py_IS_NAN(PyFloat_AS_DOUBLE(b)) + ) + || + ( PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b) ); } @@ -187,6 +191,11 @@ int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){ a->cval.real == b->cval.real && Py_IS_NAN(a->cval.imag) && Py_IS_NAN(b->cval.imag) + ) + || + ( + a->cval.real == b->cval.real && + a->cval.imag == b->cval.imag ); } @@ -210,15 +219,8 @@ int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { - int result = PyObject_RichCompareBool(a, b, Py_EQ); - if (result < 0) { - PyErr_Clear(); - return 0; - } - if (result == 0) { // still could be built-ins with NaNs - if (Py_TYPE(a) != Py_TYPE(b)) { - return 0; - } + if (Py_TYPE(a) == Py_TYPE(b)) { + // special handling for some built-in types which could have NaNs: if (PyFloat_CheckExact(a)) { return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b); } @@ -228,7 +230,14 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { if (PyTuple_CheckExact(a)) { return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b); } + // frozenset isn't yet supported } + + int result = PyObject_RichCompareBool(a, b, Py_EQ); + if (result < 0) { + PyErr_Clear(); + return 0; + } return result; } From a8ccfe974a94801cf35bb5a3aa12daefa0384ccc Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 11 Jun 2021 16:59:27 +0200 Subject: [PATCH 07/11] add test case from GH-41836 --- pandas/tests/libs/test_hashtable.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 353e074b324c2..0edcebdc069f4 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -8,6 +8,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.algorithms import isin @contextmanager @@ -487,3 +488,12 @@ def test_mode(self, dtype, type_suffix): values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype) assert mode(values, True) == 42 assert np.isnan(mode(values, False)) + + +def test_ismember_tuple_with_nans(): + # GH-41836 + values = [("a", float("nan")), ("b", 1)] + comps = [("a", float("nan"))] + result = isin(values, comps) + expected = np.array([True, False], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) From 5bc56d8d2ed05c9a3630294de02d990769df19a0 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 13 Jun 2021 14:34:40 +0200 Subject: [PATCH 08/11] there are no differences now for PyPy and CPython - both take nan correctly into account --- pandas/tests/indexes/multi/test_isin.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/pandas/tests/indexes/multi/test_isin.py b/pandas/tests/indexes/multi/test_isin.py index 97eb34e28764b..695458273d16e 100644 --- a/pandas/tests/indexes/multi/test_isin.py +++ b/pandas/tests/indexes/multi/test_isin.py @@ -1,14 +1,11 @@ import numpy as np import pytest -from pandas.compat import PYPY - from pandas import MultiIndex import pandas._testing as tm -@pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy") -def test_isin_nan_pypy(): +def test_isin_nan(): idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, True])) tm.assert_numpy_array_equal( @@ -31,15 +28,6 @@ def test_isin(): assert result.dtype == np.bool_ -@pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy") -def test_isin_nan_not_pypy(): - idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]]) - tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, False])) - tm.assert_numpy_array_equal( - idx.isin([("bar", float("nan"))]), np.array([False, False]) - ) - - def test_isin_level_kwarg(): idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)]) From b2c870665e80864122a1869762927ca327beccfa Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 16 Jun 2021 07:01:45 +0200 Subject: [PATCH 09/11] adding what is new --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1a5a9980e5e96..4ccda47a98603 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -1033,6 +1033,7 @@ Missing - Bug in :meth:`DataFrame.fillna` not accepting a dictionary for the ``downcast`` keyword (:issue:`40809`) - Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`) - Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting instead of retaining the ``NaN`` (:issue:`26919`) +- Bug in :func:`isin` didn't treat all nans as equivalent if they were in tuples (:issue:`41836`) MultiIndex ^^^^^^^^^^ From 86ad54b01e6e808df5dba291207a416e7d940d31 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 16 Jun 2021 21:06:55 +0200 Subject: [PATCH 10/11] use Series.isin and MultiIndex.isin rather than isin in whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 4ccda47a98603..cadc5615cd654 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -1033,7 +1033,7 @@ Missing - Bug in :meth:`DataFrame.fillna` not accepting a dictionary for the ``downcast`` keyword (:issue:`40809`) - Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`) - Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting instead of retaining the ``NaN`` (:issue:`26919`) -- Bug in :func:`isin` didn't treat all nans as equivalent if they were in tuples (:issue:`41836`) +- Bug in :meth:`Series.isin` and :meth:`MultiIndex.isin` didn't treat all nans as equivalent if they were in tuples (:issue:`41836`) MultiIndex ^^^^^^^^^^ From a35e61725aa05d4ec31611c1e088406508dade4c Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 16 Jun 2021 21:31:08 +0200 Subject: [PATCH 11/11] add comments --- pandas/_libs/src/klib/khash_python.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 3fcf7b7e1c0ee..87c6283c19a2f 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -163,6 +163,7 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t) #define kh_exist_complex128(h, k) (kh_exist(h, k)) +// NaN-floats should be in the same equivalency class, see GH 22119 int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){ return ( Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && @@ -173,6 +174,9 @@ int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){ } +// NaNs should be in the same equivalency class, see GH 41836 +// PyObject_RichCompareBool for complexobjects has a different behavior +// needs to be replaced int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){ return ( Py_IS_NAN(a->cval.real) && @@ -202,6 +206,9 @@ int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b); +// replacing PyObject_RichCompareBool (NaN!=NaN) with pyobject_cmp (NaN==NaN), +// which treats NaNs as equivalent +// see GH 41836 int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){ Py_ssize_t i; @@ -220,7 +227,9 @@ int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { if (Py_TYPE(a) == Py_TYPE(b)) { - // special handling for some built-in types which could have NaNs: + // special handling for some built-in types which could have NaNs + // as we would like to have them equivalent, but the usual + // PyObject_RichCompareBool would return False if (PyFloat_CheckExact(a)) { return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b); }