Skip to content

BUG: take nans correctly into consideration in complex and tuple #41952

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jun 16, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,7 @@ Missing
- Bug in :meth:`DataFrame.fillna` not accepting a dictionary for the ``downcast`` keyword (:issue:`40809`)
- Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`)
- Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting instead of retaining the ``NaN`` (:issue:`26919`)
- Bug in :meth:`Series.isin` and :meth:`MultiIndex.isin` didn't treat all nans as equivalent if they were in tuples (:issue:`41836`)

MultiIndex
^^^^^^^^^^
Expand Down
84 changes: 78 additions & 6 deletions pandas/_libs/src/klib/khash_python.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,18 +163,90 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t)
#define kh_exist_complex128(h, k) (kh_exist(h, k))


// NaN-floats should be in the same equivalency class, see GH 22119
int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add some comments before these of what is going on / why doing is & the issue reference

return (
Py_IS_NAN(PyFloat_AS_DOUBLE(a)) &&
Py_IS_NAN(PyFloat_AS_DOUBLE(b))
)
||
( PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b) );
}


// NaNs should be in the same equivalency class, see GH 41836
// PyObject_RichCompareBool for complexobjects has a different behavior
// needs to be replaced
int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){
return (
Py_IS_NAN(a->cval.real) &&
Py_IS_NAN(b->cval.real) &&
Py_IS_NAN(a->cval.imag) &&
Py_IS_NAN(b->cval.imag)
)
||
(
Py_IS_NAN(a->cval.real) &&
Py_IS_NAN(b->cval.real) &&
a->cval.imag == b->cval.imag
)
||
(
a->cval.real == b->cval.real &&
Py_IS_NAN(a->cval.imag) &&
Py_IS_NAN(b->cval.imag)
)
||
(
a->cval.real == b->cval.real &&
a->cval.imag == b->cval.imag
);
}

int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b);


// replacing PyObject_RichCompareBool (NaN!=NaN) with pyobject_cmp (NaN==NaN),
// which treats NaNs as equivalent
// see GH 41836
int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){
Py_ssize_t i;

if (Py_SIZE(a) != Py_SIZE(b)) {
return 0;
}

for (i = 0; i < Py_SIZE(a); ++i) {
if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) {
return 0;
}
}
return 1;
}


int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
if (Py_TYPE(a) == Py_TYPE(b)) {
// special handling for some built-in types which could have NaNs
// as we would like to have them equivalent, but the usual
// PyObject_RichCompareBool would return False
if (PyFloat_CheckExact(a)) {
return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b);
}
if (PyComplex_CheckExact(a)) {
return complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b);
}
if (PyTuple_CheckExact(a)) {
return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b);
}
// frozenset isn't yet supported
}

int result = PyObject_RichCompareBool(a, b, Py_EQ);
if (result < 0) {
PyErr_Clear();
return 0;
}
if (result == 0) { // still could be two NaNs
return PyFloat_CheckExact(a) &&
PyFloat_CheckExact(b) &&
Py_IS_NAN(PyFloat_AS_DOUBLE(a)) &&
Py_IS_NAN(PyFloat_AS_DOUBLE(b));
}
return result;
}

Expand Down
14 changes: 1 addition & 13 deletions pandas/tests/indexes/multi/test_isin.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import numpy as np
import pytest

from pandas.compat import PYPY

from pandas import MultiIndex
import pandas._testing as tm


@pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy")
def test_isin_nan_pypy():
def test_isin_nan():
idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]])
tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, True]))
tm.assert_numpy_array_equal(
Expand All @@ -31,15 +28,6 @@ def test_isin():
assert result.dtype == np.bool_


@pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy")
def test_isin_nan_not_pypy():
idx = MultiIndex.from_arrays([["foo", "bar"], [1.0, np.nan]])
tm.assert_numpy_array_equal(idx.isin([("bar", np.nan)]), np.array([False, False]))
tm.assert_numpy_array_equal(
idx.isin([("bar", float("nan"))]), np.array([False, False])
)


def test_isin_level_kwarg():
idx = MultiIndex.from_arrays([["qux", "baz", "foo", "bar"], np.arange(4)])

Expand Down
71 changes: 71 additions & 0 deletions pandas/tests/libs/test_hashtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import pandas as pd
import pandas._testing as tm
from pandas.core.algorithms import isin


@contextmanager
Expand Down Expand Up @@ -178,6 +179,67 @@ def test_no_reallocation(self, table_type, dtype):
assert n_buckets_start == clean_table.get_state()["n_buckets"]


class TestPyObjectHashTableWithNans:
def test_nan_float(self):
nan1 = float("nan")
nan2 = float("nan")
assert nan1 is not nan2
table = ht.PyObjectHashTable()
table.set_item(nan1, 42)
assert table.get_item(nan2) == 42

def test_nan_complex_both(self):
nan1 = complex(float("nan"), float("nan"))
nan2 = complex(float("nan"), float("nan"))
assert nan1 is not nan2
table = ht.PyObjectHashTable()
table.set_item(nan1, 42)
assert table.get_item(nan2) == 42

def test_nan_complex_real(self):
nan1 = complex(float("nan"), 1)
nan2 = complex(float("nan"), 1)
other = complex(float("nan"), 2)
assert nan1 is not nan2
table = ht.PyObjectHashTable()
table.set_item(nan1, 42)
assert table.get_item(nan2) == 42
with pytest.raises(KeyError, match=None) as error:
table.get_item(other)
assert str(error.value) == str(other)

def test_nan_complex_imag(self):
nan1 = complex(1, float("nan"))
nan2 = complex(1, float("nan"))
other = complex(2, float("nan"))
assert nan1 is not nan2
table = ht.PyObjectHashTable()
table.set_item(nan1, 42)
assert table.get_item(nan2) == 42
with pytest.raises(KeyError, match=None) as error:
table.get_item(other)
assert str(error.value) == str(other)

def test_nan_in_tuple(self):
nan1 = (float("nan"),)
nan2 = (float("nan"),)
assert nan1[0] is not nan2[0]
table = ht.PyObjectHashTable()
table.set_item(nan1, 42)
assert table.get_item(nan2) == 42

def test_nan_in_nested_tuple(self):
nan1 = (1, (2, (float("nan"),)))
nan2 = (1, (2, (float("nan"),)))
other = (1, 2)
table = ht.PyObjectHashTable()
table.set_item(nan1, 42)
assert table.get_item(nan2) == 42
with pytest.raises(KeyError, match=None) as error:
table.get_item(other)
assert str(error.value) == str(other)


def test_get_labels_groupby_for_Int64(writable):
table = ht.Int64HashTable()
vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64)
Expand Down Expand Up @@ -426,3 +488,12 @@ def test_mode(self, dtype, type_suffix):
values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype)
assert mode(values, True) == 42
assert np.isnan(mode(values, False))


def test_ismember_tuple_with_nans():
# GH-41836
values = [("a", float("nan")), ("b", 1)]
comps = [("a", float("nan"))]
result = isin(values, comps)
expected = np.array([True, False], dtype=np.bool_)
tm.assert_numpy_array_equal(result, expected)