Skip to content

BUG: treat nan-objects the same way float64-nans are treated - all na… #22207

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,43 @@ def time_isin(self, dtypes):
self.s.isin(self.values)


class IsInForObjects(object):

def setup(self):
self.s_nans = Series(np.full(10**4, np.nan)).astype(np.object)
self.vals_nans = np.full(10**4, np.nan).astype(np.object)
self.s_short = Series(np.arange(2)).astype(np.object)
self.s_long = Series(np.arange(10**5)).astype(np.object)
self.vals_short = np.arange(2).astype(np.object)
self.vals_long = np.arange(10**5).astype(np.object)
# because of nans floats are special:
self.s_long_floats = Series(np.arange(10**5,
dtype=np.float)).astype(np.object)
self.vals_long_floats = np.arange(10**5,
dtype=np.float).astype(np.object)

def time_isin_nans(self):
# if nan-objects are different objects,
# this has the potential to trigger O(n^2) running time
self.s_nans.isin(self.vals_nans)

def time_isin_short_series_long_values(self):
# running time dominated by the preprocessing
self.s_short.isin(self.vals_long)

def time_isin_long_series_short_values(self):
# running time dominated by look-up
self.s_long.isin(self.vals_short)

def time_isin_long_series_long_values(self):
# no dominating part
self.s_long.isin(self.vals_long)

def time_isin_long_series_long_values_floats(self):
# no dominating part
self.s_long_floats.isin(self.vals_long_floats)


class NSort(object):

goal_time = 0.2
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,7 @@ Missing

- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`)
- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`)
- :func:`Series.isin` now treats all nans as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`)

MultiIndex
^^^^^^^^^^
Expand Down
11 changes: 10 additions & 1 deletion pandas/_libs/src/klib/khash_python.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,19 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
PyErr_Clear();
return 0;
}
if (result == 0) { // still could be two NaNs
return PyFloat_CheckExact(a) &&
PyFloat_CheckExact(b) &&
Py_IS_NAN(PyFloat_AS_DOUBLE(a)) &&
Py_IS_NAN(PyFloat_AS_DOUBLE(b));
}
return result;
}


// For PyObject_Hash holds:
// hash(0.0) == 0 == hash(-0.0)
// hash(X) == 0 if X is a NaN-value
// so it is OK to use it directly
#define kh_python_hash_func(key) (PyObject_Hash(key))
#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))

Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from pandas.tests.indexes.common import Base

from pandas.compat import (range, lrange, lzip, u,
text_type, zip, PY3, PY35, PY36, PYPY, StringIO)
text_type, zip, PY3, PY35, PY36, StringIO)
import math
import operator
import numpy as np

Expand Down Expand Up @@ -1661,9 +1662,13 @@ def test_isin_nan_common_object(self, nulls_fixture, nulls_fixture2):
# Test cartesian product of null fixtures and ensure that we don't
# mangle the various types (save a corner case with PyPy)

if PYPY and nulls_fixture is np.nan: # np.nan is float('nan') on PyPy
# all nans are the same
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so this test seems now very restrictive?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did you remove the PYPY on purpose?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @mattip can you comment on the equivalence of nulls here? we don't test on PYPY, can you see if this still fails?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback After the fix the behavior is less subtle:

Index([nulls_fixture]).isin([nulls_fixture2])

is always True as long as nulls_fixture and nulls_fixture2 are nans, no matter whether those are the same nan-objects or different nan-objects. Thus PyPy is no longer a special case. But I must confess, I didn't test with PyPy and assumed this would be done somewhere in CI.

It seems as if the purpose of this test is to check, that the identity of the object is preserved (something similar to bug #22160). In this case np.nan is no longer a good tool to do this check, because isin bcomes agnostic to the exact identity of the nan-object.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will check it on PyPy, thanks for the heads up.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this particular test in its 25 variants passes on PyPy. There are other failures but they seem unrelated.

if (isinstance(nulls_fixture, float) and
isinstance(nulls_fixture2, float) and
math.isnan(nulls_fixture) and
math.isnan(nulls_fixture2)):
tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin(
[float('nan')]), np.array([False, True]))
[nulls_fixture2]), np.array([False, True]))

elif nulls_fixture is nulls_fixture2: # should preserve NA type
tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin(
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,14 @@ def test_empty(self, empty):
result = algos.isin(vals, empty)
tm.assert_numpy_array_equal(expected, result)

def test_different_nan_objects(self):
# GH 22119
comps = np.array(['nan', np.nan * 1j, float('nan')], dtype=np.object)
vals = np.array([float('nan')], dtype=np.object)
expected = np.array([False, False, True])
result = algos.isin(comps, vals)
tm.assert_numpy_array_equal(expected, result)


class TestValueCounts(object):

Expand Down