From 3a3178eebc0c6298bb1e80ccb7f5a8c72c2560c9 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sat, 4 Aug 2018 22:42:10 +0200 Subject: [PATCH 1/5] BUG: treat nan-objects the same way float64-nans are treated - all nans are from the same equivalence class --- pandas/_libs/src/klib/khash_python.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index e9fb49e8a5e42..45a93051f78d3 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -47,10 +47,19 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { PyErr_Clear(); return 0; } + if (result == 0) { // still could be two NaNs + return PyFloat_CheckExact(a) && + PyFloat_CheckExact(b) && + Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && + Py_IS_NAN(PyFloat_AS_DOUBLE(b)); + } return result; } - +// For PyObject_Hash holds: +// hash(0.0) == 0 == hash(-0.0) +// hash(X) == 0 if X is a NaN-value +// so it is OK to use it directly #define kh_python_hash_func(key) (PyObject_Hash(key)) #define kh_python_hash_equal(a, b) (pyobject_cmp(a, b)) From 06976bb09ec3b5778024ffaf4f9cc52ae6ea2bb4 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sat, 4 Aug 2018 23:39:25 +0200 Subject: [PATCH 2/5] adding performance and unit tests --- asv_bench/benchmarks/series_methods.py | 37 ++++++++++++++++++++++++++ pandas/tests/test_algos.py | 8 ++++++ 2 files changed, 45 insertions(+) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index a5ccf5c32b876..eb2034728b650 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -38,6 +38,43 @@ def time_isin(self, dtypes): self.s.isin(self.values) +class IsInForObjects(object): + + def setup(self): + self.s_nans = Series(np.full(10**4, np.nan)).astype(np.object) + self.vals_nans = np.full(10**4, np.nan).astype(np.object) + self.s_short = Series(np.arange(2)).astype(np.object) + self.s_long = Series(np.arange(10**5)).astype(np.object) + self.vals_short = np.arange(2).astype(np.object) + self.vals_long = np.arange(10**5).astype(np.object) + # because of nans floats are special: + self.s_long_floats = Series(np.arange(10**5, + dtype=np.float)).astype(np.object) + self.vals_long_floats = np.arange(10**5, + dtype=np.float).astype(np.object) + + def time_isin_nans(self): + # if nan-objects are different objects, + # this has the potential to trigger O(n^2) running time + self.s_nans.isin(self.vals_nans) + + def time_isin_short_series_long_values(self): + # running time dominated by the preprocessing + self.s_short.isin(self.vals_long) + + def time_isin_long_series_short_values(self): + # running time dominated by look-up + self.s_long.isin(self.vals_short) + + def time_isin_long_series_long_values(self): + # no dominating part + self.s_long.isin(self.vals_long) + + def time_isin_long_series_long_values_floats(self): + # no dominating part + self.s_long_floats.isin(self.vals_long_floats) + + class NSort(object): goal_time = 0.2 diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 7ce2aaf7d7fbb..d8d98318b4c72 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -623,6 +623,14 @@ def test_empty(self, empty): result = algos.isin(vals, empty) tm.assert_numpy_array_equal(expected, result) + def test_different_nan_objects(self): + # GH 22119 + comps = np.array(['nan', np.nan * 1j, float('nan')], dtype=np.object) + vals = np.array([float('nan')], dtype=np.object) + expected = np.array([False, False, True]) + result = algos.isin(comps, vals) + tm.assert_numpy_array_equal(expected, result) + class TestValueCounts(object): From e583a0c9fd51c70fd148f85a95f5b05d519208b2 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sat, 4 Aug 2018 23:49:34 +0200 Subject: [PATCH 3/5] adding whatsnew note --- doc/source/whatsnew/v0.24.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 42e286f487a7d..a513c2bd03ac7 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -512,6 +512,7 @@ Missing - Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) - Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) +- :func:`Series.isin` now treats all nans as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) MultiIndex ^^^^^^^^^^ From d6b032463ed14cbabc1028842f81531222501762 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Sun, 5 Aug 2018 22:56:24 +0200 Subject: [PATCH 4/5] making PEP8 happy --- asv_bench/benchmarks/series_methods.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index eb2034728b650..34a8d552304d1 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -48,10 +48,10 @@ def setup(self): self.vals_short = np.arange(2).astype(np.object) self.vals_long = np.arange(10**5).astype(np.object) # because of nans floats are special: - self.s_long_floats = Series(np.arange(10**5, - dtype=np.float)).astype(np.object) - self.vals_long_floats = np.arange(10**5, - dtype=np.float).astype(np.object) + self.s_long_floats = Series(np.arange(10**5, + dtype=np.float)).astype(np.object) + self.vals_long_floats = np.arange(10**5, + dtype=np.float).astype(np.object) def time_isin_nans(self): # if nan-objects are different objects, From 5cc5e0fa90f3eb15787935478c01195b79ce0284 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Tue, 7 Aug 2018 22:44:21 +0200 Subject: [PATCH 5/5] all nans are the same, so we need to update the behavior of tests which assumed differently --- pandas/tests/indexes/test_base.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 754703dfc4bee..8ed8d2014f5bc 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -13,7 +13,8 @@ from pandas.tests.indexes.common import Base from pandas.compat import (range, lrange, lzip, u, - text_type, zip, PY3, PY35, PY36, PYPY, StringIO) + text_type, zip, PY3, PY35, PY36, StringIO) +import math import operator import numpy as np @@ -1661,9 +1662,13 @@ def test_isin_nan_common_object(self, nulls_fixture, nulls_fixture2): # Test cartesian product of null fixtures and ensure that we don't # mangle the various types (save a corner case with PyPy) - if PYPY and nulls_fixture is np.nan: # np.nan is float('nan') on PyPy + # all nans are the same + if (isinstance(nulls_fixture, float) and + isinstance(nulls_fixture2, float) and + math.isnan(nulls_fixture) and + math.isnan(nulls_fixture2)): tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin( - [float('nan')]), np.array([False, True])) + [nulls_fixture2]), np.array([False, True])) elif nulls_fixture is nulls_fixture2: # should preserve NA type tm.assert_numpy_array_equal(Index(['a', nulls_fixture]).isin(