From 12d6a07707baad69e3faac4c422154c8404fab57 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 7 Feb 2021 09:47:49 -0800 Subject: [PATCH 1/4] BUG: equals/assert_numpy_array_equals with non-singleton NAs --- pandas/_libs/lib.pyx | 4 +- pandas/_libs/missing.pxd | 2 + pandas/_libs/missing.pyx | 37 +++++++++++++++++++ pandas/tests/dtypes/test_missing.py | 16 +++++++- pandas/tests/series/methods/test_equals.py | 23 +++++++++++- pandas/tests/series/test_api.py | 2 +- .../util/test_assert_numpy_array_equal.py | 10 +++++ 7 files changed, 89 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0f796b2b65dff..cd3350ddaacbb 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -73,6 +73,7 @@ from pandas._libs.tslib import array_to_datetime from pandas._libs.missing cimport ( C_NA, checknull, + is_matching_na, is_null_datetime64, is_null_timedelta64, isnaobj, @@ -584,8 +585,7 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: return False elif (x is C_NA) ^ (y is C_NA): return False - elif not (PyObject_RichCompareBool(x, y, Py_EQ) or - (x is None or is_nan(x)) and (y is None or is_nan(y))): + elif not (PyObject_RichCompareBool(x, y, Py_EQ) or is_matching_na(x, y)): return False except ValueError: # Avoid raising ValueError when comparing Numpy arrays to other types diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index e02b84381b62c..3425dc1701384 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -1,6 +1,8 @@ from numpy cimport ndarray, uint8_t +cpdef bint is_matching_na(object left, object right) + cpdef bint checknull(object val) cpdef bint checknull_old(object val) cpdef ndarray[uint8_t] isnaobj(ndarray arr) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index abf38265ddc6d..9d26630ac19b5 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -29,6 +29,43 @@ cdef: bint is_32bit = not IS64 +cpdef bint is_matching_na(object left, 
object right): + """ + Check if two scalars are both NA of matching types. + """ + if left is None: + return right is None + elif left is C_NA: + return right is C_NA + elif left is NaT: + return right is NaT + elif util.is_float_object(left): + return ( + util.is_nan(left) + and util.is_float_object(right) + and util.is_nan(right) + ) + elif util.is_complex_object(left): + return ( + util.is_nan(left) + and util.is_complex_object(right) + and util.is_nan(right) + ) + elif util.is_datetime64_object(left): + return ( + get_datetime64_value(left) == NPY_NAT + and util.is_datetime64_object(right) + and get_datetime64_value(right) == NPY_NAT + ) + elif util.is_timedelta64_object(left): + return ( + get_timedelta64_value(left) == NPY_NAT + and util.is_timedelta64_object(right) + and get_timedelta64_value(right) == NPY_NAT + ) + return False + + cpdef bint checknull(object val): """ Return boolean describing of the input is NA-like, defined here as any diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 0d92ef02e07c8..4061eec2273af 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -10,7 +10,7 @@ from pandas._libs import missing as libmissing from pandas._libs.tslibs import iNaT, is_null_datetimelike -from pandas.core.dtypes.common import is_scalar +from pandas.core.dtypes.common import is_float, is_scalar from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype from pandas.core.dtypes.missing import ( array_equivalent, @@ -653,3 +653,17 @@ def test_is_null_datetimelike(self): for value in never_na_vals: assert not is_null_datetimelike(value) + + def test_is_matching_na(self, nulls_fixture, nulls_fixture2): + left = nulls_fixture + right = nulls_fixture2 + + assert libmissing.is_matching_na(left, left) + + if left is right: + assert libmissing.is_matching_na(left, right) + elif is_float(left) and is_float(right): + # np.nan vs float("NaN") we consider as matching 
+ assert libmissing.is_matching_na(left, right) + else: + assert not libmissing.is_matching_na(left, right) diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index cf55482fefe22..6875deb133b62 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import MultiIndex, Series +from pandas import Index, MultiIndex, Series import pandas._testing as tm @@ -65,3 +65,24 @@ def test_equals_false_negative(): assert s1.equals(s4) assert s1.equals(s5) assert s5.equals(s6) + + +def test_equals_matching_nas(): + # matching but not identicanl NAs + left = Series([np.datetime64("NaT")], dtype=object) + right = Series([np.datetime64("NaT")], dtype=object) + assert left.equals(right) + assert Index(left).equals(Index(right)) + assert left.array.equals(right.array) + + left = Series([np.timedelta64("NaT")], dtype=object) + right = Series([np.timedelta64("NaT")], dtype=object) + assert left.equals(right) + assert Index(left).equals(Index(right)) + assert left.array.equals(right.array) + + left = Series([np.float64("NaN")], dtype=object) + right = Series([np.float64("NaN")], dtype=object) + assert left.equals(right) + assert Index(left).equals(Index(right)) + assert left.array.equals(right.array) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 4dd91b942474a..c09df52fb5df5 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -174,6 +174,6 @@ def test_attrs(self): @skip_if_no("jinja2") def test_inspect_getmembers(self): # GH38782 - ser = Series() + ser = Series(dtype=object) with tm.assert_produces_warning(None): inspect.getmembers(ser) diff --git a/pandas/tests/util/test_assert_numpy_array_equal.py b/pandas/tests/util/test_assert_numpy_array_equal.py index d29ddedd2fdd6..660402ee857e3 100644 --- a/pandas/tests/util/test_assert_numpy_array_equal.py +++ 
b/pandas/tests/util/test_assert_numpy_array_equal.py @@ -1,3 +1,5 @@ +import copy + import numpy as np import pytest @@ -198,6 +200,14 @@ def test_numpy_array_equal_identical_na(nulls_fixture): tm.assert_numpy_array_equal(a, a) + # matching but not the identical object + if hasattr(nulls_fixture, "copy"): + other = nulls_fixture.copy() + else: + other = copy.copy(nulls_fixture) + b = np.array([other], dtype=object) + tm.assert_numpy_array_equal(a, b) + def test_numpy_array_equal_different_na(): a = np.array([np.nan], dtype=object) From 9a449a92e0412530557b23a2ea282d066c4dbbf6 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Feb 2021 08:10:45 -0800 Subject: [PATCH 2/4] retain nan vs None equivalence --- pandas/_libs/lib.pyx | 7 +++- pandas/tests/series/methods/test_equals.py | 37 +++++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cd3350ddaacbb..903d7ccfb294c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -585,7 +585,12 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: return False elif (x is C_NA) ^ (y is C_NA): return False - elif not (PyObject_RichCompareBool(x, y, Py_EQ) or is_matching_na(x, y)): + elif not ( + PyObject_RichCompareBool(x, y, Py_EQ) + or is_matching_na(x, y) + or (x is None and util.is_nan(y)) + or (util.is_nan(x) and y is None) + ): return False except ValueError: # Avoid raising ValueError when comparing Numpy arrays to other types diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index 6875deb133b62..85d74196e59b4 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -1,8 +1,13 @@ from contextlib import nullcontext +import copy import numpy as np import pytest +from pandas._libs.missing import is_matching_na + +from pandas.core.dtypes.common import is_float + from pandas import Index, MultiIndex, Series import 
pandas._testing as tm @@ -68,7 +73,7 @@ def test_equals_false_negative(): def test_equals_matching_nas(): - # matching but not identicanl NAs + # matching but not identical NAs left = Series([np.datetime64("NaT")], dtype=object) right = Series([np.datetime64("NaT")], dtype=object) assert left.equals(right) @@ -86,3 +91,33 @@ def test_equals_matching_nas(): assert left.equals(right) assert Index(left).equals(Index(right)) assert left.array.equals(right.array) + + +def test_equals_mismatched_nas(nulls_fixture, nulls_fixture2): + # GH#39650 + left = nulls_fixture + right = nulls_fixture2 + if hasattr(right, "copy"): + right = right.copy() + else: + right = copy.copy(right) + + ser = Series([left], dtype=object) + ser2 = Series([right], dtype=object) + + if is_matching_na(left, right): + assert ser.equals(ser2) + elif (left is None and is_float(right)) or (right is None and is_float(left)): + assert ser.equals(ser2) + else: + assert not ser.equals(ser2) + + +def test_equals_none_vs_nan(): + # GH#39650 + ser = Series([1, None], dtype=object) + ser2 = Series([1, np.nan], dtype=object) + + assert ser.equals(ser2) + assert Index(ser).equals(Index(ser2)) + assert ser.array.equals(ser2.array) From 94dc12541f7ec609e12c0f65aac0e7c3d4090908 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Feb 2021 14:03:00 -0800 Subject: [PATCH 3/4] nan_matches_none keyword --- pandas/_libs/lib.pyx | 4 +--- pandas/_libs/missing.pxd | 2 +- pandas/_libs/missing.pyx | 17 ++++++++++++++++- pandas/tests/dtypes/test_missing.py | 8 ++++++++ 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 903d7ccfb294c..6793ea0ab3bc2 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -587,9 +587,7 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: return False elif not ( PyObject_RichCompareBool(x, y, Py_EQ) - or is_matching_na(x, y) - or (x is None and util.is_nan(y)) - or (util.is_nan(x) and y is None) + or 
is_matching_na(x, y, nan_matches_none=True) ): return False except ValueError: diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index 3425dc1701384..ce8e8007e7630 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -1,7 +1,7 @@ from numpy cimport ndarray, uint8_t -cpdef bint is_matching_na(object left, object right) +cpdef bint is_matching_na(object left, object right, bint nan_matches_none=*) cpdef bint checknull(object val) cpdef bint checknull_old(object val) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 9d26630ac19b5..d91d0261a1b33 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -29,17 +29,32 @@ cdef: bint is_32bit = not IS64 -cpdef bint is_matching_na(object left, object right): +cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False): """ Check if two scalars are both NA of matching types. + + Parameters + ---------- + left : Any + right : Any + nan_matches_none : bool, default False + For backwards compatibility, consider NaN as matching None. 
+ + Returns + ------- + bool """ if left is None: + if nan_matches_none and util.is_nan(right): + return True return right is None elif left is C_NA: return right is C_NA elif left is NaT: return right is NaT elif util.is_float_object(left): + if nan_matches_none and right is None and util.is_nan(left): + return True  # only a NaN float (never an ordinary float) may match None return ( util.is_nan(left) and util.is_float_object(right) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 4061eec2273af..f6566d205e65c 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -667,3 +667,11 @@ def test_is_matching_na(self, nulls_fixture, nulls_fixture2): assert libmissing.is_matching_na(left, right) else: assert not libmissing.is_matching_na(left, right) + + def test_is_matching_na_nan_matches_none(self): + + assert not libmissing.is_matching_na(None, np.nan) + assert not libmissing.is_matching_na(np.nan, None) + + assert libmissing.is_matching_na(None, np.nan, nan_matches_none=True) + assert libmissing.is_matching_na(np.nan, None, nan_matches_none=True) From 5250a8a0de6b352b6d466b66ce90950980bb1e0d Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 10 Feb 2021 08:26:33 -0800 Subject: [PATCH 4/4] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index bed3484793bcc..a3a9a4c5eda08 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -454,7 +454,8 @@ Other - Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`) - Bug in :class:`Styler` where ``subset`` arg in methods raised an error for some valid multiindex slices (:issue:`33562`) - :class:`Styler` rendered HTML output minor alterations to support w3 good
code standard (:issue:`39626`) -- +- Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` incorrectly returning ``False`` for matching NA values (:issue:`39650`) + .. ---------------------------------------------------------------------------