From 9f7e022644440c9c8b04fdc305dd3b59d231d644 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 7 Sep 2022 20:29:26 +0200 Subject: [PATCH 1/3] BUG: Index.equals raising with index of tuples that contain NA --- doc/source/whatsnew/v1.6.0.rst | 2 +- pandas/_libs/lib.pyx | 7 ++++++- pandas/_libs/missing.pxd | 1 + pandas/_libs/missing.pyx | 28 ++++++++++++++++++++++++++++ pandas/tests/dtypes/test_missing.py | 13 +++++++++++++ 5 files changed, 49 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 42d3ce8069322..572b756542e33 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -159,7 +159,7 @@ Indexing Missing ^^^^^^^ -- +- Bug in :meth:`Index.equals` raising ``TypeError` when :class:`Index` consists of tuples that contain ``NA`` (:issue:`40000`) - MultiIndex diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 65677bbdb0ea9..4e382c4ed7929 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -31,6 +31,7 @@ from cython cimport ( floating, ) +from pandas._libs.missing import check_na_tuples_nonequal from pandas.util._exceptions import find_stack_level import_datetime() @@ -650,7 +651,7 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: or is_matching_na(x, y, nan_matches_none=True) ): return False - except ValueError: + except (ValueError, TypeError): # Avoid raising ValueError when comparing Numpy arrays to other types if cnp.PyArray_IsAnyScalar(x) != cnp.PyArray_IsAnyScalar(y): # Only compare scalars to scalars and non-scalars to non-scalars @@ -659,7 +660,11 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: and not (isinstance(x, type(y)) or isinstance(y, type(x)))): # Check if non-scalars have the same type return False + elif check_na_tuples_nonequal(x, y): + # We have tuples where one Side has a NA and the other side does not + return False raise + return True diff --git a/pandas/_libs/missing.pxd 
b/pandas/_libs/missing.pxd index 854dcf2ec9775..5920649519442 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -5,6 +5,7 @@ from numpy cimport ( cpdef bint is_matching_na(object left, object right, bint nan_matches_none=*) +cpdef bint check_na_tuples_nonequal(object left, object right) cpdef bint checknull(object val, bint inf_as_na=*) cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=*) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 9b470e95dc22b..1f9a65480e79a 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -42,6 +42,34 @@ cdef: type cDecimal = Decimal # for faster isinstance checks +cpdef bint check_na_tuples_nonequal(object left, object right): + """ + When we have NA in one of the tuples but not the other we have to check here, + because our regular checks fail before with ambiguous boolean value. + + Parameters + ---------- + left: Any + right: Any + + Returns + ------- + True if we are dealing with tuples that have NA on one side and non NA on + the other side. + + """ + if not isinstance(left, tuple) or not isinstance(right, tuple): + return False + + for left_element, right_element in zip(left, right): + if left_element is C_NA and not right_element is C_NA: + return True + elif right_element is C_NA and not left_element is C_NA: + return True + + return False + + cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False): """ Check if two scalars are both NA of matching types. 
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 9c8aeb050ec27..bac4a4134fed5 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -538,6 +538,19 @@ def test_array_equivalent_nested(): assert not array_equivalent(left, right, strict_nan=True) +def test_array_equivalent_index_with_tuples(): + # GH# + idx1 = pd.Index(np.array([(pd.NA, 4), (1, 1)], dtype="object")) + idx2 = pd.Index(np.array([(1, 1), (pd.NA, 4)], dtype="object")) + assert not array_equivalent(idx1, idx2) + assert not idx1.equals(idx2) + + idx1 = pd.Index(np.array([(4, pd.NA), (1, 1)], dtype="object")) + idx2 = pd.Index(np.array([(1, 1), (4, pd.NA)], dtype="object")) + assert not array_equivalent(idx1, idx2) + assert not idx1.equals(idx2) + + @pytest.mark.parametrize( "dtype, na_value", [ From 0e5218898e9a95f035b66e27539f3ac2374d831c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 7 Sep 2022 22:41:47 +0200 Subject: [PATCH 2/3] Adress review --- doc/source/whatsnew/v1.6.0.rst | 2 +- pandas/_libs/lib.pyx | 1 + pandas/_libs/missing.pyx | 4 ++-- pandas/tests/dtypes/test_missing.py | 6 +++++- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 572b756542e33..2c8e44777a59c 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -159,7 +159,7 @@ Indexing Missing ^^^^^^^ -- Bug in :meth:`Index.equals` raising ``TypeError` when :class:`Index` consists of tuples that contain ``NA`` (:issue:`40000`) +- Bug in :meth:`Index.equals` raising ``TypeError`` when :class:`Index` consists of tuples that contain ``NA`` (:issue:`48446`) - MultiIndex diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4e382c4ed7929..b202b7b1a152b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -662,6 +662,7 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: return False elif 
check_na_tuples_nonequal(x, y): # We have tuples where one Side has a NA and the other side does not + # Only condition we may end up with a TypeError return False raise diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 1f9a65480e79a..26d44a6376f4f 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -62,9 +62,9 @@ cpdef bint check_na_tuples_nonequal(object left, object right): return False for left_element, right_element in zip(left, right): - if left_element is C_NA and not right_element is C_NA: + if left_element is C_NA and right_element is not C_NA: return True - elif right_element is C_NA and not left_element is C_NA: + elif right_element is C_NA and left_element is not C_NA: return True return False diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index bac4a4134fed5..e1dd182a5ae30 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -539,16 +539,20 @@ def test_array_equivalent_nested(): def test_array_equivalent_index_with_tuples(): - # GH# + # GH#48446 idx1 = pd.Index(np.array([(pd.NA, 4), (1, 1)], dtype="object")) idx2 = pd.Index(np.array([(1, 1), (pd.NA, 4)], dtype="object")) assert not array_equivalent(idx1, idx2) assert not idx1.equals(idx2) + assert not array_equivalent(idx2, idx1) + assert not idx2.equals(idx1) idx1 = pd.Index(np.array([(4, pd.NA), (1, 1)], dtype="object")) idx2 = pd.Index(np.array([(1, 1), (4, pd.NA)], dtype="object")) assert not array_equivalent(idx1, idx2) assert not idx1.equals(idx2) + assert not array_equivalent(idx2, idx1) + assert not idx2.equals(idx1) @pytest.mark.parametrize( From 626f1dcead31032f83e4ef57c42c73aece3f780c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 12 Sep 2022 20:24:00 +0200 Subject: [PATCH 3/3] Add sanity check --- pandas/_libs/missing.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 26d44a6376f4f..518f17d840dd5 
100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -61,6 +61,9 @@ cpdef bint check_na_tuples_nonequal(object left, object right): if not isinstance(left, tuple) or not isinstance(right, tuple): return False + if len(left) != len(right): + return False + for left_element, right_element in zip(left, right): if left_element is C_NA and right_element is not C_NA: return True