From f027ff9bbbe4d41bac13c9994d395131df09c3ec Mon Sep 17 00:00:00 2001 From: Vamsi Verma Date: Wed, 5 Oct 2022 20:55:01 +0530 Subject: [PATCH 1/9] BUG: df.compare returns incorrect result when comparing NA values --- pandas/core/generic.py | 1 + pandas/tests/frame/methods/test_compare.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3df5d2aaf9896..fcdff306495ed 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9262,6 +9262,7 @@ def compare( ) mask = ~((self == other) | (self.isna() & other.isna())) + mask.fillna(True, inplace=True) if not keep_equal: self = self.where(mask) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 55e5db9603fe5..ecbdbe3707b2e 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -245,7 +245,7 @@ def test_compare_ea_and_np_dtype(): result = df1.compare(df2, keep_shape=True) expected = pd.DataFrame( { - ("a", "self"): [4.0, np.nan], + ("a", "self"): [4.0, 4], ("a", "other"): pd.Series([1, pd.NA], dtype="Int64"), ("b", "self"): np.nan, ("b", "other"): np.nan, From 0f3fcef5c848c43f951b03cf494ad2553c4a8122 Mon Sep 17 00:00:00 2001 From: Vamsi Verma Date: Thu, 6 Oct 2022 02:15:45 +0000 Subject: [PATCH 2/9] add bug fix note --- doc/source/whatsnew/v1.6.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 3c7a80f096844..f1c0654066058 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -209,6 +209,8 @@ Indexing - Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`) - Bug in :meth:`DataFrame.reindex` casting dtype to ``object`` when :class:`DataFrame` has single extension array column when re-indexing ``columns`` and ``index`` (:issue:`48190`) - Bug in :func:`~DataFrame.describe` when formatting percentiles in the resulting index showed more decimals than needed (:issue:`46362`) +- Bug in :meth:`DataFrame.compare` does not recognize differences with the nullable Int64 data type (:issue:`48939`) +- Missing ^^^^^^^ From 9c0a19be0411567b397816457f208cd7298d342f Mon Sep 17 00:00:00 2001 From: Vamsi Verma Date: Thu, 6 Oct 2022 04:29:39 +0000 Subject: [PATCH 3/9] add pd.compare tests specific to int64 null dtype --- pandas/tests/frame/methods/test_compare.py | 32 ++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index ecbdbe3707b2e..fef1cc4d1d109 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -252,3 +252,35 @@ def test_compare_ea_and_np_dtype(): } ) tm.assert_frame_equal(result, expected) + + +def test_compare_with_equal_null_int64_dtypes(): + # GH #48939 + # two int64 nulls are considered same. + df1 = pd.DataFrame({"a": pd.Series([4.0, pd.NA], dtype="Int64"), "b": [1.0, 2]}) + df2 = pd.DataFrame({"a": pd.Series([3, pd.NA], dtype="Int64"), "b": [1.0, 2]}) + result = df1.compare(df2) + expected = pd.DataFrame( + { + ("a", "self"): pd.Series([4], dtype="Int64"), + ("a", "other"): pd.Series([3], dtype="Int64"), + } + ) + tm.assert_frame_equal(result, expected) + + +def test_compare_with_unequal_null_int64_dtypes(): + # GH #48939 + # comparison with int64 null dtype shouldn't obscure result. + df1 = pd.DataFrame({"a": pd.Series([4.0, 4], dtype="Int64"), "b": [1.0, 2]}) + df2 = pd.DataFrame({"a": pd.Series([1, pd.NA], dtype="Int64"), "b": [1.0, 2]}) + result = df1.compare(df2, keep_shape=True) + expected = pd.DataFrame( + { + ("a", "self"): pd.Series([4.0, 4], dtype="Int64"), + ("a", "other"): pd.Series([1, pd.NA], dtype="Int64"), + ("b", "self"): np.nan, + ("b", "other"): np.nan, + } + ) + tm.assert_frame_equal(result, expected) From 0b5ec5c746f0a9c307210eef6c3cf5bd4ad5f165 Mon Sep 17 00:00:00 2001 From: Vamsi Verma Date: Thu, 6 Oct 2022 04:38:43 +0000 Subject: [PATCH 4/9] change bug fix note to be specifically about int64 dtype null values --- doc/source/whatsnew/v1.6.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index f1c0654066058..f0a5d03f65efc 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -209,7 +209,7 @@ Indexing - Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`) - Bug in :meth:`DataFrame.reindex` casting dtype to ``object`` when :class:`DataFrame` has single extension array column when re-indexing ``columns`` and ``index`` (:issue:`48190`) - Bug in :func:`~DataFrame.describe` when formatting percentiles in the resulting index showed more decimals than needed (:issue:`46362`) -- Bug in :meth:`DataFrame.compare` does not recognize differences with the nullable Int64 data type (:issue:`48939`) +- Bug in :meth:`DataFrame.compare` does not recognize differences when comparing values with null Int64 data type (:issue:`48939`) - Missing From d72e0e3b76b945cd0643d231e06af3df7957ff85 Mon Sep 17 00:00:00 2001 From: Vamsi Verma Date: Thu, 6 Oct 2022 09:24:44 +0000 Subject: [PATCH 5/9] change bug fix note --- doc/source/whatsnew/v1.6.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index f0a5d03f65efc..626b20da4f5aa 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -209,7 +209,7 @@ Indexing - Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`) - Bug in :meth:`DataFrame.reindex` casting dtype to ``object`` when :class:`DataFrame` has single extension array column when re-indexing ``columns`` and ``index`` (:issue:`48190`) - Bug in :func:`~DataFrame.describe` when formatting percentiles in the resulting index showed more decimals than needed (:issue:`46362`) -- Bug in :meth:`DataFrame.compare` does not recognize differences when comparing values with null Int64 data type (:issue:`48939`) +- Bug in :meth:`DataFrame.compare` does not recognize differences when comparing ``NA`` with value in nullable dtypes (:issue:`48939`) - Missing From 4412e331151d90fc5a4d97b41106453e70c1d774 Mon Sep 17 00:00:00 2001 From: Vamsi Verma Date: Thu, 6 Oct 2022 11:29:17 +0000 Subject: [PATCH 6/9] add nullable int64 test and parameterize ea and np dtype test --- pandas/tests/frame/methods/test_compare.py | 96 +++++++++++++--------- 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index fef1cc4d1d109..d4dfb20719287 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -238,49 +238,65 @@ def test_invalid_input_result_names(result_names): df1.compare(df2, result_names=result_names) -def test_compare_ea_and_np_dtype(): - # GH#44014 - df1 = pd.DataFrame({"a": [4.0, 4], "b": [1.0, 2]}) - df2 = pd.DataFrame({"a": pd.Series([1, pd.NA], dtype="Int64"), "b": [1.0, 2]}) +@pytest.mark.parametrize( + "df1,df2,expected", + [ + ( + pd.DataFrame({"a": [4.0, 4], "b": [1.0, 2]}), + pd.DataFrame({"a": pd.Series([1, pd.NA], dtype="Int64"), "b": [1.0, 2]}), + pd.DataFrame( + { + ("a", "self"): [4.0, 4], + ("a", "other"): pd.Series([1, pd.NA], dtype="Int64"), + ("b", "self"): np.nan, + ("b", "other"): np.nan, + } + ), + ), + ( + pd.DataFrame({"a": pd.Series([4, pd.NA], dtype="Int64"), "b": [1.0, 2]}), + pd.DataFrame({"a": [4, pd.NA], "b": [1.0, 2]}), + pd.DataFrame( + { + ("a", "self"): pd.Series([4, pd.NA], dtype="Int64"), + ("a", "other"): pd.Series([4, pd.NA]), + ("b", "self"): np.nan, + ("b", "other"): np.nan, + } + ), + ), + ], +) +def test_compare_ea_and_np_dtype(df1, df2, expected): result = df1.compare(df2, keep_shape=True) - expected = pd.DataFrame( - { - ("a", "self"): [4.0, 4], - ("a", "other"): pd.Series([1, pd.NA], dtype="Int64"), - ("b", "self"): np.nan, - ("b", "other"): np.nan, - } - ) tm.assert_frame_equal(result, expected) -def test_compare_with_equal_null_int64_dtypes(): - # GH #48939 - # two int64 nulls are considered same. - df1 = pd.DataFrame({"a": pd.Series([4.0, pd.NA], dtype="Int64"), "b": [1.0, 2]}) - df2 = pd.DataFrame({"a": pd.Series([3, pd.NA], dtype="Int64"), "b": [1.0, 2]}) - result = df1.compare(df2) - expected = pd.DataFrame( - { - ("a", "self"): pd.Series([4], dtype="Int64"), - ("a", "other"): pd.Series([3], dtype="Int64"), - } - ) - tm.assert_frame_equal(result, expected) - - -def test_compare_with_unequal_null_int64_dtypes(): - # GH #48939 - # comparison with int64 null dtype shouldn't obscure result. - df1 = pd.DataFrame({"a": pd.Series([4.0, 4], dtype="Int64"), "b": [1.0, 2]}) - df2 = pd.DataFrame({"a": pd.Series([1, pd.NA], dtype="Int64"), "b": [1.0, 2]}) +@pytest.mark.parametrize( + "df1,df2,expected", + [ + ( + pd.DataFrame({"a": pd.Series([4.0, pd.NA], dtype="Int64")}), + pd.DataFrame({"a": pd.Series([3, pd.NA], dtype="Int64")}), + pd.DataFrame( + { + ("a", "self"): pd.Series([4, pd.NA], dtype="Int64"), + ("a", "other"): pd.Series([3, pd.NA], dtype="Int64"), + } + ), + ), + ( + pd.DataFrame({"a": pd.Series([4, pd.NA], dtype="Int64")}), + pd.DataFrame({"a": pd.Series([4, pd.NA], dtype="Int64")}), + pd.DataFrame( + { + ("a", "self"): pd.Series([pd.NA, pd.NA], dtype="Int64"), + ("a", "other"): pd.Series([pd.NA, pd.NA], dtype="Int64"), + } + ), + ), + ], +) +def test_compare_nullable_int64_dtype(df1, df2, expected): result = df1.compare(df2, keep_shape=True) - expected = pd.DataFrame( - { - ("a", "self"): pd.Series([4.0, 4], dtype="Int64"), - ("a", "other"): pd.Series([1, pd.NA], dtype="Int64"), - ("b", "self"): np.nan, - ("b", "other"): np.nan, - } - ) tm.assert_frame_equal(result, expected) From 9b97320a20a996337162161bc36e207b2c2ea0c9 Mon Sep 17 00:00:00 2001 From: Vamsi Verma Date: Wed, 12 Oct 2022 04:23:19 +0000 Subject: [PATCH 7/9] fix test parametrization --- pandas/tests/frame/methods/test_compare.py | 87 +++++++++------------- 1 file changed, 36 insertions(+), 51 deletions(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index d4dfb20719287..46c79cec4a210 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -239,64 +239,49 @@ def test_invalid_input_result_names(result_names): @pytest.mark.parametrize( - "df1,df2,expected", - [ - ( - pd.DataFrame({"a": [4.0, 4], "b": [1.0, 2]}), - pd.DataFrame({"a": pd.Series([1, pd.NA], dtype="Int64"), "b": [1.0, 2]}), - pd.DataFrame( - { - ("a", "self"): [4.0, 4], - ("a", "other"): pd.Series([1, pd.NA], dtype="Int64"), - ("b", "self"): np.nan, - ("b", "other"): np.nan, - } - ), - ), - ( - pd.DataFrame({"a": pd.Series([4, pd.NA], dtype="Int64"), "b": [1.0, 2]}), - pd.DataFrame({"a": [4, pd.NA], "b": [1.0, 2]}), - pd.DataFrame( - { - ("a", "self"): pd.Series([4, pd.NA], dtype="Int64"), - ("a", "other"): pd.Series([4, pd.NA]), - ("b", "self"): np.nan, - ("b", "other"): np.nan, - } - ), - ), - ], + "ea_val,np_dtype_val", + [(4, pd.NA), (pd.NA, pd.NA), (pd.NA, 4)], ) -def test_compare_ea_and_np_dtype(df1, df2, expected): - result = df1.compare(df2, keep_shape=True) +def test_compare_ea_and_np_dtype(ea_val, np_dtype_val): + ea = [4.0, ea_val] + np_dtype = pd.Series([1, np_dtype_val], dtype="Int64") + + ea_df = pd.DataFrame({"a": ea, "b": [1.0, 2]}) + np_dtype_df = pd.DataFrame({"a": np_dtype, "b": [1.0, 2]}) + expected = pd.DataFrame( + { + ("a", "self"): ea, + ("a", "other"): np_dtype, + ("b", "self"): np.nan, + ("b", "other"): np.nan, + } + ) + result = ea_df.compare(np_dtype_df, keep_shape=True) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "df1,df2,expected", + "df1_val,df2_val,diff_self,diff_other", [ - ( - pd.DataFrame({"a": pd.Series([4.0, pd.NA], dtype="Int64")}), - pd.DataFrame({"a": pd.Series([3, pd.NA], dtype="Int64")}), - pd.DataFrame( - { - ("a", "self"): pd.Series([4, pd.NA], dtype="Int64"), - ("a", "other"): pd.Series([3, pd.NA], dtype="Int64"), - } - ), - ), - ( - pd.DataFrame({"a": pd.Series([4, pd.NA], dtype="Int64")}), - pd.DataFrame({"a": pd.Series([4, pd.NA], dtype="Int64")}), - pd.DataFrame( - { - ("a", "self"): pd.Series([pd.NA, pd.NA], dtype="Int64"), - ("a", "other"): pd.Series([pd.NA, pd.NA], dtype="Int64"), - } - ), - ), + (4, 3, 4, 3), + (4, 4, pd.NA, pd.NA), + (4, pd.NA, 4, pd.NA), + (pd.NA, pd.NA, pd.NA, pd.NA), ], ) -def test_compare_nullable_int64_dtype(df1, df2, expected): +def test_compare_nullable_int64_dtype(df1_val, df2_val, diff_self, diff_other): + + df1 = pd.DataFrame({"a": pd.Series([df1_val, pd.NA], dtype="Int64"), "b": [1.0, 2]}) + df2 = df1.copy() + df2.loc[0, "a"] = df2_val + + expected = pd.DataFrame( + { + ("a", "self"): pd.Series([diff_self, pd.NA], dtype="Int64"), + ("a", "other"): pd.Series([diff_other, pd.NA], dtype="Int64"), + ("b", "self"): np.nan, + ("b", "other"): np.nan, + } + ) result = df1.compare(df2, keep_shape=True) tm.assert_frame_equal(result, expected) From 4838dc07aaebc2da782556ca4f1d61910c520615 Mon Sep 17 00:00:00 2001 From: Vamsi Verma Date: Fri, 14 Oct 2022 03:08:57 +0000 Subject: [PATCH 8/9] rename variables, add gh refs --- pandas/tests/frame/methods/test_compare.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 46c79cec4a210..f97640b9a8045 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -239,24 +239,25 @@ def test_invalid_input_result_names(result_names): @pytest.mark.parametrize( - "ea_val,np_dtype_val", + "val1,val2", [(4, pd.NA), (pd.NA, pd.NA), (pd.NA, 4)], ) -def test_compare_ea_and_np_dtype(ea_val, np_dtype_val): - ea = [4.0, ea_val] - np_dtype = pd.Series([1, np_dtype_val], dtype="Int64") +def test_compare_ea_and_np_dtype(val1, val2): + # GH 48966 + arr = [4.0, val1] + np_dtype_arr = pd.Series([1, val2], dtype="Int64") - ea_df = pd.DataFrame({"a": ea, "b": [1.0, 2]}) - np_dtype_df = pd.DataFrame({"a": np_dtype, "b": [1.0, 2]}) + df1 = pd.DataFrame({"a": arr, "b": [1.0, 2]}) + df2 = pd.DataFrame({"a": np_dtype_arr, "b": [1.0, 2]}) expected = pd.DataFrame( { - ("a", "self"): ea, - ("a", "other"): np_dtype, + ("a", "self"): arr, + ("a", "other"): np_dtype_arr, ("b", "self"): np.nan, ("b", "other"): np.nan, } ) - result = ea_df.compare(np_dtype_df, keep_shape=True) + result = df1.compare(df2, keep_shape=True) tm.assert_frame_equal(result, expected) @@ -270,7 +271,7 @@ def test_compare_ea_and_np_dtype(ea_val, np_dtype_val): ], ) def test_compare_nullable_int64_dtype(df1_val, df2_val, diff_self, diff_other): - + # GH 48966 df1 = pd.DataFrame({"a": pd.Series([df1_val, pd.NA], dtype="Int64"), "b": [1.0, 2]}) df2 = df1.copy() df2.loc[0, "a"] = df2_val From 3f739ddea7ba0c0727d89bfa8f9f01d877f0bb3c Mon Sep 17 00:00:00 2001 From: Vamsi Verma Date: Fri, 14 Oct 2022 15:09:54 +0000 Subject: [PATCH 9/9] change var name --- pandas/tests/frame/methods/test_compare.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index f97640b9a8045..2c47285d7c507 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -245,14 +245,14 @@ def test_invalid_input_result_names(result_names): def test_compare_ea_and_np_dtype(val1, val2): # GH 48966 arr = [4.0, val1] - np_dtype_arr = pd.Series([1, val2], dtype="Int64") + ser = pd.Series([1, val2], dtype="Int64") df1 = pd.DataFrame({"a": arr, "b": [1.0, 2]}) - df2 = pd.DataFrame({"a": np_dtype_arr, "b": [1.0, 2]}) + df2 = pd.DataFrame({"a": ser, "b": [1.0, 2]}) expected = pd.DataFrame( { ("a", "self"): arr, - ("a", "other"): np_dtype_arr, + ("a", "other"): ser, ("b", "self"): np.nan, ("b", "other"): np.nan, }