Skip to content

Commit dfcadab

Browse files
vamsi-verma-snoatamir
authored and committed
BUG: pd.compare does not recognize differences when comparing values with null Int64 data type (pandas-dev#48966)
1 parent bf63e5c commit dfcadab

File tree

3 files changed

+43
-6
lines changed

3 files changed

+43
-6
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,8 @@ Indexing
218218
- Bug in :meth:`DataFrame.reindex` filling with wrong values when indexing columns and index for ``uint`` dtypes (:issue:`48184`)
219219
- Bug in :meth:`DataFrame.reindex` casting dtype to ``object`` when :class:`DataFrame` has single extension array column when re-indexing ``columns`` and ``index`` (:issue:`48190`)
220220
- Bug in :func:`~DataFrame.describe` when formatting percentiles in the resulting index showed more decimals than needed (:issue:`46362`)
221+
- Bug in :meth:`DataFrame.compare` not recognizing differences when comparing ``NA`` with a value in nullable dtypes (:issue:`48939`)
222+
-
221223

222224
Missing
223225
^^^^^^^

pandas/core/generic.py

+1
Original file line numberDiff line numberDiff line change
@@ -9261,6 +9261,7 @@ def compare(
92619261
)
92629262

92639263
mask = ~((self == other) | (self.isna() & other.isna()))
9264+
mask.fillna(True, inplace=True)
92649265

92659266
if not keep_equal:
92669267
self = self.where(mask)

pandas/tests/frame/methods/test_compare.py

+40-6
Original file line numberDiff line numberDiff line change
@@ -238,17 +238,51 @@ def test_invalid_input_result_names(result_names):
238238
df1.compare(df2, result_names=result_names)
239239

240240

241-
def test_compare_ea_and_np_dtype():
242-
# GH#44014
243-
df1 = pd.DataFrame({"a": [4.0, 4], "b": [1.0, 2]})
244-
df2 = pd.DataFrame({"a": pd.Series([1, pd.NA], dtype="Int64"), "b": [1.0, 2]})
241+
@pytest.mark.parametrize(
242+
"val1,val2",
243+
[(4, pd.NA), (pd.NA, pd.NA), (pd.NA, 4)],
244+
)
245+
def test_compare_ea_and_np_dtype(val1, val2):
246+
# GH 48966
247+
arr = [4.0, val1]
248+
ser = pd.Series([1, val2], dtype="Int64")
249+
250+
df1 = pd.DataFrame({"a": arr, "b": [1.0, 2]})
251+
df2 = pd.DataFrame({"a": ser, "b": [1.0, 2]})
252+
expected = pd.DataFrame(
253+
{
254+
("a", "self"): arr,
255+
("a", "other"): ser,
256+
("b", "self"): np.nan,
257+
("b", "other"): np.nan,
258+
}
259+
)
245260
result = df1.compare(df2, keep_shape=True)
261+
tm.assert_frame_equal(result, expected)
262+
263+
264+
@pytest.mark.parametrize(
265+
"df1_val,df2_val,diff_self,diff_other",
266+
[
267+
(4, 3, 4, 3),
268+
(4, 4, pd.NA, pd.NA),
269+
(4, pd.NA, 4, pd.NA),
270+
(pd.NA, pd.NA, pd.NA, pd.NA),
271+
],
272+
)
273+
def test_compare_nullable_int64_dtype(df1_val, df2_val, diff_self, diff_other):
274+
# GH 48966
275+
df1 = pd.DataFrame({"a": pd.Series([df1_val, pd.NA], dtype="Int64"), "b": [1.0, 2]})
276+
df2 = df1.copy()
277+
df2.loc[0, "a"] = df2_val
278+
246279
expected = pd.DataFrame(
247280
{
248-
("a", "self"): [4.0, np.nan],
249-
("a", "other"): pd.Series([1, pd.NA], dtype="Int64"),
281+
("a", "self"): pd.Series([diff_self, pd.NA], dtype="Int64"),
282+
("a", "other"): pd.Series([diff_other, pd.NA], dtype="Int64"),
250283
("b", "self"): np.nan,
251284
("b", "other"): np.nan,
252285
}
253286
)
287+
result = df1.compare(df2, keep_shape=True)
254288
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)