ENH: add result_names argument to DataFrame.compare #44354 (#47643)

ahmedibrhm · web-flow · commit cb5a924cd851 · 2022-07-16T10:29:38.000-04:00
* DOC #45443 edited the documentation of where/mask functions * DOC #45443 edited the documentation of where/mask functions * Update generic.py * ENH: add suffixes argument to DataFrame.compare #44354 * Edited the tests * space fixing * Update shared_docs.py * Update series.py * Update series.py * invalid argument tests * issue reference * syntax editing * grammar fixing * edit doc * editting doc * Update 02_read_write.rst * Update 02_read_write.rst * Update v1.5.0.rst * Update v1.5.0.rst * np * 1.5.0 rst * created tests for invalid input * space * space * space * editing test
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -278,6 +278,7 @@ Other enhancements
 - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`)
 - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
 - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`)
+- :meth:`DataFrame.compare` now accepts an argument ``result_names`` to allow the user to specify the result's names of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.notable_bug_fixes:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -7776,6 +7776,14 @@ def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
 0    a     c  NaN   NaN
 2  NaN   NaN  3.0   4.0
 
+Assign result_names
+
+>>> df.compare(df2, result_names=("left", "right"))
+  col1       col3
+  left right left right
+0    a     c  NaN   NaN
+2  NaN   NaN  3.0   4.0
+
 Stack the differences on rows
 
 >>> df.compare(df2, align_axis=0)
@@ -7823,12 +7831,14 @@ def compare(
         align_axis: Axis = 1,
         keep_shape: bool = False,
         keep_equal: bool = False,
+        result_names: Suffixes = ("self", "other"),
     ) -> DataFrame:
         return super().compare(
             other=other,
             align_axis=align_axis,
             keep_shape=keep_shape,
             keep_equal=keep_equal,
+            result_names=result_names,
         )
 
     def combine(
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -59,6 +59,7 @@
     Renamer,
     SortKind,
     StorageOptions,
+    Suffixes,
     T,
     TimedeltaConvertibleTypes,
     TimestampConvertibleTypes,
@@ -8970,6 +8971,7 @@ def compare(
         align_axis: Axis = 1,
         keep_shape: bool_t = False,
         keep_equal: bool_t = False,
+        result_names: Suffixes = ("self", "other"),
     ):
         from pandas.core.reshape.concat import concat
 
@@ -8980,7 +8982,6 @@ def compare(
             )
 
         mask = ~((self == other) | (self.isna() & other.isna()))
-        keys = ["self", "other"]
 
         if not keep_equal:
             self = self.where(mask)
@@ -8995,13 +8996,18 @@ def compare(
             else:
                 self = self[mask]
                 other = other[mask]
+        if not isinstance(result_names, tuple):
+            raise TypeError(
+                f"Passing 'result_names' as a {type(result_names)} is not "
+                "supported. Provide 'result_names' as a tuple instead."
+            )
 
         if align_axis in (1, "columns"):  # This is needed for Series
             axis = 1
         else:
             axis = self._get_axis_number(align_axis)
 
-        diff = concat([self, other], axis=axis, keys=keys)
+        diff = concat([self, other], axis=axis, keys=result_names)
 
         if axis >= self.ndim:
             # No need to reorganize data if stacking on new axis
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -165,6 +165,7 @@
     from pandas._typing import (
         NumpySorter,
         NumpyValueArrayLike,
+        Suffixes,
     )
 
     from pandas.core.frame import DataFrame
@@ -3237,12 +3238,14 @@ def compare(
         align_axis: Axis = 1,
         keep_shape: bool = False,
         keep_equal: bool = False,
+        result_names: Suffixes = ("self", "other"),
     ) -> DataFrame | Series:
         return super().compare(
             other=other,
             align_axis=align_axis,
             keep_shape=keep_shape,
             keep_equal=keep_equal,
+            result_names=result_names,
         )
 
     def combine(self, other, func, fill_value=None) -> Series:
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
@@ -75,6 +75,11 @@
 keep_equal : bool, default False
     If true, the result keeps values that are equal.
     Otherwise, equal values are shown as NaNs.
+
+result_names : tuple, default ('self', 'other')
+    Set the dataframes names in the comparison.
+
+    .. versionadded:: 1.5.0
 """
 
 _shared_docs[
diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py
@@ -180,3 +180,59 @@ def test_compare_unaligned_objects():
         df1 = pd.DataFrame(np.ones((3, 3)))
         df2 = pd.DataFrame(np.zeros((2, 1)))
         df1.compare(df2)
+
+
+def test_compare_result_names():
+    # GH 44354
+    df1 = pd.DataFrame(
+        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
+    )
+    df2 = pd.DataFrame(
+        {
+            "col1": ["c", "b", "c"],
+            "col2": [1.0, 2.0, np.nan],
+            "col3": [1.0, 2.0, np.nan],
+        },
+    )
+    result = df1.compare(df2, result_names=("left", "right"))
+    expected = pd.DataFrame(
+        {
+            ("col1", "left"): {0: "a", 2: np.nan},
+            ("col1", "right"): {0: "c", 2: np.nan},
+            ("col3", "left"): {0: np.nan, 2: 3.0},
+            ("col3", "right"): {0: np.nan, 2: np.nan},
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "result_names",
+    [
+        [1, 2],
+        "HK",
+        {"2": 2, "3": 3},
+        3,
+        3.0,
+    ],
+)
+def test_invalid_input_result_names(result_names):
+    # GH 44354
+    df1 = pd.DataFrame(
+        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
+    )
+    df2 = pd.DataFrame(
+        {
+            "col1": ["c", "b", "c"],
+            "col2": [1.0, 2.0, np.nan],
+            "col3": [1.0, 2.0, np.nan],
+        },
+    )
+    with pytest.raises(
+        TypeError,
+        match=(
+            f"Passing 'result_names' as a {type(result_names)} is not "
+            "supported. Provide 'result_names' as a tuple instead."
+        ),
+    ):
+        df1.compare(df2, result_names=result_names)