diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b081f743f9b0b..22a5f2a08362f 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -278,6 +278,7 @@ Other enhancements - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) +- :meth:`DataFrame.compare` now accepts an argument ``result_names`` that lets the user specify the names used in the result for the left and right DataFrames being compared. The default is ``('self', 'other')`` (:issue:`44354`) .. --------------------------------------------------------------------------- .. 
_whatsnew_150.notable_bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e70312c562907..9c90dffbf4df6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7776,6 +7776,14 @@ def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: 0 a c NaN NaN 2 NaN NaN 3.0 4.0 +Assign result_names + +>>> df.compare(df2, result_names=("left", "right")) + col1 col3 + left right left right +0 a c NaN NaN +2 NaN NaN 3.0 4.0 + Stack the differences on rows >>> df.compare(df2, align_axis=0) @@ -7823,12 +7831,14 @@ def compare( align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, + result_names: Suffixes = ("self", "other"), ) -> DataFrame: return super().compare( other=other, align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, + result_names=result_names, ) def combine( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6e00f33f486d9..6e1df8fa3e270 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -59,6 +59,7 @@ Renamer, SortKind, StorageOptions, + Suffixes, T, TimedeltaConvertibleTypes, TimestampConvertibleTypes, @@ -8970,6 +8971,7 @@ def compare( align_axis: Axis = 1, keep_shape: bool_t = False, keep_equal: bool_t = False, + result_names: Suffixes = ("self", "other"), ): from pandas.core.reshape.concat import concat @@ -8980,7 +8982,6 @@ def compare( ) mask = ~((self == other) | (self.isna() & other.isna())) - keys = ["self", "other"] if not keep_equal: self = self.where(mask) @@ -8995,13 +8996,18 @@ def compare( else: self = self[mask] other = other[mask] + if not isinstance(result_names, tuple): + raise TypeError( + f"Passing 'result_names' as a {type(result_names)} is not " + "supported. Provide 'result_names' as a tuple instead." 
+ ) if align_axis in (1, "columns"): # This is needed for Series axis = 1 else: axis = self._get_axis_number(align_axis) - diff = concat([self, other], axis=axis, keys=keys) + diff = concat([self, other], axis=axis, keys=result_names) if axis >= self.ndim: # No need to reorganize data if stacking on new axis diff --git a/pandas/core/series.py b/pandas/core/series.py index 60898ee75f7c2..67cdb5d8d72ab 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -165,6 +165,7 @@ from pandas._typing import ( NumpySorter, NumpyValueArrayLike, + Suffixes, ) from pandas.core.frame import DataFrame @@ -3237,12 +3238,14 @@ def compare( align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, + result_names: Suffixes = ("self", "other"), ) -> DataFrame | Series: return super().compare( other=other, align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, + result_names=result_names, ) def combine(self, other, func, fill_value=None) -> Series: diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 4b7a487e9472d..b7b75d6464da3 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -75,6 +75,11 @@ keep_equal : bool, default False If true, the result keeps values that are equal. Otherwise, equal values are shown as NaNs. + +result_names : tuple, default ('self', 'other') + Set the names used in the result for the two objects being compared. + + .. 
versionadded:: 1.5.0 """ _shared_docs[ diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 468811eba0d39..609242db453ba 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -180,3 +180,59 @@ def test_compare_unaligned_objects(): df1 = pd.DataFrame(np.ones((3, 3))) df2 = pd.DataFrame(np.zeros((2, 1))) df1.compare(df2) + + +def test_compare_result_names(): + # GH 44354 + df1 = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + ) + df2 = pd.DataFrame( + { + "col1": ["c", "b", "c"], + "col2": [1.0, 2.0, np.nan], + "col3": [1.0, 2.0, np.nan], + }, + ) + result = df1.compare(df2, result_names=("left", "right")) + expected = pd.DataFrame( + { + ("col1", "left"): {0: "a", 2: np.nan}, + ("col1", "right"): {0: "c", 2: np.nan}, + ("col3", "left"): {0: np.nan, 2: 3.0}, + ("col3", "right"): {0: np.nan, 2: np.nan}, + } + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "result_names", + [ + [1, 2], + "HK", + {"2": 2, "3": 3}, + 3, + 3.0, + ], +) +def test_invalid_input_result_names(result_names): + # GH 44354 + df1 = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + ) + df2 = pd.DataFrame( + { + "col1": ["c", "b", "c"], + "col2": [1.0, 2.0, np.nan], + "col3": [1.0, 2.0, np.nan], + }, + ) + with pytest.raises( + TypeError, + match=( + f"Passing 'result_names' as a {type(result_names)} is not " + "supported. Provide 'result_names' as a tuple instead." + ), + ): + df1.compare(df2, result_names=result_names)