Skip to content

Commit cb5a924

Browse files
authored
ENH: add result_names argument to DataFrame.compare #44354 (#47643)
* DOC #45443 edited the documentation of where/mask functions * DOC #45443 edited the documentation of where/mask functions * Update generic.py * ENH: add suffixes argument to DataFrame.compare #44354 * Edited the tests * space fixing * Update shared_docs.py * Update series.py * Update series.py * invalid argument tests * issue reference * syntax editing * grammar fixing * edit doc * editting doc * Update 02_read_write.rst * Update 02_read_write.rst * Update v1.5.0.rst * Update v1.5.0.rst * np * 1.5.0 rst * created tests for invalid input * space * space * space * editing test
1 parent 0b8d8bb commit cb5a924

File tree

6 files changed

+83
-2
lines changed

6 files changed

+83
-2
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ Other enhancements
278278
- :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`)
279279
- :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
280280
- :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`)
281+
- :meth:`DataFrame.compare` and :meth:`Series.compare` now accept a ``result_names`` argument that lets the user specify the names given to the left and right objects in the comparison result. The default is ``('self', 'other')`` (:issue:`44354`)
281282

282283
.. ---------------------------------------------------------------------------
283284
.. _whatsnew_150.notable_bug_fixes:

pandas/core/frame.py

+10
Original file line numberDiff line numberDiff line change
@@ -7776,6 +7776,14 @@ def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
77767776
0 a c NaN NaN
77777777
2 NaN NaN 3.0 4.0
77787778
7779+
Assign result_names
7780+
7781+
>>> df.compare(df2, result_names=("left", "right"))
7782+
col1 col3
7783+
left right left right
7784+
0 a c NaN NaN
7785+
2 NaN NaN 3.0 4.0
7786+
77797787
Stack the differences on rows
77807788
77817789
>>> df.compare(df2, align_axis=0)
@@ -7823,12 +7831,14 @@ def compare(
78237831
align_axis: Axis = 1,
78247832
keep_shape: bool = False,
78257833
keep_equal: bool = False,
7834+
result_names: Suffixes = ("self", "other"),
78267835
) -> DataFrame:
78277836
return super().compare(
78287837
other=other,
78297838
align_axis=align_axis,
78307839
keep_shape=keep_shape,
78317840
keep_equal=keep_equal,
7841+
result_names=result_names,
78327842
)
78337843

78347844
def combine(

pandas/core/generic.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
Renamer,
6060
SortKind,
6161
StorageOptions,
62+
Suffixes,
6263
T,
6364
TimedeltaConvertibleTypes,
6465
TimestampConvertibleTypes,
@@ -8970,6 +8971,7 @@ def compare(
89708971
align_axis: Axis = 1,
89718972
keep_shape: bool_t = False,
89728973
keep_equal: bool_t = False,
8974+
result_names: Suffixes = ("self", "other"),
89738975
):
89748976
from pandas.core.reshape.concat import concat
89758977

@@ -8980,7 +8982,6 @@ def compare(
89808982
)
89818983

89828984
mask = ~((self == other) | (self.isna() & other.isna()))
8983-
keys = ["self", "other"]
89848985

89858986
if not keep_equal:
89868987
self = self.where(mask)
@@ -8995,13 +8996,18 @@ def compare(
89958996
else:
89968997
self = self[mask]
89978998
other = other[mask]
8999+
if not isinstance(result_names, tuple):
9000+
raise TypeError(
9001+
f"Passing 'result_names' as a {type(result_names)} is not "
9002+
"supported. Provide 'result_names' as a tuple instead."
9003+
)
89989004

89999005
if align_axis in (1, "columns"): # This is needed for Series
90009006
axis = 1
90019007
else:
90029008
axis = self._get_axis_number(align_axis)
90039009

9004-
diff = concat([self, other], axis=axis, keys=keys)
9010+
diff = concat([self, other], axis=axis, keys=result_names)
90059011

90069012
if axis >= self.ndim:
90079013
# No need to reorganize data if stacking on new axis

pandas/core/series.py

+3
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@
165165
from pandas._typing import (
166166
NumpySorter,
167167
NumpyValueArrayLike,
168+
Suffixes,
168169
)
169170

170171
from pandas.core.frame import DataFrame
@@ -3237,12 +3238,14 @@ def compare(
32373238
align_axis: Axis = 1,
32383239
keep_shape: bool = False,
32393240
keep_equal: bool = False,
3241+
result_names: Suffixes = ("self", "other"),
32403242
) -> DataFrame | Series:
32413243
return super().compare(
32423244
other=other,
32433245
align_axis=align_axis,
32443246
keep_shape=keep_shape,
32453247
keep_equal=keep_equal,
3248+
result_names=result_names,
32463249
)
32473250

32483251
def combine(self, other, func, fill_value=None) -> Series:

pandas/core/shared_docs.py

+5
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,11 @@
7575
keep_equal : bool, default False
7676
If true, the result keeps values that are equal.
7777
Otherwise, equal values are shown as NaNs.
78+
79+
result_names : tuple, default ('self', 'other')
80+
Names to use for ``self`` and ``other``, respectively, in the comparison result. Must be a tuple; any other type raises a ``TypeError``.
81+
82+
.. versionadded:: 1.5.0
7883
"""
7984

8085
_shared_docs[

pandas/tests/frame/methods/test_compare.py

+56
Original file line numberDiff line numberDiff line change
@@ -180,3 +180,59 @@ def test_compare_unaligned_objects():
180180
df1 = pd.DataFrame(np.ones((3, 3)))
181181
df2 = pd.DataFrame(np.zeros((2, 1)))
182182
df1.compare(df2)
183+
184+
185+
def test_compare_result_names():
    """Custom ``result_names`` replace the default 'self'/'other' labels.

    Regression test for GH 44354.
    """
    # GH 44354
    left_frame = pd.DataFrame(
        {
            "col1": ["a", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, 3.0],
        },
    )
    right_frame = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    # The frames differ only in col1 at row 0 and col3 at row 2; col2 is
    # identical in both inputs, so it does not appear in the expected frame.
    expected_data = {
        ("col1", "left"): {0: "a", 2: np.nan},
        ("col1", "right"): {0: "c", 2: np.nan},
        ("col3", "left"): {0: np.nan, 2: 3.0},
        ("col3", "right"): {0: np.nan, 2: np.nan},
    }
    expected = pd.DataFrame(expected_data)

    observed = left_frame.compare(right_frame, result_names=("left", "right"))
    tm.assert_frame_equal(observed, expected)
207+
208+
209+
@pytest.mark.parametrize(
    "result_names",
    [
        [1, 2],
        "HK",
        {"2": 2, "3": 3},
        3,
        3.0,
    ],
)
def test_invalid_input_result_names(result_names):
    """Non-tuple ``result_names`` must be rejected with a ``TypeError``.

    Regression test for GH 44354. Each parametrized value (list, str, dict,
    int, float) is a non-tuple and is expected to trigger the error.
    """
    # GH 44354
    # Local import: the module-level import block is not part of this excerpt.
    import re

    df1 = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    df2 = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    # ``pytest.raises(match=...)`` interprets its argument as a regular
    # expression. The message contains ``.`` and a type repr, so escape it to
    # make the comparison literal rather than pattern-based.
    expected_message = re.escape(
        f"Passing 'result_names' as a {type(result_names)} is not "
        "supported. Provide 'result_names' as a tuple instead."
    )
    with pytest.raises(TypeError, match=expected_message):
        df1.compare(df2, result_names=result_names)

0 commit comments

Comments
 (0)