From 54c50687ddfcd79814aa1f854056b51eacd4e9e1 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 00:29:21 -0700 Subject: [PATCH 01/12] DOC #45443 edited the documentation of where/mask functions --- pandas/core/generic.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba3474a2513fb..b46eff137394c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9614,7 +9614,8 @@ def where( The {name} method is an application of the if-then idiom. For each element in the calling DataFrame, if ``cond`` is ``{cond}`` the element is used; otherwise the corresponding element from the DataFrame - ``other`` is used. + ``other`` is used. If `cond` {klass} is less in size than `other`, the default bool + for the missing value is {cond_rev}. The signature for :func:`DataFrame.where` differs from :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to @@ -9641,6 +9642,23 @@ def where( 4 NaN dtype: float64 + >>> s = pd.Series(range(5)) + >>> t = pd.Series([True, False]) + >>> s.where(t,99) + 0 0 + 1 99 + 2 99 + 3 99 + 4 99 + dtype: int64 + >>> s.mask(t, 99) + 0 99 + 1 1 + 2 99 + 3 99 + 4 99 + dtype: int64 + >>> s.where(s > 1, 10) 0 10 1 10 From 2951fb14ef8c589f50b5a28e76878de410968b79 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 00:39:29 -0700 Subject: [PATCH 02/12] DOC #45443 edited the documentation of where/mask functions --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b46eff137394c..489ad1e3bf5c2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9614,8 +9614,8 @@ def where( The {name} method is an application of the if-then idiom. 
For each element in the calling DataFrame, if ``cond`` is ``{cond}`` the element is used; otherwise the corresponding element from the DataFrame - ``other`` is used. If `cond` {klass} is less in size than `other`, the default bool - for the missing value is {cond_rev}. + ``other`` is used. If `cond` {klass} is less in size than `other`, the + default bool for the missing value is {cond_rev}. The signature for :func:`DataFrame.where` differs from :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to From 8afd6a1fad45a45326e0fdac46eb5cfd8ffac551 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 08:12:01 -0700 Subject: [PATCH 03/12] Update generic.py --- pandas/core/generic.py | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 489ad1e3bf5c2..ba3474a2513fb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9614,8 +9614,7 @@ def where( The {name} method is an application of the if-then idiom. For each element in the calling DataFrame, if ``cond`` is ``{cond}`` the element is used; otherwise the corresponding element from the DataFrame - ``other`` is used. If `cond` {klass} is less in size than `other`, the - default bool for the missing value is {cond_rev}. + ``other`` is used. The signature for :func:`DataFrame.where` differs from :func:`numpy.where`. 
Roughly ``df1.where(m, df2)`` is equivalent to @@ -9642,23 +9641,6 @@ def where( 4 NaN dtype: float64 - >>> s = pd.Series(range(5)) - >>> t = pd.Series([True, False]) - >>> s.where(t,99) - 0 0 - 1 99 - 2 99 - 3 99 - 4 99 - dtype: int64 - >>> s.mask(t, 99) - 0 99 - 1 1 - 2 99 - 3 99 - 4 99 - dtype: int64 - >>> s.where(s > 1, 10) 0 10 1 10 From a326359ca3743ecdfd4a64b303c9d0e8fa63b6fb Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 10:43:16 -0700 Subject: [PATCH 04/12] ENH: add suffixes argument to DataFrame.compare #44354 --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/frame.py | 10 +++++++++ pandas/core/generic.py | 13 +++++++++-- pandas/core/shared_docs.py | 6 +++++- pandas/tests/frame/methods/test_compare.py | 25 ++++++++++++++++++++++ 5 files changed, 52 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index c70acc0a0b18c..e9d5bd1ffd1e2 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -277,6 +277,7 @@ Other enhancements - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) +- :meth:`DataFrame.compare` now accepts a ``suffixes`` to allow the user to specify the suffixes of both left and right DataFrame which are being compared. This is by default ``self`` and ``other`` (:issue:`44354`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ead4ea744c647..ffa0b46896f98 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7776,6 +7776,14 @@ def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: 0 a c NaN NaN 2 NaN NaN 3.0 4.0 +Assign suffixes + +>>> df.compare(df2, suffixes=("left", "right")) + col1 col3 + left right left right +0 a c NaN NaN +2 NaN NaN 3.0 4.0 + Stack the differences on rows >>> df.compare(df2, align_axis=0) @@ -7823,12 +7831,14 @@ def compare( align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, + suffixes: Suffixes = ("self", "other"), ) -> DataFrame: return super().compare( other=other, align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, + suffixes=suffixes, ) def combine( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba3474a2513fb..b62f5aa088500 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -58,6 +58,7 @@ Renamer, SortKind, StorageOptions, + Suffixes, T, TimedeltaConvertibleTypes, TimestampConvertibleTypes, @@ -8965,6 +8966,7 @@ def compare( align_axis: Axis = 1, keep_shape: bool_t = False, keep_equal: bool_t = False, + suffixes: Suffixes = ("self", "other"), ): from pandas.core.reshape.concat import concat @@ -8975,7 +8977,6 @@ def compare( ) mask = ~((self == other) | (self.isna() & other.isna())) - keys = ["self", "other"] if not keep_equal: self = self.where(mask) @@ -8990,13 +8991,21 @@ def compare( else: self = self[mask] other = other[mask] + if not isinstance(suffixes, tuple): + warnings.warn( + f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give " + "unexpected results. Provide 'suffixes' as a tuple instead. 
In the " + "future a 'TypeError' will be raised.", + FutureWarning, + stacklevel=find_stack_level(), + ) if align_axis in (1, "columns"): # This is needed for Series axis = 1 else: axis = self._get_axis_number(align_axis) - diff = concat([self, other], axis=axis, keys=keys) + diff = concat([self, other], axis=axis, keys=suffixes) if axis >= self.ndim: # No need to reorganize data if stacking on new axis diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 4b7a487e9472d..039d37d70dd45 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -53,7 +53,7 @@ ] = """ Compare to another {klass} and show the differences. -.. versionadded:: 1.1.0 +.. versionadded:: 1.5.0 Parameters ---------- @@ -75,6 +75,10 @@ keep_equal : bool, default False If true, the result keeps values that are equal. Otherwise, equal values are shown as NaNs. + +suffixes : tuple, default ('self', 'other') + Set the dataframes names in the comparison. + """ _shared_docs[ diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 468811eba0d39..10e9a4c847e30 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -180,3 +180,28 @@ def test_compare_unaligned_objects(): df1 = pd.DataFrame(np.ones((3, 3))) df2 = pd.DataFrame(np.zeros((2, 1))) df1.compare(df2) + + +def test_compare_suffixes(): + # GH + df1 = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + columns=["col1", "col2", "col3"], + ) + df2 = df1.copy() + df2.loc[0, "col1"] = "c" + df2.loc[2, "col3"] = 4.0 + + suffixes = ["left", "right"] + comp = df1.compare(df2, suffixes=suffixes) + expected = pd.DataFrame( + { + ("col1", "left"): {0: "a", 2: np.nan}, + ("col1", "right"): {0: "c", 2: np.nan}, + ("col3", "left"): {0: np.nan, 2: 3.0}, + ("col3", "right"): {0: np.nan, 2: np.nan}, + } + ) + tm.assert_frame_equal(comp, expected) + result_suffixes = 
comp.columns.get_level_values(1).unique() + assert result_suffixes.isin(suffixes).all(), "suffixes not equal" From d9c4ca98071cec8094e18f2fafddcc35b2c21a12 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 11:12:08 -0700 Subject: [PATCH 05/12] Edited the tests --- pandas/tests/frame/methods/test_compare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 10e9a4c847e30..f51fbf11c1b30 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -190,9 +190,9 @@ def test_compare_suffixes(): ) df2 = df1.copy() df2.loc[0, "col1"] = "c" - df2.loc[2, "col3"] = 4.0 + df2.loc[2, "col3"] = np.nan - suffixes = ["left", "right"] + suffixes = ("left", "right") comp = df1.compare(df2, suffixes=suffixes) expected = pd.DataFrame( { From 1c54472c4e0c27aef3dc9f306a5063250a03997f Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 11:17:13 -0700 Subject: [PATCH 06/12] space fixing --- pandas/core/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b62f5aa088500..a8a0913d386ee 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8993,9 +8993,9 @@ def compare( other = other[mask] if not isinstance(suffixes, tuple): warnings.warn( - f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give " - "unexpected results. Provide 'suffixes' as a tuple instead. In the " - "future a 'TypeError' will be raised.", + f"Passing 'suffixes' as a {type(suffixes)}, is not supported " + "and may give unexpected results. Provide 'suffixes' as a tuple " + "instead. 
In the future a 'TypeError' will be raised.", FutureWarning, stacklevel=find_stack_level(), ) From 4d3482134ce57fa6e070d7f2860a39359d43aa77 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 12:54:39 -0700 Subject: [PATCH 07/12] Update shared_docs.py --- pandas/core/shared_docs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 039d37d70dd45..9b6c25ae80b83 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -78,7 +78,6 @@ suffixes : tuple, default ('self', 'other') Set the dataframes names in the comparison. - """ _shared_docs[ From 8fb6aa22a72fa82d9e35493c3bb699a74f9d7217 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 12:58:23 -0700 Subject: [PATCH 08/12] Update series.py --- pandas/core/series.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index ef4ea0172c505..a899facc918f5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -164,6 +164,7 @@ from pandas._typing import ( NumpySorter, NumpyValueArrayLike, + Suffixes, ) from pandas.core.frame import DataFrame @@ -3236,12 +3237,14 @@ def compare( align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, + suffixes: Suffixes = ("self", "other"), ) -> DataFrame | Series: return super().compare( other=other, align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, + suffixes = suffixes, ) def combine(self, other, func, fill_value=None) -> Series: From 1e33dea21d2967373e4940017c8e1661ec267c30 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 13:02:43 -0700 Subject: [PATCH 09/12] Update series.py --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 
a899facc918f5..8116706963bc1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3244,7 +3244,7 @@ def compare( align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, - suffixes = suffixes, + suffixes=suffixes, ) def combine(self, other, func, fill_value=None) -> Series: From ae6c75ad72c67d20308d4ab10461e66e6574e72a Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 13:46:34 -0700 Subject: [PATCH 10/12] invalid argument tests --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/generic.py | 9 +++------ pandas/core/shared_docs.py | 4 +++- pandas/tests/frame/methods/test_compare.py | 19 +++++++++---------- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index e9d5bd1ffd1e2..1cb1156da379d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -277,7 +277,7 @@ Other enhancements - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) -- :meth:`DataFrame.compare` now accepts a ``suffixes`` to allow the user to specify the suffixes of both left and right DataFrame which are being compared. This is by default ``self`` and ``other`` (:issue:`44354`) +- :meth:`DataFrame.compare` and :meth:`Series.compare` now accept a ``suffixes`` argument to specify the labels of the left and right objects being compared. The default is ``('self', 'other')`` (:issue:`44354`) - ..
--------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a8a0913d386ee..327e0912ca291 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8992,12 +8992,9 @@ def compare( self = self[mask] other = other[mask] if not isinstance(suffixes, tuple): - warnings.warn( - f"Passing 'suffixes' as a {type(suffixes)}, is not supported " - "and may give unexpected results. Provide 'suffixes' as a tuple " - "instead. In the future a 'TypeError' will be raised.", - FutureWarning, - stacklevel=find_stack_level(), + raise TypeError( + f"Passing 'suffixes' as a {type(suffixes)}, is not " + "supported Provide 'suffixes' as a tuple instead." ) if align_axis in (1, "columns"): # This is needed for Series diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 9b6c25ae80b83..f5b3bff521f2e 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -53,7 +53,7 @@ ] = """ Compare to another {klass} and show the differences. -.. versionadded:: 1.5.0 +.. versionadded:: 1.1.0 Parameters ---------- @@ -78,6 +78,8 @@ suffixes : tuple, default ('self', 'other') Set the dataframes names in the comparison. + + .. 
versionadded:: 1.5.0 """ _shared_docs[ diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index f51fbf11c1b30..4dbd5328a71b7 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -186,14 +186,15 @@ def test_compare_suffixes(): # GH df1 = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, - columns=["col1", "col2", "col3"], ) - df2 = df1.copy() - df2.loc[0, "col1"] = "c" - df2.loc[2, "col3"] = np.nan - - suffixes = ("left", "right") - comp = df1.compare(df2, suffixes=suffixes) + df2 = pd.DataFrame( + { + "col1": ["c", "b", "c"], + "col2": [1.0, 2.0, np.nan], + "col3": [1.0, 2.0, np.nan], + }, + ) + result = df1.compare(df2, suffixes=("left", "right")) expected = pd.DataFrame( { ("col1", "left"): {0: "a", 2: np.nan}, @@ -202,6 +203,4 @@ def test_compare_suffixes(): ("col3", "right"): {0: np.nan, 2: np.nan}, } ) - tm.assert_frame_equal(comp, expected) - result_suffixes = comp.columns.get_level_values(1).unique() - assert result_suffixes.isin(suffixes).all(), "suffixes not equal" + tm.assert_frame_equal(result, expected) From ee10dd32e69fedc453a5cad3d5b1c23dce042c86 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 13:48:44 -0700 Subject: [PATCH 11/12] issue reference --- pandas/tests/frame/methods/test_compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 4dbd5328a71b7..9ad58972621d8 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -183,7 +183,7 @@ def test_compare_unaligned_objects(): def test_compare_suffixes(): - # GH + #44354 df1 = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, ) From 077d274eefbc1eba793ef23bbff3d349ad984434 Mon Sep 17 
00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 13:50:29 -0700 Subject: [PATCH 12/12] syntax editing --- pandas/tests/frame/methods/test_compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 9ad58972621d8..18106fa3c2496 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -183,7 +183,7 @@ def test_compare_unaligned_objects(): def test_compare_suffixes(): - #44354 + # GH#44354 df1 = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, )