From 54c50687ddfcd79814aa1f854056b51eacd4e9e1 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 00:29:21 -0700 Subject: [PATCH 01/26] DOC #45443 edited the documentation of where/mask functions --- pandas/core/generic.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba3474a2513fb..b46eff137394c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9614,7 +9614,8 @@ def where( The {name} method is an application of the if-then idiom. For each element in the calling DataFrame, if ``cond`` is ``{cond}`` the element is used; otherwise the corresponding element from the DataFrame - ``other`` is used. + ``other`` is used. If `cond` {klass} is less in size than `other`, the default bool + for the missing value is {cond_rev}. The signature for :func:`DataFrame.where` differs from :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to @@ -9641,6 +9642,23 @@ def where( 4 NaN dtype: float64 + >>> s = pd.Series(range(5)) + >>> t = pd.Series([True, False]) + >>> s.where(t,99) + 0 0 + 1 99 + 2 99 + 3 99 + 4 99 + dtype: int64 + >>> s.mask(t, 99) + 0 99 + 1 1 + 2 99 + 3 99 + 4 99 + dtype: int64 + >>> s.where(s > 1, 10) 0 10 1 10 From 2951fb14ef8c589f50b5a28e76878de410968b79 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 00:39:29 -0700 Subject: [PATCH 02/26] DOC #45443 edited the documentation of where/mask functions --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b46eff137394c..489ad1e3bf5c2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9614,8 +9614,8 @@ def where( The {name} method is an application of the if-then idiom. 
For each element in the calling DataFrame, if ``cond`` is ``{cond}`` the element is used; otherwise the corresponding element from the DataFrame - ``other`` is used. If `cond` {klass} is less in size than `other`, the default bool - for the missing value is {cond_rev}. + ``other`` is used. If `cond` {klass} is less in size than `other`, the + default bool for the missing value is {cond_rev}. The signature for :func:`DataFrame.where` differs from :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to From 8afd6a1fad45a45326e0fdac46eb5cfd8ffac551 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 08:12:01 -0700 Subject: [PATCH 03/26] Update generic.py --- pandas/core/generic.py | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 489ad1e3bf5c2..ba3474a2513fb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9614,8 +9614,7 @@ def where( The {name} method is an application of the if-then idiom. For each element in the calling DataFrame, if ``cond`` is ``{cond}`` the element is used; otherwise the corresponding element from the DataFrame - ``other`` is used. If `cond` {klass} is less in size than `other`, the - default bool for the missing value is {cond_rev}. + ``other`` is used. The signature for :func:`DataFrame.where` differs from :func:`numpy.where`. 
Roughly ``df1.where(m, df2)`` is equivalent to @@ -9642,23 +9641,6 @@ def where( 4 NaN dtype: float64 - >>> s = pd.Series(range(5)) - >>> t = pd.Series([True, False]) - >>> s.where(t,99) - 0 0 - 1 99 - 2 99 - 3 99 - 4 99 - dtype: int64 - >>> s.mask(t, 99) - 0 99 - 1 1 - 2 99 - 3 99 - 4 99 - dtype: int64 - >>> s.where(s > 1, 10) 0 10 1 10 From a326359ca3743ecdfd4a64b303c9d0e8fa63b6fb Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 10:43:16 -0700 Subject: [PATCH 04/26] ENH: add suffixes argument to DataFrame.compare #44354 --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/frame.py | 10 +++++++++ pandas/core/generic.py | 13 +++++++++-- pandas/core/shared_docs.py | 6 +++++- pandas/tests/frame/methods/test_compare.py | 25 ++++++++++++++++++++++ 5 files changed, 52 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index c70acc0a0b18c..e9d5bd1ffd1e2 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -277,6 +277,7 @@ Other enhancements - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) +- :meth:`DataFrame.compare` now accepts a ``suffixes`` to allow the user to specify the suffixes of both left and right DataFrame which are being compared. This is by default ``self`` and ``other`` (:issue:`44354`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ead4ea744c647..ffa0b46896f98 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7776,6 +7776,14 @@ def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: 0 a c NaN NaN 2 NaN NaN 3.0 4.0 +Assign suffixes + +>>> df.compare(df2, suffixes=("left", "right")) + col1 col3 + left right left right +0 a c NaN NaN +2 NaN NaN 3.0 4.0 + Stack the differences on rows >>> df.compare(df2, align_axis=0) @@ -7823,12 +7831,14 @@ def compare( align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, + suffixes: Suffixes = ("self", "other"), ) -> DataFrame: return super().compare( other=other, align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, + suffixes=suffixes, ) def combine( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba3474a2513fb..b62f5aa088500 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -58,6 +58,7 @@ Renamer, SortKind, StorageOptions, + Suffixes, T, TimedeltaConvertibleTypes, TimestampConvertibleTypes, @@ -8965,6 +8966,7 @@ def compare( align_axis: Axis = 1, keep_shape: bool_t = False, keep_equal: bool_t = False, + suffixes: Suffixes = ("self", "other"), ): from pandas.core.reshape.concat import concat @@ -8975,7 +8977,6 @@ def compare( ) mask = ~((self == other) | (self.isna() & other.isna())) - keys = ["self", "other"] if not keep_equal: self = self.where(mask) @@ -8990,13 +8991,21 @@ def compare( else: self = self[mask] other = other[mask] + if not isinstance(suffixes, tuple): + warnings.warn( + f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give " + "unexpected results. Provide 'suffixes' as a tuple instead. 
In the " + "future a 'TypeError' will be raised.", + FutureWarning, + stacklevel=find_stack_level(), + ) if align_axis in (1, "columns"): # This is needed for Series axis = 1 else: axis = self._get_axis_number(align_axis) - diff = concat([self, other], axis=axis, keys=keys) + diff = concat([self, other], axis=axis, keys=suffixes) if axis >= self.ndim: # No need to reorganize data if stacking on new axis diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 4b7a487e9472d..039d37d70dd45 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -53,7 +53,7 @@ ] = """ Compare to another {klass} and show the differences. -.. versionadded:: 1.1.0 +.. versionadded:: 1.5.0 Parameters ---------- @@ -75,6 +75,10 @@ keep_equal : bool, default False If true, the result keeps values that are equal. Otherwise, equal values are shown as NaNs. + +suffixes : tuple, default ('self', 'other') + Set the dataframes names in the comparison. + """ _shared_docs[ diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 468811eba0d39..10e9a4c847e30 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -180,3 +180,28 @@ def test_compare_unaligned_objects(): df1 = pd.DataFrame(np.ones((3, 3))) df2 = pd.DataFrame(np.zeros((2, 1))) df1.compare(df2) + + +def test_compare_suffixes(): + # GH + df1 = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + columns=["col1", "col2", "col3"], + ) + df2 = df1.copy() + df2.loc[0, "col1"] = "c" + df2.loc[2, "col3"] = 4.0 + + suffixes = ["left", "right"] + comp = df1.compare(df2, suffixes=suffixes) + expected = pd.DataFrame( + { + ("col1", "left"): {0: "a", 2: np.nan}, + ("col1", "right"): {0: "c", 2: np.nan}, + ("col3", "left"): {0: np.nan, 2: 3.0}, + ("col3", "right"): {0: np.nan, 2: np.nan}, + } + ) + tm.assert_frame_equal(comp, expected) + result_suffixes = 
comp.columns.get_level_values(1).unique() + assert result_suffixes.isin(suffixes).all(), "suffixes not equal" From d9c4ca98071cec8094e18f2fafddcc35b2c21a12 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 11:12:08 -0700 Subject: [PATCH 05/26] Edited the tests --- pandas/tests/frame/methods/test_compare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 10e9a4c847e30..f51fbf11c1b30 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -190,9 +190,9 @@ def test_compare_suffixes(): ) df2 = df1.copy() df2.loc[0, "col1"] = "c" - df2.loc[2, "col3"] = 4.0 + df2.loc[2, "col3"] = np.nan - suffixes = ["left", "right"] + suffixes = ("left", "right") comp = df1.compare(df2, suffixes=suffixes) expected = pd.DataFrame( { From 1c54472c4e0c27aef3dc9f306a5063250a03997f Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 11:17:13 -0700 Subject: [PATCH 06/26] space fixing --- pandas/core/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b62f5aa088500..a8a0913d386ee 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8993,9 +8993,9 @@ def compare( other = other[mask] if not isinstance(suffixes, tuple): warnings.warn( - f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give " - "unexpected results. Provide 'suffixes' as a tuple instead. In the " - "future a 'TypeError' will be raised.", + f"Passing 'suffixes' as a {type(suffixes)}, is not supported " + "and may give unexpected results. Provide 'suffixes' as a tuple " + "instead. 
In the future a 'TypeError' will be raised.", FutureWarning, stacklevel=find_stack_level(), ) From 4d3482134ce57fa6e070d7f2860a39359d43aa77 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 12:54:39 -0700 Subject: [PATCH 07/26] Update shared_docs.py --- pandas/core/shared_docs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 039d37d70dd45..9b6c25ae80b83 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -78,7 +78,6 @@ suffixes : tuple, default ('self', 'other') Set the dataframes names in the comparison. - """ _shared_docs[ From 8fb6aa22a72fa82d9e35493c3bb699a74f9d7217 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 12:58:23 -0700 Subject: [PATCH 08/26] Update series.py --- pandas/core/series.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index ef4ea0172c505..a899facc918f5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -164,6 +164,7 @@ from pandas._typing import ( NumpySorter, NumpyValueArrayLike, + Suffixes, ) from pandas.core.frame import DataFrame @@ -3236,12 +3237,14 @@ def compare( align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, + suffixes: Suffixes = ("self", "other"), ) -> DataFrame | Series: return super().compare( other=other, align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, + suffixes = suffixes, ) def combine(self, other, func, fill_value=None) -> Series: From 1e33dea21d2967373e4940017c8e1661ec267c30 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 13:02:43 -0700 Subject: [PATCH 09/26] Update series.py --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 
a899facc918f5..8116706963bc1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3244,7 +3244,7 @@ def compare( align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, - suffixes = suffixes, + suffixes=suffixes, ) def combine(self, other, func, fill_value=None) -> Series: From ae6c75ad72c67d20308d4ab10461e66e6574e72a Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 13:46:34 -0700 Subject: [PATCH 10/26] invalid argument tests --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/generic.py | 9 +++------ pandas/core/shared_docs.py | 4 +++- pandas/tests/frame/methods/test_compare.py | 19 +++++++++---------- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index e9d5bd1ffd1e2..1cb1156da379d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -277,7 +277,7 @@ Other enhancements - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files) - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) -- :meth:`DataFrame.compare` now accepts a ``suffixes`` to allow the user to specify the suffixes of both left and right DataFrame which are being compared. This is by default ``self`` and ``other`` (:issue:`44354`) +- :meth:`DataFrame.compare` now accepts a ``suffixes`` to allow the user to specify the suffixes of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a8a0913d386ee..327e0912ca291 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8992,12 +8992,9 @@ def compare( self = self[mask] other = other[mask] if not isinstance(suffixes, tuple): - warnings.warn( - f"Passing 'suffixes' as a {type(suffixes)}, is not supported " - "and may give unexpected results. Provide 'suffixes' as a tuple " - "instead. In the future a 'TypeError' will be raised.", - FutureWarning, - stacklevel=find_stack_level(), + raise TypeError( + f"Passing 'suffixes' as a {type(suffixes)}, is not " + "supported Provide 'suffixes' as a tuple instead." ) if align_axis in (1, "columns"): # This is needed for Series diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 9b6c25ae80b83..f5b3bff521f2e 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -53,7 +53,7 @@ ] = """ Compare to another {klass} and show the differences. -.. versionadded:: 1.5.0 +.. versionadded:: 1.1.0 Parameters ---------- @@ -78,6 +78,8 @@ suffixes : tuple, default ('self', 'other') Set the dataframes names in the comparison. + + .. 
versionadded:: 1.5.0 """ _shared_docs[ diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index f51fbf11c1b30..4dbd5328a71b7 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -186,14 +186,15 @@ def test_compare_suffixes(): # GH df1 = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, - columns=["col1", "col2", "col3"], ) - df2 = df1.copy() - df2.loc[0, "col1"] = "c" - df2.loc[2, "col3"] = np.nan - - suffixes = ("left", "right") - comp = df1.compare(df2, suffixes=suffixes) + df2 = pd.DataFrame( + { + "col1": ["c", "b", "c"], + "col2": [1.0, 2.0, np.nan], + "col3": [1.0, 2.0, np.nan], + }, + ) + result = df1.compare(df2, suffixes=("left", "right")) expected = pd.DataFrame( { ("col1", "left"): {0: "a", 2: np.nan}, @@ -202,6 +203,4 @@ def test_compare_suffixes(): ("col3", "right"): {0: np.nan, 2: np.nan}, } ) - tm.assert_frame_equal(comp, expected) - result_suffixes = comp.columns.get_level_values(1).unique() - assert result_suffixes.isin(suffixes).all(), "suffixes not equal" + tm.assert_frame_equal(result, expected) From ee10dd32e69fedc453a5cad3d5b1c23dce042c86 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 13:48:44 -0700 Subject: [PATCH 11/26] issue reference --- pandas/tests/frame/methods/test_compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 4dbd5328a71b7..9ad58972621d8 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -183,7 +183,7 @@ def test_compare_unaligned_objects(): def test_compare_suffixes(): - # GH + #44354 df1 = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, ) From 077d274eefbc1eba793ef23bbff3d349ad984434 Mon Sep 17 
00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 8 Jul 2022 13:50:29 -0700 Subject: [PATCH 12/26] syntax editing --- pandas/tests/frame/methods/test_compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 9ad58972621d8..18106fa3c2496 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -183,7 +183,7 @@ def test_compare_unaligned_objects(): def test_compare_suffixes(): - #44354 + # 44354 df1 = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, ) From d0289e58d240787ac83da2bdab7ef0f780127a59 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Sun, 10 Jul 2022 08:31:40 -0700 Subject: [PATCH 13/26] grammar fixing --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/generic.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4377a867b3fce..9910c2a5e2291 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -278,7 +278,7 @@ Other enhancements - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. 
``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) -- :meth:`DataFrame.compare` now accepts a ``suffixes`` to allow the user to specify the suffixes of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) +- :meth:`DataFrame.compare` now accepts an argument ``suffixes`` to allow the user to specify the suffixes of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) - .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dcd80c0a18f97..af9979c1d2531 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8993,8 +8993,8 @@ def compare( other = other[mask] if not isinstance(suffixes, tuple): raise TypeError( - f"Passing 'suffixes' as a {type(suffixes)}, is not " - "supported Provide 'suffixes' as a tuple instead." + f"Passing 'suffixes' as a {type(suffixes)} is not " + "supported. Provide 'suffixes' as a tuple instead." 
) if align_axis in (1, "columns"): # This is needed for Series From bd45e06b3a8a458d8c502e8fbef8622a067d5a20 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Sun, 10 Jul 2022 09:20:39 -0700 Subject: [PATCH 14/26] edit doc --- doc/source/whatsnew/v1.5.0.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 9910c2a5e2291..b9bd36aa6bcac 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -279,7 +279,7 @@ Other enhancements - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) - :meth:`DataFrame.compare` now accepts an argument ``suffixes`` to allow the user to specify the suffixes of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) -- + .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: @@ -800,6 +800,7 @@ Performance improvements - Performance improvement in :class:`BusinessHour` ``str`` and ``repr`` (:issue:`44764`) - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`) +- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`) - .. 
--------------------------------------------------------------------------- @@ -858,10 +859,12 @@ Conversion - Bug in :meth:`DataFrame.to_records` returning inconsistent numpy types if the index was a :class:`MultiIndex` (:issue:`47263`) - Bug in :meth:`DataFrame.to_dict` for ``orient="list"`` or ``orient="index"`` was not returning native types (:issue:`46751`) - Bug in :meth:`DataFrame.apply` that returns a :class:`DataFrame` instead of a :class:`Series` when applied to an empty :class:`DataFrame` and ``axis=1`` (:issue:`39111`) +- Bug when inferring the dtype from an iterable that is *not* a NumPy ``ndarray`` consisting of all NumPy unsigned integer scalars did not result in an unsigned integer dtype (:issue:`47294`) Strings ^^^^^^^ - Bug in :meth:`str.startswith` and :meth:`str.endswith` when using other series as parameter _pat_. Now raises ``TypeError`` (:issue:`3485`) +- Bug in :meth:`Series.str.zfill` when strings contain leading signs, padding '0' before the sign character rather than after as ``str.zfill`` from standard library (:issue:`20868`) - Interval @@ -1048,4 +1051,4 @@ Other .. _whatsnew_150.contributors: Contributors -~~~~~~~~~~~~ +~~~~~~~~~~~~ \ No newline at end of file From a13b319860b61b2a3d068bcbb14db91703d493b4 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Sun, 10 Jul 2022 09:30:23 -0700 Subject: [PATCH 15/26] editting doc --- doc/source/getting_started/intro_tutorials/02_read_write.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index d69a48def0287..864732ea0b7ec 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -205,4 +205,4 @@ For a complete overview of the input and output possibilities from and to pandas .. 
raw:: html - + \ No newline at end of file From f32d7cf7e53b6eca4720cb17efed64b166be8ce8 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Sun, 10 Jul 2022 09:58:34 -0700 Subject: [PATCH 16/26] Update 02_read_write.rst --- doc/source/getting_started/intro_tutorials/02_read_write.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index 864732ea0b7ec..92b6e85c9c3a3 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -205,4 +205,5 @@ For a complete overview of the input and output possibilities from and to pandas .. raw:: html - \ No newline at end of file + + From 63965838454fd51eeb7a9ca871892d49791e4933 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Sun, 10 Jul 2022 09:58:44 -0700 Subject: [PATCH 17/26] Update 02_read_write.rst --- doc/source/getting_started/intro_tutorials/02_read_write.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index 92b6e85c9c3a3..d69a48def0287 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -205,5 +205,4 @@ For a complete overview of the input and output possibilities from and to pandas .. 
raw:: html - From e754e1510d3cfc077184555efc1cc1c92291f883 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Sun, 10 Jul 2022 09:59:04 -0700 Subject: [PATCH 18/26] Update v1.5.0.rst --- doc/source/whatsnew/v1.5.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b9bd36aa6bcac..d1a7f3c5b2e0d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1,5 +1,6 @@ .. _whatsnew_150: + What's new in 1.5.0 (??) ------------------------ @@ -1051,4 +1052,4 @@ Other .. _whatsnew_150.contributors: Contributors -~~~~~~~~~~~~ \ No newline at end of file +~~~~~~~~~~~~ From 8f67c9f7cedd90a08a7cb990de9ea4fd71a0d288 Mon Sep 17 00:00:00 2001 From: Ahmed Ibrahim <81244897+ahmedibrhm@users.noreply.github.com> Date: Sun, 10 Jul 2022 09:59:15 -0700 Subject: [PATCH 19/26] Update v1.5.0.rst --- doc/source/whatsnew/v1.5.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index d1a7f3c5b2e0d..7eaed2a2e3566 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1,6 +1,5 @@ .. _whatsnew_150: - What's new in 1.5.0 (??) 
------------------------ From 580773d40fc343b9b47bf8e8d816cdab3c8ff115 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Tue, 12 Jul 2022 18:43:46 -0700 Subject: [PATCH 20/26] np --- doc/source/whatsnew/v1.5.0.rst | 2 +- pandas/core/frame.py | 8 ++++---- pandas/core/generic.py | 10 +++++----- pandas/core/series.py | 4 ++-- pandas/core/shared_docs.py | 2 +- pandas/tests/frame/methods/test_compare.py | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 7eaed2a2e3566..db1a6a5eead6d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -278,7 +278,7 @@ Other enhancements - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) -- :meth:`DataFrame.compare` now accepts an argument ``suffixes`` to allow the user to specify the suffixes of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) +- :meth:`DataFrame.compare` now accepts an argument ``result_names`` to allow the user to specify the names of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) .. --------------------------------------------------------------------------- .. 
_whatsnew_150.notable_bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ffa0b46896f98..3e6fe9719c5c7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7776,9 +7776,9 @@ def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: 0 a c NaN NaN 2 NaN NaN 3.0 4.0 -Assign suffixes +Assign result_names ->>> df.compare(df2, suffixes=("left", "right")) +>>> df.compare(df2, result_names=("left", "right")) col1 col3 left right left right 0 a c NaN NaN @@ -7831,14 +7831,14 @@ def compare( align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, - suffixes: Suffixes = ("self", "other"), + result_names: Suffixes = ("self", "other"), ) -> DataFrame: return super().compare( other=other, align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, - suffixes=suffixes, + result_names=result_names, ) def combine( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bc3f18414b793..9472aaf8e935c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8971,7 +8971,7 @@ def compare( align_axis: Axis = 1, keep_shape: bool_t = False, keep_equal: bool_t = False, - suffixes: Suffixes = ("self", "other"), + result_names: Suffixes = ("self", "other"), ): from pandas.core.reshape.concat import concat @@ -8996,10 +8996,10 @@ def compare( else: self = self[mask] other = other[mask] - if not isinstance(suffixes, tuple): + if not isinstance(result_names, tuple): raise TypeError( - f"Passing 'suffixes' as a {type(suffixes)} is not " - "supported. Provide 'suffixes' as a tuple instead." + f"Passing 'result_names' as a {type(result_names)} is not " + "supported. Provide 'result_names' as a tuple instead." 
) if align_axis in (1, "columns"): # This is needed for Series @@ -9007,7 +9007,7 @@ def compare( else: axis = self._get_axis_number(align_axis) - diff = concat([self, other], axis=axis, keys=suffixes) + diff = concat([self, other], axis=axis, keys=result_names) if axis >= self.ndim: # No need to reorganize data if stacking on new axis diff --git a/pandas/core/series.py b/pandas/core/series.py index 8116706963bc1..05fc90503dbbd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3237,14 +3237,14 @@ def compare( align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, - suffixes: Suffixes = ("self", "other"), + result_names: Suffixes = ("self", "other"), ) -> DataFrame | Series: return super().compare( other=other, align_axis=align_axis, keep_shape=keep_shape, keep_equal=keep_equal, - suffixes=suffixes, + result_names=result_names, ) def combine(self, other, func, fill_value=None) -> Series: diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index f5b3bff521f2e..b7b75d6464da3 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -76,7 +76,7 @@ If true, the result keeps values that are equal. Otherwise, equal values are shown as NaNs. -suffixes : tuple, default ('self', 'other') +result_names : tuple, default ('self', 'other') Set the dataframes names in the comparison. .. 
versionadded:: 1.5.0 diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 18106fa3c2496..d9d24b0ebb7dd 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -182,7 +182,7 @@ def test_compare_unaligned_objects(): df1.compare(df2) -def test_compare_suffixes(): +def test_compare_result_names(): # 44354 df1 = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, @@ -194,7 +194,7 @@ def test_compare_suffixes(): "col3": [1.0, 2.0, np.nan], }, ) - result = df1.compare(df2, suffixes=("left", "right")) + result = df1.compare(df2, result_names=("left", "right")) expected = pd.DataFrame( { ("col1", "left"): {0: "a", 2: np.nan}, From a4fca5637f0ff16b26671a27c17ec6ab469296d0 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Wed, 13 Jul 2022 09:08:40 -0700 Subject: [PATCH 21/26] 1.5.0 rst --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index bdebf27f59b04..7bd787ff1acfb 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -278,7 +278,7 @@ Other enhancements - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`) - :class:`Series` reducers (e.g. 
``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`) - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) -- :meth:`DataFrame.compare` now accepts an argument ``result_names`` to allow the user to specify the names of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) +- :meth:`DataFrame.compare` now accepts an argument ``result_names`` to allow the user to specify the names used in the result for the left and right DataFrames being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: From bc209bb17be1e6f5a82a81016cd4244a95c2713e Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 15 Jul 2022 20:22:34 -0700 Subject: [PATCH 22/26] created tests for invalid input --- pandas/tests/frame/methods/test_compare.py | 28 +++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index d9d24b0ebb7dd..fdb1f8909041e 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -183,7 +183,7 @@ def test_compare_unaligned_objects(): def test_compare_result_names(): - # 44354 + #GH 44354 df1 = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, ) @@ -204,3 +204,29 @@ def test_compare_result_names(): } ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "result_names", + [ + [1,2], + "HK", + {"2":2,"3":3}, + 3, + 3.0 + ] +) +def
test_invalid_input_result_names(result_names): + #GH 44354 + df1 = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, + ) + df2 = pd.DataFrame( + { + "col1": ["c", "b", "c"], + "col2": [1.0, 2.0, np.nan], + "col3": [1.0, 2.0, np.nan], + }, + ) + with pytest.raises(TypeError): + df1.compare(df2, result_names=result_names) From ff014e3706e5e1eaede890d30e81840437dbfc5b Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 15 Jul 2022 20:23:49 -0700 Subject: [PATCH 23/26] space --- pandas/tests/frame/methods/test_compare.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index fdb1f8909041e..d1c06ec4f7e61 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -230,3 +230,4 @@ def test_invalid_input_result_names(result_names): ) with pytest.raises(TypeError): df1.compare(df2, result_names=result_names) + From 32d1c5e2bda00918f870e9ae85b0c56761cf7906 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 15 Jul 2022 20:24:43 -0700 Subject: [PATCH 24/26] space --- pandas/tests/frame/methods/test_compare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index d1c06ec4f7e61..52a99f23980e0 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -213,8 +213,8 @@ def test_compare_result_names(): "HK", {"2":2,"3":3}, 3, - 3.0 - ] + 3.0, + ], ) def test_invalid_input_result_names(result_names): #GH 44354 From 0daa3e831b140c7b66289efe4764b12b2a6c2c37 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 15 Jul 2022 20:39:26 -0700 Subject: [PATCH 25/26] space --- 
pandas/tests/frame/methods/test_compare.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 52a99f23980e0..3982e7191ab9f 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -183,7 +183,7 @@ def test_compare_unaligned_objects(): def test_compare_result_names(): - #GH 44354 + # GH 44354 df1 = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, ) @@ -209,15 +209,15 @@ def test_compare_result_names(): @pytest.mark.parametrize( "result_names", [ - [1,2], + [1, 2], "HK", - {"2":2,"3":3}, + {"2": 2, "3": 3}, 3, 3.0, ], ) def test_invalid_input_result_names(result_names): - #GH 44354 + # GH 44354 df1 = pd.DataFrame( {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}, ) From 9cb23b8365d65a3351e9081ac6512e7adf1ca788 Mon Sep 17 00:00:00 2001 From: ahmedibrhm <81244897+ahmedibrhm@users.noreply.github.com> Date: Fri, 15 Jul 2022 21:16:25 -0700 Subject: [PATCH 26/26] editing test --- pandas/tests/frame/methods/test_compare.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py index 3982e7191ab9f..609242db453ba 100644 --- a/pandas/tests/frame/methods/test_compare.py +++ b/pandas/tests/frame/methods/test_compare.py @@ -228,6 +228,11 @@ def test_invalid_input_result_names(result_names): "col3": [1.0, 2.0, np.nan], }, ) - with pytest.raises(TypeError): + with pytest.raises( + TypeError, + match=( + f"Passing 'result_names' as a {type(result_names)} is not " + "supported. Provide 'result_names' as a tuple instead." + ), + ): df1.compare(df2, result_names=result_names) -