From b414a2a1469e30c6d78af92141a4973609bd8f24 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Fri, 22 Oct 2021 18:12:18 +1100 Subject: [PATCH 01/22] Update groupby.py Resolves issue #44132. A column of strings was being excluded from the result of .sum(). The cause is the check that decides whether to fall back to numeric_only=False: it only did so when the group being summed had no numeric columns at all, so a column of strings (which can also be summed) was dropped whenever at least one numeric column was present. --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9ca05e05fc09a..7eeb511fb40ad 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1217,7 +1217,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: else: obj = self._obj_with_exclusions check = obj._get_numeric_data() - if len(obj.columns) and not len(check.columns) and not obj.empty: + if len(obj.columns) > len(check.columns) and not obj.empty: numeric_only = False # TODO: v1.4+ Add FutureWarning From 2a4d5c89940ee240be7332959a5a0d5f71cf62c2 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sat, 23 Oct 2021 04:40:07 +1100 Subject: [PATCH 02/22] Fix tests failing due to bug fix Fix tests to consider only columns that are in the expected results and ignore new columns produced by bug fix. 
--- pandas/core/generic.py | 2 +- pandas/tests/groupby/test_groupby.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c3ad87082c8ed..ffbf74437289b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8072,7 +8072,7 @@ def resample( 5 18 100 2018-02-11 6 17 40 2018-02-18 7 19 50 2018-02-25 - >>> df.resample('M', on='week_starting').mean() + >>> df.resample('M', on='week_starting')['price', 'volume'].mean() price volume week_starting 2018-01-31 10.75 62.5 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 203d8abb465d0..578979643ca7d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -896,7 +896,7 @@ def test_keep_nuisance_agg(df, agg_function): ) def test_omit_nuisance_agg(df, agg_function): # GH 38774, GH 38815 - grouped = df.groupby("A") + grouped = df.groupby("A")["C", "D"] result = getattr(grouped, agg_function)() expected = getattr(df.loc[:, ["A", "C", "D"]].groupby("A"), agg_function)() tm.assert_frame_equal(result, expected) From 5548bd674552effd06aa67959dc6cfb095a2bedd Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sat, 23 Oct 2021 10:30:06 +1100 Subject: [PATCH 03/22] Fix tests failing due to bug fix Fix tests to consider only columns that are in the expected results and ignore new columns produced by bug fix. 
--- pandas/tests/extension/base/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index db0190d488d42..56dcf3f9ca3f8 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -96,7 +96,7 @@ def test_in_numeric_groupby(self, data_for_grouping): "C": [1, 1, 1, 1, 1, 1, 1, 1], } ) - result = df.groupby("A").sum().columns + result = df.groupby("A").sum(numeric_only=True).columns if data_for_grouping.dtype._is_numeric: expected = pd.Index(["B", "C"]) From 68daf0b2ae2f8de8fde66868d15af3b3d147eceb Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sat, 23 Oct 2021 12:25:52 +1100 Subject: [PATCH 04/22] Fix tests failing due to bug fix Fix tests to consider only columns that are in the expected results and ignore new columns produced by bug fix. --- pandas/tests/groupby/test_groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 578979643ca7d..e81cb3f5efe9c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1126,8 +1126,8 @@ def test_groupby_with_hier_columns(): def test_grouping_ndarray(df): grouped = df.groupby(df["A"].values) - result = grouped.sum() - expected = df.groupby("A").sum() + result = grouped.sum(numeric_only=True) + expected = df.groupby("A").sum(numeric_only=True) tm.assert_frame_equal( result, expected, check_names=False ) # Note: no names when grouping by value From 1076ec7d994281bd1dcb1043e9fd38ea410f29da Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sat, 23 Oct 2021 15:30:06 +1100 Subject: [PATCH 05/22] Fix tests failing due to bug fix Fix tests to consider only columns that are in the expected results and ignore new columns produced by bug fix. 
--- pandas/tests/groupby/test_categorical.py | 2 +- pandas/tests/groupby/test_function.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 339bb2c30736d..7536e1ed10425 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -333,7 +333,7 @@ def test_observed(observed, using_array_manager): gb = df.groupby(["A", "B"], observed=observed) exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) - result = gb.sum() + result = gb.sum(numeric_only=True) if not observed: expected = cartesian_product_for_groupers( expected, [cat1, cat2], list("AB"), fill_value=0 diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3ae11847cc06b..5725efa129bb4 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -125,7 +125,14 @@ def df(self): @pytest.mark.parametrize("method", ["mean", "median"]) def test_averages(self, df, method): # mean / median - expected_columns_numeric = Index(["int", "float", "category_int"]) + expected_columns_numeric = Index([ + "int", + "float", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ]) gb = df.groupby("group") expected = DataFrame( From de09ba222b10b9411d34044f8d8af0bedecbbf3a Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sat, 23 Oct 2021 16:40:13 +1100 Subject: [PATCH 06/22] Fix tests failing due to bug fix Fix tests to consider only columns that are in the expected results and ignore new columns produced by bug fix. 
--- pandas/tests/groupby/test_function.py | 6 ++++-- pandas/tests/groupby/test_timegrouper.py | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 5725efa129bb4..23d2277e65aab 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -125,14 +125,16 @@ def df(self): @pytest.mark.parametrize("method", ["mean", "median"]) def test_averages(self, df, method): # mean / median - expected_columns_numeric = Index([ + expected_columns_numeric = Index( + [ "int", "float", "category_int", "datetime", "datetimetz", "timedelta", - ]) + ] + ) gb = df.groupby("group") expected = DataFrame( diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index d4b21633309db..9596a387adba9 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -105,14 +105,14 @@ def test_groupby_with_timegrouper(self): ) expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64") - result1 = df.resample("5D").sum() + result1 = df.resample("5D").sum()['Quantity'].to_frame() tm.assert_frame_equal(result1, expected) df_sorted = df.sort_index() - result2 = df_sorted.groupby(Grouper(freq="5D")).sum() + result2 = df_sorted.groupby(Grouper(freq="5D")).sum()['Quantity'].to_frame() tm.assert_frame_equal(result2, expected) - result3 = df.groupby(Grouper(freq="5D")).sum() + result3 = df.groupby(Grouper(freq="5D")).sum()['Quantity'].to_frame() tm.assert_frame_equal(result3, expected) @pytest.mark.parametrize("should_sort", [True, False]) From 3b66057db47da8c67796259de23ca486b6953b00 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 07:50:37 +1100 Subject: [PATCH 07/22] Fix tests failing due to bug fix --- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 25 +++++++++---------- 
pandas/tests/resample/test_resample_api.py | 4 +-- .../tests/resample/test_resampler_grouper.py | 2 +- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e81cb3f5efe9c..450fa5e570fc0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2549,7 +2549,7 @@ def test_groupby_aggregation_numeric_with_non_numeric_dtype(): ) gb = df.groupby(by=["x"]) - result = gb.sum() + result = gb.sum(numeric_only=True) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 9596a387adba9..5abd51528fb5c 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -185,8 +185,7 @@ def test_timegrouper_with_reg_groups(self): ], } ).set_index(["Date", "Buyer"]) - - result = df.groupby([Grouper(freq="A"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="A"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) expected = DataFrame( @@ -201,7 +200,7 @@ def test_timegrouper_with_reg_groups(self): ], } ).set_index(["Date", "Buyer"]) - result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) df_original = DataFrame( @@ -239,10 +238,10 @@ def test_timegrouper_with_reg_groups(self): } ).set_index(["Date", "Buyer"]) - result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum(numeric_only=True) expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), @@ -258,7 +257,7 @@ def test_timegrouper_with_reg_groups(self): # passing the name df = df.reset_index() - result = 
df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match="'The grouper name foo is not found'"): @@ -266,9 +265,9 @@ def test_timegrouper_with_reg_groups(self): # passing the level df = df.set_index("Date") - result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match="The level foo is not valid"): @@ -277,7 +276,7 @@ def test_timegrouper_with_reg_groups(self): # multi names df = df.copy() df["Date"] = df.index + offsets.MonthEnd(2) - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum(numeric_only=True) expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), @@ -306,18 +305,18 @@ def test_timegrouper_with_reg_groups(self): [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date" ), ) - result = df.groupby(Grouper(freq="1M")).sum() + result = df.groupby(Grouper(freq="1M")).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M")]).sum() + result = df.groupby([Grouper(freq="1M")]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) expected.index = expected.index.shift(1) assert expected.index.freq == offsets.MonthEnd() - result = df.groupby(Grouper(freq="1M", key="Date")).sum() + result = df.groupby(Grouper(freq="1M", key="Date")).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", key="Date")]).sum() + result = 
df.groupby([Grouper(freq="1M", key="Date")]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"]) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 10fabe234d218..77a9118097bf2 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -90,7 +90,7 @@ def test_groupby_resample_on_api(): expected = df.set_index("dates").groupby("key").resample("D").mean() - result = df.groupby("key").resample("D", on="dates").mean() + result = df.groupby("key").resample("D", on="dates").mean()["values"].to_frame() tm.assert_frame_equal(result, expected) @@ -169,7 +169,7 @@ def tests_skip_nuisance(test_frame): tm.assert_frame_equal(result, expected) expected = r[["A", "B", "C"]].sum() - result = r.sum() + result = r.sum()[["A", "B", "C"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 594b6b44aafa1..ef9a6c566322a 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -401,7 +401,7 @@ def test_resample_groupby_agg(): df["date"] = pd.to_datetime(df["date"]) resampled = df.groupby("cat").resample("Y", on="date") - expected = resampled.sum() + expected = resampled.sum()["num"].to_frame() result = resampled.agg({"num": "sum"}) tm.assert_frame_equal(result, expected) From af94515d678659a7490f1690b7baefbc4f24f9ae Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 12:57:29 +1100 Subject: [PATCH 08/22] Fix tests failing due to bug fix --- pandas/tests/groupby/test_timegrouper.py | 16 ++++++++++++---- pandas/tests/reshape/test_pivot.py | 18 +++++++++++++++--- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_timegrouper.py 
b/pandas/tests/groupby/test_timegrouper.py index 5abd51528fb5c..7648bcc73bb20 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -257,7 +257,9 @@ def test_timegrouper_with_reg_groups(self): # passing the name df = df.reset_index() - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum(numeric_only=True) + result = df.groupby([ + Grouper(freq="1M", key="Date"), "Buyer" + ]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match="'The grouper name foo is not found'"): @@ -265,9 +267,13 @@ def test_timegrouper_with_reg_groups(self): # passing the level df = df.set_index("Date") - result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum(numeric_only=True) + result = df.groupby([ + Grouper(freq="1M", level="Date"), "Buyer" + ]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum(numeric_only=True) + result = df.groupby([ + Grouper(freq="1M", level=0), "Buyer" + ]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match="The level foo is not valid"): @@ -276,7 +282,9 @@ def test_timegrouper_with_reg_groups(self): # multi names df = df.copy() df["Date"] = df.index + offsets.MonthEnd(2) - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum(numeric_only=True) + result = df.groupby([ + Grouper(freq="1M", key="Date"), "Buyer" + ]).sum(numeric_only=True) expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 88607f4b036a0..6b695e6a531d8 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -959,7 +959,11 @@ def test_margin_with_only_columns_defined( } ) - result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) + result = df[["A","B","D", "E"]].pivot_table( + columns=columns, + 
margins=True, + aggfunc=aggfunc + ) expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns) tm.assert_frame_equal(result, expected) @@ -1984,8 +1988,16 @@ def test_pivot_string_as_func(self): def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes - result = pivot_table(self.data, index="A", columns="B", aggfunc=f) - expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy) + result = pivot_table(self.data[["D","E","F"]], + index="D", + columns="E", + aggfunc=f + ) + expected = pivot_table(self.data[["D","E","F"]], + index="D", + columns="E", + aggfunc=f_numpy + ) tm.assert_frame_equal(result, expected) @pytest.mark.slow From d0e033440b95327f170333b050111a09abf3b4eb Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 13:12:33 +1100 Subject: [PATCH 09/22] Fix lint errors --- pandas/tests/groupby/test_timegrouper.py | 16 ++++++++-------- pandas/tests/reshape/test_pivot.py | 8 +++++--- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 7648bcc73bb20..7ff8cee59dea6 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -258,8 +258,8 @@ def test_timegrouper_with_reg_groups(self): # passing the name df = df.reset_index() result = df.groupby([ - Grouper(freq="1M", key="Date"), "Buyer" - ]).sum(numeric_only=True) + Grouper(freq="1M", key="Date"), "Buyer" + ]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match="'The grouper name foo is not found'"): @@ -268,12 +268,12 @@ def test_timegrouper_with_reg_groups(self): # passing the level df = df.set_index("Date") result = df.groupby([ - Grouper(freq="1M", level="Date"), "Buyer" - ]).sum(numeric_only=True) + Grouper(freq="1M", level="Date"), "Buyer" + ]).sum(numeric_only=True) 
tm.assert_frame_equal(result, expected) result = df.groupby([ - Grouper(freq="1M", level=0), "Buyer" - ]).sum(numeric_only=True) + Grouper(freq="1M", level=0), "Buyer" + ]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match="The level foo is not valid"): @@ -283,8 +283,8 @@ def test_timegrouper_with_reg_groups(self): df = df.copy() df["Date"] = df.index + offsets.MonthEnd(2) result = df.groupby([ - Grouper(freq="1M", key="Date"), "Buyer" - ]).sum(numeric_only=True) + Grouper(freq="1M", key="Date"), "Buyer" + ]).sum(numeric_only=True) expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 6b695e6a531d8..b5d100809ea50 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -959,7 +959,7 @@ def test_margin_with_only_columns_defined( } ) - result = df[["A","B","D", "E"]].pivot_table( + result = df[["A", "B", "D", "E"]].pivot_table( columns=columns, margins=True, aggfunc=aggfunc @@ -1988,12 +1988,14 @@ def test_pivot_string_as_func(self): def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes - result = pivot_table(self.data[["D","E","F"]], + result = pivot_table( + self.data[["D","E","F"]], index="D", columns="E", aggfunc=f ) - expected = pivot_table(self.data[["D","E","F"]], + expected = pivot_table( + self.data[["D","E","F"]], index="D", columns="E", aggfunc=f_numpy From 16d5a9fe3ec7b71c23c13f48c5f0073d2d258c08 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 13:23:28 +1100 Subject: [PATCH 10/22] Fix lint errors --- pandas/tests/reshape/test_pivot.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index b5d100809ea50..58b367d55a9dc 100644 --- 
a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -960,10 +960,9 @@ def test_margin_with_only_columns_defined( ) result = df[["A", "B", "D", "E"]].pivot_table( - columns=columns, - margins=True, - aggfunc=aggfunc - ) + columns=columns, + margins=True, + aggfunc=aggfunc) expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns) tm.assert_frame_equal(result, expected) @@ -1989,17 +1988,17 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes result = pivot_table( - self.data[["D","E","F"]], - index="D", - columns="E", - aggfunc=f - ) - expected = pivot_table( - self.data[["D","E","F"]], - index="D", - columns="E", - aggfunc=f_numpy - ) + self.data[["D","E","F"]], + index="D", + columns="E", + aggfunc=f + ) + expected = pivot_table( + self.data[["D","E","F"]], + index="D", + columns="E", + aggfunc=f_numpy + ) tm.assert_frame_equal(result, expected) @pytest.mark.slow From 1c38592403159db41ac7d663668a5d3acd703561 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 13:53:26 +1100 Subject: [PATCH 11/22] Fix lint errors --- pandas/tests/reshape/test_pivot.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 58b367d55a9dc..50363f40c1931 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1988,16 +1988,10 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes result = pivot_table( - self.data[["D","E","F"]], - index="D", - columns="E", - aggfunc=f + self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f ) expected = pivot_table( - self.data[["D","E","F"]], - index="D", - columns="E", - aggfunc=f_numpy + self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f_numpy ) tm.assert_frame_equal(result, expected) From 
8e04a0cf487acf23e169d216e52ec17f54665a69 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 14:08:25 +1100 Subject: [PATCH 12/22] Fix lint errors --- pandas/tests/groupby/test_timegrouper.py | 18 +++++++++--------- pandas/tests/reshape/test_pivot.py | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 7ff8cee59dea6..26f1267f04f99 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -105,14 +105,14 @@ def test_groupby_with_timegrouper(self): ) expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64") - result1 = df.resample("5D").sum()['Quantity'].to_frame() + result1 = df.resample("5D").sum()["Quantity"].to_frame() tm.assert_frame_equal(result1, expected) df_sorted = df.sort_index() - result2 = df_sorted.groupby(Grouper(freq="5D")).sum()['Quantity'].to_frame() + result2 = df_sorted.groupby(Grouper(freq="5D")).sum()["Quantity"].to_frame() tm.assert_frame_equal(result2, expected) - result3 = df.groupby(Grouper(freq="5D")).sum()['Quantity'].to_frame() + result3 = df.groupby(Grouper(freq="5D")).sum()["Quantity"].to_frame() tm.assert_frame_equal(result3, expected) @pytest.mark.parametrize("should_sort", [True, False]) @@ -267,13 +267,13 @@ def test_timegrouper_with_reg_groups(self): # passing the level df = df.set_index("Date") - result = df.groupby([ - Grouper(freq="1M", level="Date"), "Buyer" - ]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum( + numeric_only=True + ) tm.assert_frame_equal(result, expected) - result = df.groupby([ - Grouper(freq="1M", level=0), "Buyer" - ]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum( + numeric_only=True + ) tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match="The level foo is not valid"): diff 
--git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 50363f40c1931..228dce0bfa649 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -960,8 +960,8 @@ def test_margin_with_only_columns_defined( ) result = df[["A", "B", "D", "E"]].pivot_table( - columns=columns, - margins=True, + columns=columns, + margins=True, aggfunc=aggfunc) expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns) From 49f8efe6a7fb952ef601c88a8bb2fec5946fb1b0 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 14:46:11 +1100 Subject: [PATCH 13/22] Fix lint errors --- pandas/tests/reshape/test_pivot.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 228dce0bfa649..f981f2e6c3147 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1988,11 +1988,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes result = pivot_table( - self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f - ) - expected = pivot_table( - self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f_numpy - ) + self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f + ) + expected = pivot_table( + self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f_numpy + ) tm.assert_frame_equal(result, expected) @pytest.mark.slow From 8461a285cb01a932f544d06fbc3cd5550c31fb0c Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 14:59:55 +1100 Subject: [PATCH 14/22] Fix lint errors --- pandas/tests/reshape/test_pivot.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f981f2e6c3147..7451208fe49c9 100644 --- 
a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1988,11 +1988,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes result = pivot_table( - self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f - ) + self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f + ) expected = pivot_table( - self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f_numpy - ) + self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f_numpy + ) tm.assert_frame_equal(result, expected) @pytest.mark.slow From 6d54981ee6adf06a7cfdcb3b42da1855e5cb1615 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 15:44:00 +1100 Subject: [PATCH 15/22] Fix lint errors --- pandas/tests/groupby/test_timegrouper.py | 12 ++++++------ pandas/tests/reshape/test_pivot.py | 5 ++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 26f1267f04f99..57219c1c32d5b 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -257,9 +257,9 @@ def test_timegrouper_with_reg_groups(self): # passing the name df = df.reset_index() - result = df.groupby([ - Grouper(freq="1M", key="Date"), "Buyer" - ]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum( + numeric_only=True + ) tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match="'The grouper name foo is not found'"): @@ -282,9 +282,9 @@ def test_timegrouper_with_reg_groups(self): # multi names df = df.copy() df["Date"] = df.index + offsets.MonthEnd(2) - result = df.groupby([ - Grouper(freq="1M", key="Date"), "Buyer" - ]).sum(numeric_only=True) + result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum( + numeric_only=True + ) expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), diff --git 
a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 7451208fe49c9..2775414ba90b6 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -960,9 +960,8 @@ def test_margin_with_only_columns_defined( ) result = df[["A", "B", "D", "E"]].pivot_table( - columns=columns, - margins=True, - aggfunc=aggfunc) + columns=columns, margins=True, aggfunc=aggfunc + ) expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns) tm.assert_frame_equal(result, expected) From 4ca35caf42fb0dd4f6a8f1eb58d42090066a2d09 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 17:01:27 +1100 Subject: [PATCH 16/22] Remove "FutureWarning: Dropping invalid columns" --- pandas/tests/frame/test_stack_unstack.py | 2 +- pandas/tests/reshape/merge/test_join.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 905b33b285625..d25a3bc8d82c4 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1749,7 +1749,7 @@ def test_stack_multiple_bug(self): multi = df.set_index(["DATE", "ID"]) multi.columns.name = "Params" unst = multi.unstack("ID") - down = unst.resample("W-THU").mean() + down = unst.loc[:, ["VAR1"]].resample("W-THU").mean() rs = down.stack("ID") xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID") diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 48a55022aa484..147ffadf69b09 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -559,7 +559,7 @@ def test_mixed_type_join_with_suffix(self): df.insert(5, "dt", "foo") grouped = df.groupby("id") - mn = grouped.mean() + mn = grouped.mean(numeric_only=True) cn = grouped.count() # it works! 
From 1d1b143f5ce00555281e10b7c07b5393a536c7f7 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 18:48:43 +1100 Subject: [PATCH 17/22] Remove "FutureWarning: Dropping invalid columns" --- pandas/tests/groupby/test_function.py | 31 +++++++++++---------------- pandas/tests/reshape/test_pivot.py | 9 ++++---- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 23d2277e65aab..d080c59336989 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -125,18 +125,17 @@ def df(self): @pytest.mark.parametrize("method", ["mean", "median"]) def test_averages(self, df, method): # mean / median - expected_columns_numeric = Index( - [ - "int", - "float", - "category_int", - "datetime", - "datetimetz", - "timedelta", - ] - ) - - gb = df.groupby("group") + expected_column_names = [ + "int", + "float", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ] + expected_columns_numeric = Index(expected_column_names) + + gb = df[["group", *expected_column_names]].groupby("group") expected = DataFrame( { "category_int": [7.5, 9], @@ -163,10 +162,7 @@ def test_averages(self, df, method): ], ) - with tm.assert_produces_warning( - FutureWarning, match="Dropping invalid", check_stacklevel=False - ): - result = getattr(gb, method)(numeric_only=False) + result = getattr(gb, method)() tm.assert_frame_equal(result.reindex_like(expected), expected) expected_columns = expected.columns @@ -246,7 +242,7 @@ def test_cummin_cummax(self, df, method): self._check(df, method, expected_columns, expected_columns_numeric) def _check(self, df, method, expected_columns, expected_columns_numeric): - gb = df.groupby("group") + gb = df[["group", *list(expected_columns_numeric)]].groupby("group") # cummin, cummax dont have numeric_only kwarg, always use False warn = None @@ -259,7 +255,6 @@ def _check(self, df, 
method, expected_columns, expected_columns_numeric): with tm.assert_produces_warning(warn, match="Dropping invalid columns"): result = getattr(gb, method)() - tm.assert_index_equal(result.columns, expected_columns_numeric) # GH#41475 deprecated silently ignoring nuisance columns diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 2775414ba90b6..dbe26016f45c5 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -898,14 +898,15 @@ def _check_output( # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) + table = self.data[["AA", "DD", "EE", "FF"]].pivot_table(index=["AA"], margins=True, aggfunc=np.mean) for value_col in table.columns: - totals = table.loc[("All", ""), value_col] + totals = table.loc[("All"), value_col] assert totals == self.data[value_col].mean() - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") +# table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") + table = self.data[["AA", "DD", "EE", "FF"]].pivot_table(index=["AA"], margins=True, aggfunc="mean") for item in ["DD", "EE", "FF"]: - totals = table.loc[("All", ""), item] + totals = table.loc[("All"), item] assert totals == self.data[item].mean() @pytest.mark.parametrize( From 7ce804136fa3ee8a1427c3f401313f39eb56fc34 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Sun, 24 Oct 2021 21:14:26 +1100 Subject: [PATCH 18/22] Remove "FutureWarning: Dropping invalid columns" --- pandas/tests/groupby/test_function.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index d080c59336989..c287d5a9cde56 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -210,14 +210,11 @@ 
def test_first_last(self, df, method): @pytest.mark.parametrize("method", ["sum", "cumsum"]) def test_sum_cumsum(self, df, method): - - expected_columns_numeric = Index(["int", "float", "category_int"]) expected_columns = Index( - ["int", "float", "string", "category_int", "timedelta"] + ["int", "float", "category_int"] ) - if method == "cumsum": - # cumsum loses string - expected_columns = Index(["int", "float", "category_int", "timedelta"]) + + expected_columns_numeric = expected_columns self._check(df, method, expected_columns, expected_columns_numeric) @@ -236,7 +233,6 @@ def test_cummin_cummax(self, df, method): ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"] ) - # GH#15561: numeric_only=False set by default like min/max expected_columns_numeric = expected_columns self._check(df, method, expected_columns, expected_columns_numeric) @@ -253,16 +249,14 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): # these have numeric_only kwarg, but default to False warn = FutureWarning - with tm.assert_produces_warning(warn, match="Dropping invalid columns"): - result = getattr(gb, method)() + result = getattr(gb, method)() tm.assert_index_equal(result.columns, expected_columns_numeric) # GH#41475 deprecated silently ignoring nuisance columns warn = None if len(expected_columns) < len(gb._obj_with_exclusions.columns): warn = FutureWarning - with tm.assert_produces_warning(warn, match="Dropping invalid columns"): - result = getattr(gb, method)(numeric_only=False) + result = getattr(gb, method)() tm.assert_index_equal(result.columns, expected_columns) From c30a59c456d64d4e1893ec557ad3c8cb9b8e2dd6 Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Tue, 9 Nov 2021 15:21:39 +1100 Subject: [PATCH 19/22] Fix Lint errors --- pandas/tests/groupby/test_function.py | 4 +--- pandas/tests/reshape/test_pivot.py | 9 ++++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git 
a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index c287d5a9cde56..30f2d2fe2ab81 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -210,9 +210,7 @@ def test_first_last(self, df, method): @pytest.mark.parametrize("method", ["sum", "cumsum"]) def test_sum_cumsum(self, df, method): - expected_columns = Index( - ["int", "float", "category_int"] - ) + expected_columns = Index(["int", "float", "category_int"]) expected_columns_numeric = expected_columns diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index dbe26016f45c5..ab7d58a3fc72f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -898,13 +898,16 @@ def _check_output( # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] - table = self.data[["AA", "DD", "EE", "FF"]].pivot_table(index=["AA"], margins=True, aggfunc=np.mean) + table = self.data[["AA", "DD", "EE", "FF"]].pivot_table( + index=["AA"], margins=True, aggfunc=np.mean + ) for value_col in table.columns: totals = table.loc[("All"), value_col] assert totals == self.data[value_col].mean() -# table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") - table = self.data[["AA", "DD", "EE", "FF"]].pivot_table(index=["AA"], margins=True, aggfunc="mean") + table = self.data[["AA", "DD", "EE", "FF"]].pivot_table( + index=["AA"], margins=True, aggfunc="mean" + ) for item in ["DD", "EE", "FF"]: totals = table.loc[("All"), item] assert totals == self.data[item].mean() From c012cd10184aedb91e583c3782cd47835db41ffa Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Wed, 10 Nov 2021 15:24:23 +1100 Subject: [PATCH 20/22] Fix lint warning --- pandas/tests/groupby/test_function.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_function.py 
b/pandas/tests/groupby/test_function.py index 30f2d2fe2ab81..4185fe4aa7252 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -236,25 +236,27 @@ def test_cummin_cummax(self, df, method): self._check(df, method, expected_columns, expected_columns_numeric) def _check(self, df, method, expected_columns, expected_columns_numeric): - gb = df[["group", *list(expected_columns_numeric)]].groupby("group") # cummin, cummax dont have numeric_only kwarg, always use False warn = None - if method in ["cummin", "cummax"]: + if method in ["cummin", "cummax", "min", "max"]: # these dont have numeric_only kwarg, always use False warn = FutureWarning - elif method in ["min", "max"]: - # these have numeric_only kwarg, but default to False - warn = FutureWarning + df["object"] = [None,'y','z'] # add a column that is non numeric and will be dropped + gb = df[["group", "object", *list(expected_columns_numeric)]].groupby("group") + else: + gb = df[["group", *list(expected_columns_numeric)]].groupby("group") - result = getattr(gb, method)() + with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + result = getattr(gb, method)() tm.assert_index_equal(result.columns, expected_columns_numeric) # GH#41475 deprecated silently ignoring nuisance columns warn = None if len(expected_columns) < len(gb._obj_with_exclusions.columns): warn = FutureWarning - result = getattr(gb, method)() + with tm.assert_produces_warning(warn, match="Dropping invalid columns"): + result = getattr(gb, method)() tm.assert_index_equal(result.columns, expected_columns) From fa6617603f6f4c4fefa751319b6c98ac0785597e Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Wed, 10 Nov 2021 15:39:36 +1100 Subject: [PATCH 21/22] Fix lint errors --- pandas/tests/groupby/test_function.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_function.py 
b/pandas/tests/groupby/test_function.py index 4185fe4aa7252..a7a6747cd0b54 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -242,8 +242,14 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): if method in ["cummin", "cummax", "min", "max"]: # these dont have numeric_only kwarg, always use False warn = FutureWarning - df["object"] = [None,'y','z'] # add a column that is non numeric and will be dropped - gb = df[["group", "object", *list(expected_columns_numeric)]].groupby("group") + df["object"] = [ + None, + "y", + "z", + ] # add a column that is non numeric and will be dropped + gb = df[["group", "object", *list(expected_columns_numeric)]].groupby( + "group" + ) else: gb = df[["group", *list(expected_columns_numeric)]].groupby("group") From 91b6aa1b8f1ee1f91b90e99fe7543ed86d267c4f Mon Sep 17 00:00:00 2001 From: ikramersh <91307258+ikramersh@users.noreply.github.com> Date: Wed, 10 Nov 2021 15:52:42 +1100 Subject: [PATCH 22/22] Fix lint error --- pandas/tests/groupby/test_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index a7a6747cd0b54..383e71e4eebdd 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -246,7 +246,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): None, "y", "z", - ] # add a column that is non numeric and will be dropped + ] # add a column that is non numeric and will be dropped gb = df[["group", "object", *list(expected_columns_numeric)]].groupby( "group" )