diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c3ad87082c8ed..ffbf74437289b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8072,7 +8072,7 @@ def resample( 5 18 100 2018-02-11 6 17 40 2018-02-18 7 19 50 2018-02-25 - >>> df.resample('M', on='week_starting').mean() + >>> df.resample('M', on='week_starting')['price', 'volume'].mean() price volume week_starting 2018-01-31 10.75 62.5 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 07cef290c8919..8ff531fe2b5b9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1224,7 +1224,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: else: obj = self._obj_with_exclusions check = obj._get_numeric_data() - if len(obj.columns) and not len(check.columns) and not obj.empty: + if len(obj.columns) > len(check.columns) and not obj.empty: numeric_only = False # TODO: v1.4+ Add FutureWarning diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index db0190d488d42..56dcf3f9ca3f8 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -96,7 +96,7 @@ def test_in_numeric_groupby(self, data_for_grouping): "C": [1, 1, 1, 1, 1, 1, 1, 1], } ) - result = df.groupby("A").sum().columns + result = df.groupby("A").sum(numeric_only=True).columns if data_for_grouping.dtype._is_numeric: expected = pd.Index(["B", "C"]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 905b33b285625..d25a3bc8d82c4 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -1749,7 +1749,7 @@ def test_stack_multiple_bug(self): multi = df.set_index(["DATE", "ID"]) multi.columns.name = "Params" unst = multi.unstack("ID") - down = unst.resample("W-THU").mean() + down = unst.loc[:, ["VAR1"]].resample("W-THU").mean() rs = down.stack("ID") xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID") diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 339bb2c30736d..7536e1ed10425 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -333,7 +333,7 @@ def test_observed(observed, using_array_manager): gb = df.groupby(["A", "B"], observed=observed) exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) - result = gb.sum() + result = gb.sum(numeric_only=True) if not observed: expected = cartesian_product_for_groupers( expected, [cat1, cat2], list("AB"), fill_value=0 diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3ae11847cc06b..383e71e4eebdd 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -125,9 +125,17 @@ def df(self): @pytest.mark.parametrize("method", ["mean", "median"]) def test_averages(self, df, method): # mean / median - expected_columns_numeric = Index(["int", "float", "category_int"]) - - gb = df.groupby("group") + expected_column_names = [ + "int", + "float", + "category_int", + "datetime", + "datetimetz", + "timedelta", + ] + expected_columns_numeric = Index(expected_column_names) + + gb = df[["group", *expected_column_names]].groupby("group") expected = DataFrame( { "category_int": [7.5, 9], @@ -154,10 +162,7 @@ def test_averages(self, df, method): ], ) - with tm.assert_produces_warning( - FutureWarning, match="Dropping invalid", check_stacklevel=False - ): - result = getattr(gb, method)(numeric_only=False) + result = getattr(gb, method)() tm.assert_frame_equal(result.reindex_like(expected), expected) expected_columns = expected.columns @@ -205,14 +210,9 @@ def test_first_last(self, df, method): @pytest.mark.parametrize("method", ["sum", "cumsum"]) def test_sum_cumsum(self, df, method): + expected_columns = Index(["int", "float", "category_int"]) - expected_columns_numeric = Index(["int", "float", "category_int"]) - expected_columns = Index( - ["int", "float", "string", "category_int", "timedelta"] - ) - if method == "cumsum": - # cumsum loses string - expected_columns = Index(["int", "float", "category_int", "timedelta"]) + expected_columns_numeric = expected_columns self._check(df, method, expected_columns, expected_columns_numeric) @@ -231,26 +231,30 @@ def test_cummin_cummax(self, df, method): ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"] ) - # GH#15561: numeric_only=False set by default like min/max expected_columns_numeric = expected_columns self._check(df, method, expected_columns, expected_columns_numeric) def _check(self, df, method, expected_columns, expected_columns_numeric): - gb = df.groupby("group") # cummin, cummax dont have numeric_only kwarg, always use False warn = None - if method in ["cummin", "cummax"]: + if method in ["cummin", "cummax", "min", "max"]: # these dont have numeric_only kwarg, always use False warn = FutureWarning - elif method in ["min", "max"]: - # these have numeric_only kwarg, but default to False - warn = FutureWarning + df["object"] = [ + None, + "y", + "z", + ] # add a column that is non numeric and will be dropped + gb = df[["group", "object", *list(expected_columns_numeric)]].groupby( + "group" + ) + else: + gb = df[["group", *list(expected_columns_numeric)]].groupby("group") with tm.assert_produces_warning(warn, match="Dropping invalid columns"): result = getattr(gb, method)() - tm.assert_index_equal(result.columns, expected_columns_numeric) # GH#41475 deprecated silently ignoring nuisance columns @@ -258,7 +262,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): if len(expected_columns) < len(gb._obj_with_exclusions.columns): warn = FutureWarning with tm.assert_produces_warning(warn, match="Dropping invalid columns"): - result = getattr(gb, method)(numeric_only=False) + result = getattr(gb, method)() tm.assert_index_equal(result.columns, expected_columns) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 203d8abb465d0..450fa5e570fc0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -896,7 +896,7 @@ def test_keep_nuisance_agg(df, agg_function): ) def test_omit_nuisance_agg(df, agg_function): # GH 38774, GH 38815 - grouped = df.groupby("A") + grouped = df.groupby("A")["C", "D"] result = getattr(grouped, agg_function)() expected = getattr(df.loc[:, ["A", "C", "D"]].groupby("A"), agg_function)() tm.assert_frame_equal(result, expected) @@ -1126,8 +1126,8 @@ def test_groupby_with_hier_columns(): def test_grouping_ndarray(df): grouped = df.groupby(df["A"].values) - result = grouped.sum() - expected = df.groupby("A").sum() + result = grouped.sum(numeric_only=True) + expected = df.groupby("A").sum(numeric_only=True) tm.assert_frame_equal( result, expected, check_names=False ) # Note: no names when grouping by value @@ -2549,7 +2549,7 @@ def test_groupby_aggregation_numeric_with_non_numeric_dtype(): ) gb = df.groupby(by=["x"]) - result = gb.sum() + result = gb.sum(numeric_only=True) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index d4b21633309db..57219c1c32d5b 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -105,14 +105,14 @@ def test_groupby_with_timegrouper(self): ) expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64") - result1 = df.resample("5D").sum() + result1 = df.resample("5D").sum()["Quantity"].to_frame() tm.assert_frame_equal(result1, expected) df_sorted = df.sort_index() - result2 = df_sorted.groupby(Grouper(freq="5D")).sum() + result2 = df_sorted.groupby(Grouper(freq="5D")).sum()["Quantity"].to_frame() tm.assert_frame_equal(result2, expected) - result3 = df.groupby(Grouper(freq="5D")).sum() + result3 = df.groupby(Grouper(freq="5D")).sum()["Quantity"].to_frame() tm.assert_frame_equal(result3, expected) @pytest.mark.parametrize("should_sort", [True, False]) @@ -185,8 +185,7 @@ def test_timegrouper_with_reg_groups(self): ], } ).set_index(["Date", "Buyer"]) - - result = df.groupby([Grouper(freq="A"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="A"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) expected = DataFrame( @@ -201,7 +200,7 @@ def test_timegrouper_with_reg_groups(self): ], } ).set_index(["Date", "Buyer"]) - result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) df_original = DataFrame( @@ -239,10 +238,10 @@ def test_timegrouper_with_reg_groups(self): } ).set_index(["Date", "Buyer"]) - result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum(numeric_only=True) expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), @@ -258,7 +257,9 @@ def test_timegrouper_with_reg_groups(self): # passing the name df = df.reset_index() - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum( + numeric_only=True + ) tm.assert_frame_equal(result, expected) with pytest.raises(KeyError, match="'The grouper name foo is not found'"): @@ -266,9 +267,13 @@ def test_timegrouper_with_reg_groups(self): # passing the level df = df.set_index("Date") - result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum( + numeric_only=True + ) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum( + numeric_only=True + ) tm.assert_frame_equal(result, expected) with pytest.raises(ValueError, match="The level foo is not valid"): @@ -277,7 +282,9 @@ def test_timegrouper_with_reg_groups(self): # multi names df = df.copy() df["Date"] = df.index + offsets.MonthEnd(2) - result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum() + result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum( + numeric_only=True + ) expected = DataFrame( { "Buyer": "Carl Joe Mark".split(), @@ -306,18 +313,18 @@ def test_timegrouper_with_reg_groups(self): [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date" ), ) - result = df.groupby(Grouper(freq="1M")).sum() + result = df.groupby(Grouper(freq="1M")).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M")]).sum() + result = df.groupby([Grouper(freq="1M")]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) expected.index = expected.index.shift(1) assert expected.index.freq == offsets.MonthEnd() - result = df.groupby(Grouper(freq="1M", key="Date")).sum() + result = df.groupby(Grouper(freq="1M", key="Date")).sum(numeric_only=True) tm.assert_frame_equal(result, expected) - result = df.groupby([Grouper(freq="1M", key="Date")]).sum() + result = df.groupby([Grouper(freq="1M", key="Date")]).sum(numeric_only=True) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"]) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 10fabe234d218..77a9118097bf2 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -90,7 +90,7 @@ def test_groupby_resample_on_api(): expected = df.set_index("dates").groupby("key").resample("D").mean() - result = df.groupby("key").resample("D", on="dates").mean() + result = df.groupby("key").resample("D", on="dates").mean()["values"].to_frame() tm.assert_frame_equal(result, expected) @@ -169,7 +169,7 @@ def tests_skip_nuisance(test_frame): tm.assert_frame_equal(result, expected) expected = r[["A", "B", "C"]].sum() - result = r.sum() + result = r.sum()[["A", "B", "C"]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 594b6b44aafa1..ef9a6c566322a 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -401,7 +401,7 @@ def test_resample_groupby_agg(): df["date"] = pd.to_datetime(df["date"]) resampled = df.groupby("cat").resample("Y", on="date") - expected = resampled.sum() + expected = resampled.sum()["num"].to_frame() result = resampled.agg({"num": "sum"}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 48a55022aa484..147ffadf69b09 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -559,7 +559,7 @@ def test_mixed_type_join_with_suffix(self): df.insert(5, "dt", "foo") grouped = df.groupby("id") - mn = grouped.mean() + mn = grouped.mean(numeric_only=True) cn = grouped.count() # it works! diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 88607f4b036a0..ab7d58a3fc72f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -898,14 +898,18 @@ def _check_output( # to help with a buglet self.data.columns = [k * 2 for k in self.data.columns] - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean) + table = self.data[["AA", "DD", "EE", "FF"]].pivot_table( + index=["AA"], margins=True, aggfunc=np.mean + ) for value_col in table.columns: - totals = table.loc[("All", ""), value_col] + totals = table.loc[("All"), value_col] assert totals == self.data[value_col].mean() - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") + table = self.data[["AA", "DD", "EE", "FF"]].pivot_table( + index=["AA"], margins=True, aggfunc="mean" + ) for item in ["DD", "EE", "FF"]: - totals = table.loc[("All", ""), item] + totals = table.loc[("All"), item] assert totals == self.data[item].mean() @pytest.mark.parametrize( @@ -959,7 +963,9 @@ def test_margin_with_only_columns_defined( } ) - result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) + result = df[["A", "B", "D", "E"]].pivot_table( + columns=columns, margins=True, aggfunc=aggfunc + ) expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns) tm.assert_frame_equal(result, expected) @@ -1984,8 +1990,12 @@ def test_pivot_string_as_func(self): def test_pivot_string_func_vs_func(self, f, f_numpy): # GH #18713 # for consistency purposes - result = pivot_table(self.data, index="A", columns="B", aggfunc=f) - expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy) + result = pivot_table( + self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f + ) + expected = pivot_table( + self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f_numpy + ) tm.assert_frame_equal(result, expected) @pytest.mark.slow