Skip to content

Fixes Issues #44132, #40148, #29033, #22275, #18869: groupby #44142

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8072,7 +8072,7 @@ def resample(
5 18 100 2018-02-11
6 17 40 2018-02-18
7 19 50 2018-02-25
>>> df.resample('M', on='week_starting').mean()
>>> df.resample('M', on='week_starting')['price', 'volume'].mean()
price volume
week_starting
2018-01-31 10.75 62.5
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1224,7 +1224,7 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool:
else:
obj = self._obj_with_exclusions
check = obj._get_numeric_data()
if len(obj.columns) and not len(check.columns) and not obj.empty:
if len(obj.columns) > len(check.columns) and not obj.empty:
numeric_only = False
# TODO: v1.4+ Add FutureWarning

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def test_in_numeric_groupby(self, data_for_grouping):
"C": [1, 1, 1, 1, 1, 1, 1, 1],
}
)
result = df.groupby("A").sum().columns
result = df.groupby("A").sum(numeric_only=True).columns

if data_for_grouping.dtype._is_numeric:
expected = pd.Index(["B", "C"])
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1749,7 +1749,7 @@ def test_stack_multiple_bug(self):
multi = df.set_index(["DATE", "ID"])
multi.columns.name = "Params"
unst = multi.unstack("ID")
down = unst.resample("W-THU").mean()
down = unst.loc[:, ["VAR1"]].resample("W-THU").mean()

rs = down.stack("ID")
xp = unst.loc[:, ["VAR1"]].resample("W-THU").mean().stack("ID")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ def test_observed(observed, using_array_manager):
gb = df.groupby(["A", "B"], observed=observed)
exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
result = gb.sum()
result = gb.sum(numeric_only=True)
if not observed:
expected = cartesian_product_for_groupers(
expected, [cat1, cat2], list("AB"), fill_value=0
Expand Down
48 changes: 26 additions & 22 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,17 @@ def df(self):
@pytest.mark.parametrize("method", ["mean", "median"])
def test_averages(self, df, method):
# mean / median
expected_columns_numeric = Index(["int", "float", "category_int"])

gb = df.groupby("group")
expected_column_names = [
"int",
"float",
"category_int",
"datetime",
"datetimetz",
"timedelta",
]
expected_columns_numeric = Index(expected_column_names)

gb = df[["group", *expected_column_names]].groupby("group")
expected = DataFrame(
{
"category_int": [7.5, 9],
Expand All @@ -154,10 +162,7 @@ def test_averages(self, df, method):
],
)

with tm.assert_produces_warning(
FutureWarning, match="Dropping invalid", check_stacklevel=False
):
result = getattr(gb, method)(numeric_only=False)
result = getattr(gb, method)()
tm.assert_frame_equal(result.reindex_like(expected), expected)

expected_columns = expected.columns
Expand Down Expand Up @@ -205,14 +210,9 @@ def test_first_last(self, df, method):

@pytest.mark.parametrize("method", ["sum", "cumsum"])
def test_sum_cumsum(self, df, method):
expected_columns = Index(["int", "float", "category_int"])

expected_columns_numeric = Index(["int", "float", "category_int"])
expected_columns = Index(
["int", "float", "string", "category_int", "timedelta"]
)
if method == "cumsum":
# cumsum loses string
expected_columns = Index(["int", "float", "category_int", "timedelta"])
expected_columns_numeric = expected_columns

self._check(df, method, expected_columns, expected_columns_numeric)

Expand All @@ -231,34 +231,38 @@ def test_cummin_cummax(self, df, method):
["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
)

# GH#15561: numeric_only=False set by default like min/max
expected_columns_numeric = expected_columns

self._check(df, method, expected_columns, expected_columns_numeric)

def _check(self, df, method, expected_columns, expected_columns_numeric):
gb = df.groupby("group")

# cummin, cummax don't have numeric_only kwarg, always use False
warn = None
if method in ["cummin", "cummax"]:
if method in ["cummin", "cummax", "min", "max"]:
# cummin/cummax don't have a numeric_only kwarg; min/max have one but it defaults to False
warn = FutureWarning
elif method in ["min", "max"]:
# these have numeric_only kwarg, but default to False
warn = FutureWarning
df["object"] = [
None,
"y",
"z",
] # add a column that is non numeric and will be dropped
gb = df[["group", "object", *list(expected_columns_numeric)]].groupby(
"group"
)
else:
gb = df[["group", *list(expected_columns_numeric)]].groupby("group")

with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
result = getattr(gb, method)()

tm.assert_index_equal(result.columns, expected_columns_numeric)

# GH#41475 deprecated silently ignoring nuisance columns
warn = None
if len(expected_columns) < len(gb._obj_with_exclusions.columns):
warn = FutureWarning
with tm.assert_produces_warning(warn, match="Dropping invalid columns"):
result = getattr(gb, method)(numeric_only=False)
result = getattr(gb, method)()

tm.assert_index_equal(result.columns, expected_columns)

Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -896,7 +896,7 @@ def test_keep_nuisance_agg(df, agg_function):
)
def test_omit_nuisance_agg(df, agg_function):
# GH 38774, GH 38815
grouped = df.groupby("A")
grouped = df.groupby("A")["C", "D"]
result = getattr(grouped, agg_function)()
expected = getattr(df.loc[:, ["A", "C", "D"]].groupby("A"), agg_function)()
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -1126,8 +1126,8 @@ def test_groupby_with_hier_columns():
def test_grouping_ndarray(df):
grouped = df.groupby(df["A"].values)

result = grouped.sum()
expected = df.groupby("A").sum()
result = grouped.sum(numeric_only=True)
expected = df.groupby("A").sum(numeric_only=True)
tm.assert_frame_equal(
result, expected, check_names=False
) # Note: no names when grouping by value
Expand Down Expand Up @@ -2549,7 +2549,7 @@ def test_groupby_aggregation_numeric_with_non_numeric_dtype():
)

gb = df.groupby(by=["x"])
result = gb.sum()
result = gb.sum(numeric_only=True)
tm.assert_frame_equal(result, expected)


Expand Down
39 changes: 23 additions & 16 deletions pandas/tests/groupby/test_timegrouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,14 @@ def test_groupby_with_timegrouper(self):
)
expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype="int64")

result1 = df.resample("5D").sum()
result1 = df.resample("5D").sum()["Quantity"].to_frame()
tm.assert_frame_equal(result1, expected)

df_sorted = df.sort_index()
result2 = df_sorted.groupby(Grouper(freq="5D")).sum()
result2 = df_sorted.groupby(Grouper(freq="5D")).sum()["Quantity"].to_frame()
tm.assert_frame_equal(result2, expected)

result3 = df.groupby(Grouper(freq="5D")).sum()
result3 = df.groupby(Grouper(freq="5D")).sum()["Quantity"].to_frame()
tm.assert_frame_equal(result3, expected)

@pytest.mark.parametrize("should_sort", [True, False])
Expand Down Expand Up @@ -185,8 +185,7 @@ def test_timegrouper_with_reg_groups(self):
],
}
).set_index(["Date", "Buyer"])

result = df.groupby([Grouper(freq="A"), "Buyer"]).sum()
result = df.groupby([Grouper(freq="A"), "Buyer"]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)

expected = DataFrame(
Expand All @@ -201,7 +200,7 @@ def test_timegrouper_with_reg_groups(self):
],
}
).set_index(["Date", "Buyer"])
result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum()
result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)

df_original = DataFrame(
Expand Down Expand Up @@ -239,10 +238,10 @@ def test_timegrouper_with_reg_groups(self):
}
).set_index(["Date", "Buyer"])

result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum()
result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)

result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum()
result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum(numeric_only=True)
expected = DataFrame(
{
"Buyer": "Carl Joe Mark".split(),
Expand All @@ -258,17 +257,23 @@ def test_timegrouper_with_reg_groups(self):

# passing the name
df = df.reset_index()
result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum()
result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum(
numeric_only=True
)
tm.assert_frame_equal(result, expected)

with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
df.groupby([Grouper(freq="1M", key="foo"), "Buyer"]).sum()

# passing the level
df = df.set_index("Date")
result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum()
result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum(
numeric_only=True
)
tm.assert_frame_equal(result, expected)
result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum()
result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum(
numeric_only=True
)
tm.assert_frame_equal(result, expected)

with pytest.raises(ValueError, match="The level foo is not valid"):
Expand All @@ -277,7 +282,9 @@ def test_timegrouper_with_reg_groups(self):
# multi names
df = df.copy()
df["Date"] = df.index + offsets.MonthEnd(2)
result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum()
result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum(
numeric_only=True
)
expected = DataFrame(
{
"Buyer": "Carl Joe Mark".split(),
Expand Down Expand Up @@ -306,18 +313,18 @@ def test_timegrouper_with_reg_groups(self):
[datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
),
)
result = df.groupby(Grouper(freq="1M")).sum()
result = df.groupby(Grouper(freq="1M")).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)

result = df.groupby([Grouper(freq="1M")]).sum()
result = df.groupby([Grouper(freq="1M")]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)

expected.index = expected.index.shift(1)
assert expected.index.freq == offsets.MonthEnd()
result = df.groupby(Grouper(freq="1M", key="Date")).sum()
result = df.groupby(Grouper(freq="1M", key="Date")).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)

result = df.groupby([Grouper(freq="1M", key="Date")]).sum()
result = df.groupby([Grouper(freq="1M", key="Date")]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"])
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/resample/test_resample_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def test_groupby_resample_on_api():

expected = df.set_index("dates").groupby("key").resample("D").mean()

result = df.groupby("key").resample("D", on="dates").mean()
result = df.groupby("key").resample("D", on="dates").mean()["values"].to_frame()
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -169,7 +169,7 @@ def tests_skip_nuisance(test_frame):
tm.assert_frame_equal(result, expected)

expected = r[["A", "B", "C"]].sum()
result = r.sum()
result = r.sum()[["A", "B", "C"]]
tm.assert_frame_equal(result, expected)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/resample/test_resampler_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ def test_resample_groupby_agg():
df["date"] = pd.to_datetime(df["date"])

resampled = df.groupby("cat").resample("Y", on="date")
expected = resampled.sum()
expected = resampled.sum()["num"].to_frame()
result = resampled.agg({"num": "sum"})

tm.assert_frame_equal(result, expected)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,7 @@ def test_mixed_type_join_with_suffix(self):
df.insert(5, "dt", "foo")

grouped = df.groupby("id")
mn = grouped.mean()
mn = grouped.mean(numeric_only=True)
cn = grouped.count()

# it works!
Expand Down
24 changes: 17 additions & 7 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -898,14 +898,18 @@ def _check_output(

# to help with a buglet
self.data.columns = [k * 2 for k in self.data.columns]
table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc=np.mean)
table = self.data[["AA", "DD", "EE", "FF"]].pivot_table(
index=["AA"], margins=True, aggfunc=np.mean
)
for value_col in table.columns:
totals = table.loc[("All", ""), value_col]
totals = table.loc[("All"), value_col]
assert totals == self.data[value_col].mean()

table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean")
table = self.data[["AA", "DD", "EE", "FF"]].pivot_table(
index=["AA"], margins=True, aggfunc="mean"
)
for item in ["DD", "EE", "FF"]:
totals = table.loc[("All", ""), item]
totals = table.loc[("All"), item]
assert totals == self.data[item].mean()

@pytest.mark.parametrize(
Expand Down Expand Up @@ -959,7 +963,9 @@ def test_margin_with_only_columns_defined(
}
)

result = df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc)
result = df[["A", "B", "D", "E"]].pivot_table(
columns=columns, margins=True, aggfunc=aggfunc
)
expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns)

tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -1984,8 +1990,12 @@ def test_pivot_string_as_func(self):
def test_pivot_string_func_vs_func(self, f, f_numpy):
# GH #18713
# for consistency purposes
result = pivot_table(self.data, index="A", columns="B", aggfunc=f)
expected = pivot_table(self.data, index="A", columns="B", aggfunc=f_numpy)
result = pivot_table(
self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f
)
expected = pivot_table(
self.data[["D", "E", "F"]], index="D", columns="E", aggfunc=f_numpy
)
tm.assert_frame_equal(result, expected)

@pytest.mark.slow
Expand Down