Skip to content

BUG: groupby then resample on column gives incorrect results if the index is out of order #59408

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ These improvements also fixed certain bugs in groupby:
- :meth:`.DataFrameGroupBy.agg` would fail when there are multiple groupings, unobserved groups, and ``as_index=False`` (:issue:`36698`)
- :meth:`.DataFrameGroupBy.groups` with ``sort=False`` would sort groups; they now occur in the order they are observed (:issue:`56966`)
- :meth:`.DataFrameGroupBy.nunique` would fail when there are multiple groupings, unobserved groups, and ``as_index=False`` (:issue:`52848`)
- :meth:`.DataFrameGroupBy.resample` with an ``on`` value that is not ``None`` would have incorrect values when the index is out of order (:issue:`59350`)
- :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`)
- :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`)

Expand Down
10 changes: 9 additions & 1 deletion pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from pandas.core.indexes.api import (
Index,
MultiIndex,
RangeIndex,
default_index,
)
from pandas.core.series import Series
Expand Down Expand Up @@ -348,8 +349,15 @@ def _set_grouper(
reverse_indexer = self._indexer.argsort()
unsorted_ax = self._grouper.take(reverse_indexer)
ax = unsorted_ax.take(obj.index)
else:
elif isinstance(obj.index, RangeIndex):
# Standard case for RangeIndex
ax = self._grouper.take(obj.index)
else:
# GH 59350
# If index is not RangeIndex and not sorted here,
# avoid re-taking based on potentially mis-ordered obj.index.
# self._grouper should already align with obj's values via key.
ax = self._grouper
else:
if key not in obj._info_axis:
raise KeyError(f"The grouper name {key} is not found")
Expand Down
92 changes: 91 additions & 1 deletion pandas/tests/resample/test_resampler_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,8 @@ def test_getitem_multiple():
def test_groupby_resample_on_api_with_getitem():
# GH 17813
df = DataFrame(
{"id": list("aabbb"), "date": date_range("1-1-2016", periods=5), "data": 1}
{"id": list("aabbb"), "date": date_range("1-1-2016", periods=5), "data": 1},
index=list("xyzab"),
)
exp = df.set_index("date").groupby("id").resample("2D")["data"].sum()
result = df.groupby("id").resample("2D", on="date")["data"].sum()
Expand Down Expand Up @@ -672,3 +673,92 @@ def test_groupby_resample_on_index_with_list_of_keys_missing_column():
rs = gb.resample("2D")
with pytest.raises(KeyError, match="Columns not found"):
rs[["val_not_in_dataframe"]]


def test_groupby_resample_after_set_index_and_not_on_column():
# GH 59350
# Companion case to the ``on=`` tests: resample is called WITHOUT ``on``.
# The frame is first built with an explicit, non-monotonic integer index
# ([1, 0]); ``set_index("datetime")`` then replaces that index, so the
# resample below operates on the datetime index of each group.
df = DataFrame(
data={
"datetime": [
pd.to_datetime("2024-07-30T00:00Z"),
pd.to_datetime("2024-07-30T00:01Z"),
],
"group": ["A", "A"],
"numbers": [100, 200],
},
index=[1, 0],
).set_index("datetime")
gb = df.groupby("group")
rs = gb.resample("1min")
result = rs.aggregate({"numbers": "sum"})

# Expected: one row per (group, 1-minute bin); each bin holds exactly one
# input row, so the sums equal the original "numbers" values in time order.
index = pd.MultiIndex.from_arrays(
[
["A", "A"],
[pd.to_datetime("2024-07-30T00:00Z"), pd.to_datetime("2024-07-30T00:01Z")],
],
names=[
"group",
"datetime",
],
)
expected = DataFrame({"numbers": [100, 200]}, index=index)

tm.assert_frame_equal(result, expected)


# Each parametrized frame carries the same three columns' worth of data
# (two rows, one per minute, both in group "A") but a different index.
@pytest.mark.parametrize(
"df",
[
# Case 1: explicit integer index given in descending order ([1, 0]).
DataFrame(
data={
"datetime": [
pd.to_datetime("2024-07-30T00:00Z"),
pd.to_datetime("2024-07-30T00:01Z"),
],
"group": ["A", "A"],
"numbers": [100, 200],
},
index=[1, 0],
),
# Case 2: the grouping column "group" is moved into the index.
DataFrame(
data={
"datetime": [
pd.to_datetime("2024-07-30T00:00Z"),
pd.to_datetime("2024-07-30T00:01Z"),
],
"group": ["A", "A"],
"numbers": [100, 200],
},
).set_index("group"),
# Case 3: "datetime" becomes the index but, with drop=False, also
# remains available as a column.
DataFrame(
data={
"datetime": [
pd.to_datetime("2024-07-30T00:00Z"),
pd.to_datetime("2024-07-30T00:01Z"),
],
"group": ["A", "A"],
"numbers": [100, 200],
},
).set_index("datetime", drop=False),
],
)
def test_groupby_resample_on_column_when_index_is_unusual(df):
# GH 59350
# Resample by the "datetime" key via ``on=`` rather than by the frame's
# index; the same expected frame is asserted for every parametrized input.
gb = df.groupby("group")
rs = gb.resample("1min", on="datetime")
result = rs.aggregate({"numbers": "sum"})

# Expected: one row per (group, 1-minute bin) with the original values,
# 100 for the 00:00 bin and 200 for the 00:01 bin.
index = pd.MultiIndex.from_arrays(
[
["A", "A"],
[pd.to_datetime("2024-07-30T00:00Z"), pd.to_datetime("2024-07-30T00:01Z")],
],
names=[
"group",
"datetime",
],
)
expected = DataFrame({"numbers": [100, 200]}, index=index)

tm.assert_frame_equal(result, expected)
Loading