Skip to content

Rolling on DataFrameGroupBy duplicated index column when part of the grouping cols is from index #36816

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
- Bug in :meth:`Rolling.sum()` returned wrong values when dtypes were mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`)
- Bug in :meth:`DataFrameGroupBy.rolling` duplicates index columns when a part of the grouping columns was from the index (:issue:`36794`)

Reshaping
^^^^^^^^^
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2209,6 +2209,20 @@ def _apply(
grouped_index_name = [*grouped_object_index.names]
groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings]
result_index_names = groupby_keys + grouped_index_name
drop_levels = []
obj = self._groupby._selected_obj
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is way too special-cased; we need to figure out why this is happening and adjust the logic more generally.

Copy link
Member Author

@phofl phofl Oct 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is happening because of

        for key, values in self._groupby.grouper.indices.items():
            for value in values:
                data = [
                    *com.maybe_make_list(key),
                    *com.maybe_make_list(grouped_object_index[value]),
                ]
                result_index_data.append(tuple(data))

        result_index = MultiIndex.from_tuples(
            result_index_data, names=result_index_names
        )

which is a few lines below. `self._groupby.grouper.indices.items()` represents the grouping columns and `grouped_object_index[value]` the index used for the rolling operation. If parts of the grouping columns are from the index, we duplicate these here. As I wrote, I could not figure out a simple way to avoid this within the loop.

# We have to handle an unnamed Series differently, because we cannot tell
# whether the grouping column was the index or the Series column.
if (
isinstance(obj, ABCDataFrame)
or obj.name is not None
or obj.index.names != [None]
):
drop_levels = [
i for i, name in enumerate(groupby_keys) if name in grouped_index_name
]
elif self._groupby.level is not None:
drop_levels = com.maybe_make_list(self._groupby.level)

result_index_data = []
for key, values in self._groupby.grouper.indices.items():
Expand All @@ -2222,6 +2236,7 @@ def _apply(
result_index = MultiIndex.from_tuples(
result_index_data, names=result_index_names
)
result_index = result_index.droplevel(drop_levels)
result.index = result_index
return result

Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/window/test_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,3 +416,41 @@ def test_groupby_rolling_empty_frame(self):
result = expected.groupby(["s1", "s2"]).rolling(window=1).sum()
expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None])
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("groupings", [{"level": 0}, {"by": "groupby_col"}])
def test_groupby_rolling_index_in_by_columns(self, groupings):
    """Check that grouping on an index level does not repeat it in the result.

    The frame is indexed by ``groupby_col`` and grouped either by level
    position or by the index name; in both cases the rolling sum is expected
    to carry a single ``groupby_col`` index.

    GH: 36794
    """
    group_labels = [1, 1, 1, 2, 2, 2]
    frame = pd.DataFrame(
        {"groupby_col": group_labels, "agg_col": [1, 1, 0, 0, 1, 0]}
    )
    frame = frame.set_index("groupby_col")

    rolled = frame.groupby(**groupings).rolling(2).sum()

    # The first row of each group has no full window of 2, hence NaN.
    expected = pd.DataFrame(
        {
            "groupby_col": group_labels,
            "agg_col": [np.nan, 2.0, 1.0, np.nan, 1.0, 1.0],
        }
    ).set_index("groupby_col")
    tm.assert_frame_equal(rolled, expected)

@pytest.mark.parametrize("index_name", ["index", None])
def test_groupby_rolling_unnamed_series(self, index_name):
    """Check an unnamed Series grouped by its own values keeps both levels.

    The grouping key is the Series itself, not its index, so the expected
    result index has the group key level (unnamed) followed by the original
    index level.

    GH: 36794
    """
    source_index = pd.Index([0, 1, 2, 3], name=index_name)
    ds = pd.Series([1, 2, 3, 4], index=source_index)

    rolled = ds.groupby(ds).rolling(2).sum()

    # All values are distinct, so each group has one row and a window of 2
    # never fills.
    expected_index = pd.MultiIndex.from_tuples(
        [(1, 0), (2, 1), (3, 2), (4, 3)], names=[None, index_name]
    )
    expected = pd.Series([np.nan] * 4, index=expected_index)
    tm.assert_series_equal(rolled, expected)

@pytest.mark.parametrize("groupings", [{"level": 0}, {"by": "index"}])
def test_groupby_rolling_index_of_unnamed_series(self, groupings):
    """Check an unnamed Series grouped by its index keeps a single index level.

    The Series is grouped either by level position or by the index name
    ``"index"``; the expected result reuses the original flat index.

    GH: 36794
    """
    values = [1, 2, 3, 4]
    ds = pd.Series(values, index=pd.Index([0, 0, 1, 1], name="index"))

    rolled = ds.groupby(**groupings).rolling(2).sum()

    # Within each group only the second row has a complete window of 2.
    expected_index = pd.Index([0, 0, 1, 1], name="index")
    expected = pd.Series([np.nan, 3.0, np.nan, 7.0], index=expected_index)
    tm.assert_series_equal(rolled, expected)
4 changes: 1 addition & 3 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,9 +675,7 @@ def test_iter_rolling_datetime(expected, expected_index, window):
[
(
{"level": 0},
pd.MultiIndex.from_tuples(
[(0, 0), (0, 0), (1, 1), (1, 1), (1, 1)], names=[None, None]
),
pd.Index([0, 0, 1, 1, 1]),
),
(
{"by": "X"},
Expand Down