-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
fix Rolling for multi-index and reversed index. #28297
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
ce0a5e9
1e1af78
de83db4
f78cf28
859bfea
908342d
bc77e09
33c3cca
c5a062f
1ec868b
753d01c
1b46dc5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -126,7 +126,7 @@ def _create_blocks(self): | |
obj = self._selected_obj | ||
|
||
# filter out the on from the object | ||
if self.on is not None: | ||
if self.on is not None and not isinstance(self.on, Index): | ||
if obj.ndim == 2: | ||
obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) | ||
blocks = obj._to_dict_of_blocks(copy=False).values() | ||
|
@@ -1651,18 +1651,19 @@ def is_datetimelike(self): | |
|
||
@cache_readonly | ||
def _on(self): | ||
|
||
if self.on is None: | ||
if self.axis == 0: | ||
return self.obj.index | ||
elif self.axis == 1: | ||
return self.obj.columns | ||
elif isinstance(self.on, Index): | ||
return self.on | ||
elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns: | ||
return Index(self.obj[self.on]) | ||
else: | ||
raise ValueError( | ||
"invalid on specified as {0}, " | ||
"must be a column (if DataFrame) " | ||
"must be a column (of DataFrame), an Index " | ||
"or None".format(self.on) | ||
) | ||
|
||
|
@@ -1706,10 +1707,12 @@ def validate(self): | |
|
||
def _validate_monotonic(self): | ||
""" | ||
Validate on is_monotonic. | ||
Validate monotonic (increasing or decreasing). | ||
""" | ||
if not self._on.is_monotonic: | ||
formatted = self.on or "index" | ||
if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Surprised this doesn't match the previous definition of just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You mean updating |
||
formatted = self.on | ||
if self.on is None: | ||
formatted = "index" | ||
raise ValueError("{0} must be monotonic".format(formatted)) | ||
|
||
def _validate_freq(self): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,15 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
from pandas import DataFrame, Index, Series, Timestamp, date_range, to_datetime | ||
from pandas import ( | ||
DataFrame, | ||
Index, | ||
MultiIndex, | ||
Series, | ||
Timestamp, | ||
date_range, | ||
to_datetime, | ||
) | ||
import pandas.util.testing as tm | ||
|
||
import pandas.tseries.offsets as offsets | ||
|
@@ -105,8 +113,15 @@ def test_monotonic_on(self): | |
assert df.index.is_monotonic | ||
df.rolling("2s").sum() | ||
|
||
# non-monotonic | ||
df.index = reversed(df.index.tolist()) | ||
def test_non_monotonic_on(self): | ||
df = DataFrame( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add the issue number as a comment |
||
{"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} | ||
) | ||
df.set_index("A", inplace=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't use inplace in tests at all, rather |
||
non_monotonic_index = df.index.to_list() | ||
non_monotonic_index[0] = non_monotonic_index[3] | ||
df.index = non_monotonic_index | ||
|
||
assert not df.index.is_monotonic | ||
|
||
with pytest.raises(ValueError): | ||
|
@@ -690,3 +705,34 @@ def test_rolling_cov_offset(self): | |
|
||
expected2 = ss.rolling(3, min_periods=1).cov() | ||
tm.assert_series_equal(result, expected2) | ||
|
||
def test_rolling_on_decreasing_index(self): | ||
# GH-19248 | ||
index = [ | ||
Timestamp("20190101 09:00:00"), | ||
Timestamp("20190101 09:00:02"), | ||
Timestamp("20190101 09:00:03"), | ||
Timestamp("20190101 09:00:05"), | ||
Timestamp("20190101 09:00:06"), | ||
] | ||
|
||
df = DataFrame({"column": [3, 4, 4, 2, 1]}, index=reversed(index)) | ||
result = df.rolling("2s").min() | ||
expected = DataFrame( | ||
{"column": [3.0, 3.0, 3.0, 2.0, 1.0]}, index=reversed(index) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems to me that the 2 seconds time window is not respected, but the minimum is taken over the whole dataframe. To my understanding, the excpected results would be [3, 4, 2, 1, 1] (closed interval limits) or [3, 4, 2, 2, 1] (open interval end). Even if you consider the next 2 seconds in time order, i.e. the other way round, you would have [3, 3, 4, 2, 1]. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same issue found here. By using df.rolling().apply(func), we can print the window inside func and it is the while dataframe till current offset. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right, the expected result is wrong. I debugged it a bit and found out that EDIT: Here it comes: #32386 |
||
) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_rolling_on_multi_index_level(self): | ||
# GH-15584 | ||
df = DataFrame( | ||
{"column": range(6)}, | ||
index=MultiIndex.from_product( | ||
[date_range("20190101", periods=3), range(2)], names=["date", "seq"] | ||
), | ||
) | ||
result = df.rolling("10d", on=df.index.get_level_values("date")).sum() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is a very odd way of calling this as it should be a scalar. why do you think we should allow this way? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not sure if I understand you. how about passing something like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. right what I mean, is that this doesn't make rolling any more friendly to a Multi-index. we would need to add a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
expected = DataFrame( | ||
{"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index | ||
) | ||
tm.assert_frame_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you update the doc-string (and type annoation if its there) for on in Rolling