Skip to content

ENH: Implement __iter__ for Rolling and Expanding #34201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
May 17, 2020
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ Other enhancements
:class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`,
and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`).
- :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`).
- Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable (:issue:`11704`)

.. ---------------------------------------------------------------------------
Expand Down
27 changes: 25 additions & 2 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,8 +247,31 @@ def __repr__(self) -> str:
return f"{self._window_type} [{attrs}]"

def __iter__(self):
url = "https://github.com/pandas-dev/pandas/issues/11704"
raise NotImplementedError(f"See issue #11704 {url}")
window = self._get_window(win_type=None)

blocks, obj = self._create_blocks()
index = self._get_window_indexer(window=window)

# Choose the min between min_periods and window to determine the output size
if self.min_periods is None:
iter_threshold = window
else:
iter_threshold = min(window, self.min_periods)

start, end = index.get_window_bounds(
num_values=len(obj),
min_periods=self.min_periods,
center=self.center,
closed=self.closed,
)
# From get_window_bounds, those two should be equal in length of array
assert len(start) == len(end)

window_size = len(start)
for i in range(window_size):
result = obj.iloc[slice(start[i], end[i])]
if result.count().min() >= iter_threshold:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO since min_periods is supposed to influence the aggregation result, and we're just returning the window here, I think we should always return the result and not filter results based on iter_threshold

Copy link
Member Author

@charlesdong1991 charlesdong1991 May 16, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

emm, i am not sure about this actually, because df.rolling(window=2, min_periods=3).sum() for instance is not allowed, and will raise a ValueError because min_periods has to be equal or smaller than window in aggregation, that is why i think maybe it makes a bit more sense to have the minimum here between window and min_periods.

but I also do not know if this matters in iter? maybe raise an error here, does it sound more reasonable?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right I am thinking it doesn't matter in __iter__ because the aggregation hasn't happened yet (sum), and it's up to the user to decide what they want to do with each window.

From the user perspective, I can see a potential source of confusion if not all the windows are returned.

Copy link
Member Author

@charlesdong1991 charlesdong1991 May 16, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, leaving it to users sounds more convincing!

thanks, @mroeschke ! i have updated the PR to remove this check (also no error raising) here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good, thanks! Might want to document this behavior in computation.rst

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, added! thanks for the reviews! @mroeschke

yield result

def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray:
"""Convert input to numpy arrays for Cython routines"""
Expand Down
83 changes: 74 additions & 9 deletions pandas/tests/window/test_expanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,6 @@ def test_missing_minp_zero():
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame])
def test_iter_raises(klass):
    # GH 11704: https://github.com/pandas-dev/pandas/issues/11704
    # Asking for an iterator over an expanding window must raise.
    data = klass([1, 2, 3, 4])
    with pytest.raises(NotImplementedError):
        # The window object is built inside the context so that an error from
        # either the construction or the iter() call satisfies the check.
        iter(data.expanding(2))


def test_expanding_axis(axis_frame):
# see gh-23372.
df = DataFrame(np.ones((10, 20)))
Expand Down Expand Up @@ -131,3 +122,77 @@ def test_expanding_count_default_min_periods_with_null_values(constructor):
result = constructor(values).expanding().count()
expected = constructor(expected_counts)
tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "df,expected,min_periods",
    [
        (
            DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}),
            [({"A": [1, 2, 3], "B": [4, 5, 6]}, [0, 1, 2])],
            3,
        ),
        (
            DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}),
            [
                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
                ({"A": [1, 2, 3], "B": [4, 5, 6]}, [0, 1, 2]),
            ],
            2,
        ),
        (
            DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}),
            [
                ({"A": [1], "B": [4]}, [0]),
                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
                ({"A": [1, 2, 3], "B": [4, 5, 6]}, [0, 1, 2]),
            ],
            1,
        ),
        (DataFrame({"A": [1], "B": [4]}), [], 2),
        (DataFrame(), [({}, [])], 1),
        (
            DataFrame({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}),
            [({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [0, 1, 2])],
            3,
        ),
        (
            DataFrame({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}),
            [({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [0, 1, 2])],
            2,
        ),
        (
            DataFrame({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}),
            [
                ({"A": [1, np.nan], "B": [np.nan, 5]}, [0, 1]),
                ({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [0, 1, 2]),
            ],
            1,
        ),
    ],
)
def test_iter_expanding_dataframe(df, expected, min_periods):
    # GH 11704
    # Every expected entry is a (column data, index labels) pair; build the
    # frame that the matching window is compared against.
    expected_frames = [DataFrame(data, index=labels) for (data, labels) in expected]
    windows = df.expanding(min_periods)

    # NOTE(review): zip() stops at the shorter input, so a difference between
    # the number of expected frames and the number of yielded windows is not
    # detected by this loop.
    for expected_frame, window in zip(expected_frames, windows):
        tm.assert_frame_equal(window, expected_frame)


@pytest.mark.parametrize(
    "ser,expected,min_periods",
    [
        (Series([1, 2, 3]), [([1, 2, 3], [0, 1, 2])], 3),
        (Series([1, 2, 3]), [([1, 2], [0, 1]), ([1, 2, 3], [0, 1, 2])], 2),
        (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([1, 2, 3], [0, 1, 2])], 1),
        (Series([1, 2]), [([1, 2], [0, 1])], 2),
        (Series([np.nan, 2]), [([np.nan, 2], [0, np.nan])], 2),
        (Series([], dtype="int64"), [], 2),
    ],
)
def test_iter_expanding_series(ser, expected, min_periods):
    # GH 11704
    # Every expected entry is a (values, index labels) pair; build the series
    # that the matching window is compared against.
    expected_series = [Series(data, index=labels) for (data, labels) in expected]
    windows = ser.expanding(min_periods)

    # NOTE(review): zip() stops at the shorter input, so a difference between
    # the number of expected series and the number of yielded windows is not
    # detected by this loop.
    for expected_window, window in zip(expected_series, windows):
        tm.assert_series_equal(window, expected_window)
192 changes: 179 additions & 13 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pandas.util._test_decorators as td

import pandas as pd
from pandas import DataFrame, Series
from pandas import DataFrame, Series, date_range
import pandas._testing as tm
from pandas.core.window import Rolling

Expand Down Expand Up @@ -310,18 +310,6 @@ def test_multi_index_names():
assert result.index.names == [None, "1", "2"]


@pytest.mark.parametrize("klass", [pd.Series, pd.DataFrame])
def test_iter_raises(klass):
    # GH 11704: https://github.com/pandas-dev/pandas/issues/11704
    # Asking for an iterator over a rolling window must raise, and the error
    # text must point at the tracking issue.
    expected_message = (
        "See issue #11704 https://github.com/pandas-dev/pandas/issues/11704"
    )
    data = klass([1, 2, 3, 4])

    with pytest.raises(NotImplementedError, match=expected_message):
        # The window object is built inside the context so that an error from
        # either the construction or the iter() call satisfies the check.
        iter(data.rolling(2))


def test_rolling_axis_sum(axis_frame):
# see gh-23372.
df = DataFrame(np.ones((10, 20)))
Expand Down Expand Up @@ -470,3 +458,181 @@ def test_rolling_count_default_min_periods_with_null_values(constructor):
result = constructor(values).rolling(3).count()
expected = constructor(expected_counts)
tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "df,expected,window,min_periods",
    [
        (
            DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}),
            [({"A": [1, 2, 3], "B": [4, 5, 6]}, [0, 1, 2])],
            3,
            None,
        ),
        (
            DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}),
            [
                ({"A": [1], "B": [4]}, [0]),
                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
                ({"A": [2, 3], "B": [5, 6]}, [1, 2]),
            ],
            2,
            1,
        ),
        (
            DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}),
            [
                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
                ({"A": [2, 3], "B": [5, 6]}, [1, 2]),
            ],
            2,
            3,
        ),
        (
            DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}),
            [
                ({"A": [1], "B": [4]}, [0]),
                ({"A": [2], "B": [5]}, [1]),
                ({"A": [3], "B": [6]}, [2]),
            ],
            1,
            1,
        ),
        (DataFrame({"A": [1], "B": [4]}), [], 2, None),
        (DataFrame({"A": [1], "B": [4]}), [], 2, 1),
        (DataFrame(), [({}, [])], 2, None),
        (DataFrame(), [({}, [])], 1, 2),
        (
            DataFrame({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}),
            [({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [0, 1, 2])],
            3,
            2,
        ),
    ],
)
def test_iter_rolling_dataframe(df, expected, window, min_periods):
    # GH 11704
    # Every expected entry is a (column data, index labels) pair; build the
    # frame that the matching window is compared against.
    expected_frames = [DataFrame(data, index=labels) for (data, labels) in expected]
    roller = df.rolling(window, min_periods=min_periods)

    # NOTE(review): zip() stops at the shorter input, so a difference between
    # the number of expected frames and the number of yielded windows is not
    # detected by this loop.
    for expected_frame, actual_frame in zip(expected_frames, roller):
        tm.assert_frame_equal(actual_frame, expected_frame)


@pytest.mark.parametrize(
    "expected,window",
    [
        (
            [
                ({"A": [1], "B": [4]}, [0]),
                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
                ({"A": [2, 3], "B": [5, 6]}, [1, 2]),
            ],
            "2D",
        ),
        (
            [
                ({"A": [1], "B": [4]}, [0]),
                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
                ({"A": [1, 2, 3], "B": [4, 5, 6]}, [0, 1, 2]),
            ],
            "3D",
        ),
        (
            [
                ({"A": [1], "B": [4]}, [0]),
                ({"A": [2], "B": [5]}, [1]),
                ({"A": [3], "B": [6]}, [2]),
            ],
            "1D",
        ),
    ],
)
def test_iter_rolling_on_dataframe(expected, window):
    # GH 11704
    # Column "C" holds daily timestamps and is the column the offset window
    # is rolled on.
    day_stamps = date_range(start="2016-01-01", periods=5, freq="D")
    df = DataFrame({"A": [1, 2, 3, 4, 5], "B": [4, 5, 6, 7, 8], "C": day_stamps})

    # Every expected entry is a (column data, index labels) pair.
    expected_frames = [DataFrame(data, index=labels) for (data, labels) in expected]
    roller = df.rolling(window, on="C")

    # NOTE(review): the frame has five rows while each case lists three
    # expected frames; zip() stops at the shorter input, so only the leading
    # windows are compared.
    for expected_frame, actual_frame in zip(expected_frames, roller):
        tm.assert_frame_equal(actual_frame, expected_frame)


@pytest.mark.parametrize(
    "ser,expected,window, min_periods",
    [
        (Series([1, 2, 3]), [([1, 2, 3], [0, 1, 2])], 3, None),
        (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 1),
        (Series([1, 2, 3]), [([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 3),
        (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 0),
        (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 2),
        (Series([1, 2]), [([1], [0]), ([1, 2], [0, 1])], 2, 0),
        (Series([1, 2]), [([1, 2], [0, 1])], 2, 3),
        (Series([], dtype="int64"), [], 2, 1),
        (Series([], dtype="int64"), [], 2, 3),
    ],
)
def test_iter_rolling_series(ser, expected, window, min_periods):
    # GH 11704
    # Every expected entry is a (values, index labels) pair; build the series
    # that the matching window is compared against.
    expected_series = [Series(data, index=labels) for (data, labels) in expected]
    roller = ser.rolling(window, min_periods=min_periods)

    # NOTE(review): zip() stops at the shorter input, so a difference between
    # the number of expected series and the number of yielded windows is not
    # detected by this loop.
    for expected_window, actual_window in zip(expected_series, roller):
        tm.assert_series_equal(actual_window, expected_window)


@pytest.mark.parametrize(
    "expected,expected_index,window",
    [
        (
            [[0], [1], [2], [3], [4]],
            [
                date_range("2020-01-01", periods=1, freq="D"),
                date_range("2020-01-02", periods=1, freq="D"),
                date_range("2020-01-03", periods=1, freq="D"),
                date_range("2020-01-04", periods=1, freq="D"),
                date_range("2020-01-05", periods=1, freq="D"),
            ],
            "1D",
        ),
        (
            [[0], [0, 1], [1, 2], [2, 3], [3, 4]],
            [
                date_range("2020-01-01", periods=1, freq="D"),
                date_range("2020-01-01", periods=2, freq="D"),
                date_range("2020-01-02", periods=2, freq="D"),
                date_range("2020-01-03", periods=2, freq="D"),
                date_range("2020-01-04", periods=2, freq="D"),
            ],
            "2D",
        ),
        (
            [[0], [0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4]],
            [
                date_range("2020-01-01", periods=1, freq="D"),
                date_range("2020-01-01", periods=2, freq="D"),
                date_range("2020-01-01", periods=3, freq="D"),
                date_range("2020-01-02", periods=3, freq="D"),
                date_range("2020-01-03", periods=3, freq="D"),
            ],
            "3D",
        ),
    ],
)
def test_iter_rolling_datetime(expected, expected_index, window):
    # GH 11704
    # Five consecutive integers indexed by five consecutive days.
    day_index = date_range(start="2020-01-01", periods=5, freq="D")
    ser = Series(range(5), index=day_index)

    # Pair each list of expected values with the index it should carry.
    expected_series = [
        Series(data, index=labels) for (data, labels) in zip(expected, expected_index)
    ]
    roller = ser.rolling(window)

    # NOTE(review): zip() stops at the shorter input, so a difference between
    # the number of expected series and the number of yielded windows is not
    # detected by this loop.
    for expected_window, actual_window in zip(expected_series, roller):
        tm.assert_series_equal(actual_window, expected_window)