From 05955671dc3976a36e36266b5bd5feefc1075683 Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sat, 16 Oct 2021 23:25:59 +0100 Subject: [PATCH 01/15] BUG: 43909 - check monoticity of rolling groupby --- pandas/core/window/rolling.py | 79 +++++--------------- pandas/tests/window/test_rolling.py | 107 ++++++++-------------------- 2 files changed, 48 insertions(+), 138 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 2b8ed3c97d026..8e3460657f38b 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -662,13 +662,7 @@ def _apply( numba_args: tuple[Any, ...] = (), **kwargs, ) -> DataFrame | Series: - result = super()._apply( - func, - name, - numba_cache_key, - numba_args, - **kwargs, - ) + result = super()._apply(func, name, numba_cache_key, numba_args, **kwargs,) # Reconstruct the resulting MultiIndex # 1st set of levels = group by labels # 2nd set of levels = original DataFrame/Series index @@ -1263,9 +1257,7 @@ def apply( raise ValueError("engine must be either 'numba' or 'cython'") return self._apply( - apply_func, - numba_cache_key=numba_cache_key, - numba_args=numba_args, + apply_func, numba_cache_key=numba_cache_key, numba_args=numba_args, ) def _generate_cython_apply_func( @@ -1307,10 +1299,7 @@ def sum( func = np.nansum return self.apply( - func, - raw=True, - engine=engine, - engine_kwargs=engine_kwargs, + func, raw=True, engine=engine, engine_kwargs=engine_kwargs, ) window_func = window_aggregations.roll_sum return self._apply(window_func, name="sum", **kwargs) @@ -1330,10 +1319,7 @@ def max( func = np.nanmax return self.apply( - func, - raw=True, - engine=engine, - engine_kwargs=engine_kwargs, + func, raw=True, engine=engine, engine_kwargs=engine_kwargs, ) window_func = window_aggregations.roll_max return self._apply(window_func, name="max", **kwargs) @@ -1353,10 +1339,7 @@ def min( func = np.nanmin return self.apply( - func, - raw=True, - engine=engine, - engine_kwargs=engine_kwargs, 
+ func, raw=True, engine=engine, engine_kwargs=engine_kwargs, ) window_func = window_aggregations.roll_min return self._apply(window_func, name="min", **kwargs) @@ -1373,10 +1356,7 @@ def mean( if self.method == "table": func = generate_manual_numpy_nan_agg_with_axis(np.nanmean) return self.apply( - func, - raw=True, - engine=engine, - engine_kwargs=engine_kwargs, + func, raw=True, engine=engine, engine_kwargs=engine_kwargs, ) else: from pandas.core._numba.kernels import sliding_mean @@ -1398,10 +1378,7 @@ def median( func = np.nanmedian return self.apply( - func, - raw=True, - engine=engine, - engine_kwargs=engine_kwargs, + func, raw=True, engine=engine, engine_kwargs=engine_kwargs, ) window_func = window_aggregations.roll_median_c return self._apply(window_func, name="median", **kwargs) @@ -1413,39 +1390,23 @@ def std(self, ddof: int = 1, *args, **kwargs): def zsqrt_func(values, begin, end, min_periods): return zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) - return self._apply( - zsqrt_func, - name="std", - **kwargs, - ) + return self._apply(zsqrt_func, name="std", **kwargs,) def var(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = partial(window_aggregations.roll_var, ddof=ddof) - return self._apply( - window_func, - name="var", - **kwargs, - ) + return self._apply(window_func, name="var", **kwargs,) def skew(self, **kwargs): window_func = window_aggregations.roll_skew - return self._apply( - window_func, - name="skew", - **kwargs, - ) + return self._apply(window_func, name="skew", **kwargs,) def sem(self, ddof: int = 1, *args, **kwargs): return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5) def kurt(self, **kwargs): window_func = window_aggregations.roll_kurt - return self._apply( - window_func, - name="kurt", - **kwargs, - ) + return self._apply(window_func, name="kurt", **kwargs,) def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): if quantile == 1.0: @@ 
-2223,9 +2184,7 @@ def kurt(self, **kwargs): ) def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): return super().quantile( - quantile=quantile, - interpolation=interpolation, - **kwargs, + quantile=quantile, interpolation=interpolation, **kwargs, ) @doc( @@ -2296,12 +2255,7 @@ def rank( pct: bool = False, **kwargs, ): - return super().rank( - method=method, - ascending=ascending, - pct=pct, - **kwargs, - ) + return super().rank(method=method, ascending=ascending, pct=pct, **kwargs,) @doc( template_header, @@ -2513,8 +2467,9 @@ def _get_window_indexer(self) -> GroupbyIndexer: def _validate_monotonic(self): """ Validate that on is monotonic; - in this case we have to check only for nans, because - monotonicity was already validated at a higher level. """ - if self._on.hasnans: + if ( + not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing) + or self._on.hasnans + ): self._raise_monotonic_error() diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index d58eeaa7cbcb1..17c1f7d0b21d5 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -84,15 +84,13 @@ def test_constructor_with_timedelta_window(window): # GH 15440 n = 10 df = DataFrame( - {"value": np.arange(n)}, - index=date_range("2015-12-24", periods=n, freq="D"), + {"value": np.arange(n)}, index=date_range("2015-12-24", periods=n, freq="D"), ) expected_data = np.append([0.0, 1.0], np.arange(3.0, 27.0, 3)) result = df.rolling(window=window).sum() expected = DataFrame( - {"value": expected_data}, - index=date_range("2015-12-24", periods=n, freq="D"), + {"value": expected_data}, index=date_range("2015-12-24", periods=n, freq="D"), ) tm.assert_frame_equal(result, expected) expected = df.rolling("3D").sum() @@ -104,8 +102,7 @@ def test_constructor_timedelta_window_and_minperiods(window, raw): # GH 15305 n = 10 df = DataFrame( - {"value": np.arange(n)}, - index=date_range("2017-08-08", periods=n, 
freq="D"), + {"value": np.arange(n)}, index=date_range("2017-08-08", periods=n, freq="D"), ) expected = DataFrame( {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, @@ -136,13 +133,9 @@ def test_closed_fixed(closed, arithmetic_win_operators): df_fixed = DataFrame({"A": [0, 1, 2, 3, 4]}) df_time = DataFrame({"A": [0, 1, 2, 3, 4]}, index=date_range("2020", periods=5)) - result = getattr( - df_fixed.rolling(2, closed=closed, min_periods=1), - func_name, - )() + result = getattr(df_fixed.rolling(2, closed=closed, min_periods=1), func_name,)() expected = getattr( - df_time.rolling("2D", closed=closed, min_periods=1), - func_name, + df_time.rolling("2D", closed=closed, min_periods=1), func_name, )().reset_index(drop=True) tm.assert_frame_equal(result, expected) @@ -213,8 +206,7 @@ def test_datetimelike_centered_selections( kwargs = {} result = getattr( - df_time.rolling("2D", closed=closed, min_periods=1, center=True), - func_name, + df_time.rolling("2D", closed=closed, min_periods=1, center=True), func_name, )(**kwargs) tm.assert_frame_equal(result, expected, check_dtype=False) @@ -345,8 +337,7 @@ def test_closed_one_entry(func): def test_closed_one_entry_groupby(func): # GH24718 ser = DataFrame( - data={"A": [1, 1, 2], "B": [3, 2, 1]}, - index=date_range("2000", periods=3), + data={"A": [1, 1, 2], "B": [3, 2, 1]}, index=date_range("2000", periods=3), ) result = getattr( ser.groupby("A", sort=False)["B"].rolling("10D", closed="left"), func @@ -373,8 +364,7 @@ def test_closed_one_entry_groupby(func): def test_closed_min_max_datetime(input_dtype, func, closed, expected): # see gh-21704 ser = Series( - data=np.arange(10).astype(input_dtype), - index=date_range("2000", periods=10), + data=np.arange(10).astype(input_dtype), index=date_range("2000", periods=10), ) result = getattr(ser.rolling("3D", closed=closed), func)() @@ -849,18 +839,8 @@ def test_iter_rolling_on_dataframe_unordered(): 3, 1, ), - ( - Series([1, 2, 3]), - [([1], [0]), ([1, 2], [0, 1]), ([2, 3], 
[1, 2])], - 2, - 1, - ), - ( - Series([1, 2, 3]), - [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], - 2, - 2, - ), + (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 1,), + (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 2,), (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 0), (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 1), (Series([1, 2]), [([1], [0]), ([1, 2], [0, 1])], 2, 0), @@ -1032,18 +1012,7 @@ def test_rolling_numerical_too_large_numbers(): ds[2] = -9e33 result = ds.rolling(5).mean() expected = Series( - [ - np.nan, - np.nan, - np.nan, - np.nan, - -1.8e33, - -1.8e33, - -1.8e33, - 5.0, - 6.0, - 7.0, - ], + [np.nan, np.nan, np.nan, np.nan, -1.8e33, -1.8e33, -1.8e33, 5.0, 6.0, 7.0,], index=dates, ) tm.assert_series_equal(result, expected) @@ -1059,8 +1028,7 @@ def test_rolling_mixed_dtypes_axis_1(func, value): df["c"] = 1.0 result = getattr(df.rolling(window=2, min_periods=1, axis=1), func)() expected = DataFrame( - {"a": [1.0, 1.0], "b": [value, value], "c": [value, value]}, - index=[1, 2], + {"a": [1.0, 1.0], "b": [value, value], "c": [value, value]}, index=[1, 2], ) tm.assert_frame_equal(result, expected) @@ -1086,8 +1054,7 @@ def test_rolling_axis_one_with_nan(): @pytest.mark.parametrize( - "value", - ["test", to_datetime("2019-12-31"), to_timedelta("1 days 06:05:01.00003")], + "value", ["test", to_datetime("2019-12-31"), to_timedelta("1 days 06:05:01.00003")], ) def test_rolling_axis_1_non_numeric_dtypes(value): # GH: 20649 @@ -1277,11 +1244,7 @@ def test_rolling_decreasing_indices_centered(window, closed, expected, frame_or_ @pytest.mark.parametrize( - "window,expected", - [ - ("1ns", [1.0, 1.0, 1.0, 1.0]), - ("3ns", [2.0, 3.0, 3.0, 2.0]), - ], + "window,expected", [("1ns", [1.0, 1.0, 1.0, 1.0]), ("3ns", [2.0, 3.0, 3.0, 2.0]),], ) def test_rolling_center_nanosecond_resolution( window, closed, expected, frame_or_series @@ -1309,14 +1272,8 @@ def 
test_rolling_center_nanosecond_resolution( 318.0, ], ), - ( - "mean", - [float("nan"), 7.5, float("nan"), 21.5, 6.0, 9.166667, 13.0, 17.5], - ), - ( - "sum", - [float("nan"), 30.0, float("nan"), 86.0, 30.0, 55.0, 91.0, 140.0], - ), + ("mean", [float("nan"), 7.5, float("nan"), 21.5, 6.0, 9.166667, 13.0, 17.5],), + ("sum", [float("nan"), 30.0, float("nan"), 86.0, 30.0, 55.0, 91.0, 140.0],), ( "skew", [ @@ -1380,10 +1337,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed): @pytest.mark.parametrize( ("index", "window"), - [ - ([0, 1, 2, 3, 4], 2), - (date_range("2001-01-01", freq="D", periods=5), "2D"), - ], + [([0, 1, 2, 3, 4], 2), (date_range("2001-01-01", freq="D", periods=5), "2D"),], ) def test_rolling_corr_timedelta_index(index, window): # GH: 31286 @@ -1417,6 +1371,17 @@ def test_groupby_rolling_nan_included(): tm.assert_frame_equal(result, expected) +def test_groupby_rolling_non_monotonic(): + # GH 43909 + shuffled = [3, 0, 1, 2] + sec = 1_000_000_000 + df = DataFrame( + [{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled] + ) + with pytest.raises(ValueError, match=r".* must be monotonic"): + df.groupby("c").rolling(on="t", window="3s") + + @pytest.mark.parametrize("method", ["skew", "kurt"]) def test_rolling_skew_kurt_numerical_stability(method): # GH#6929 @@ -1474,13 +1439,7 @@ def test_rolling_var_floating_artifact_precision(): def test_rolling_std_small_values(): # GH 37051 - s = Series( - [ - 0.00000054, - 0.00000053, - 0.00000054, - ] - ) + s = Series([0.00000054, 0.00000053, 0.00000054,]) result = s.rolling(2).std() expected = Series([np.nan, 7.071068e-9, 7.071068e-9]) tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15) @@ -1524,10 +1483,7 @@ def test_rolling_mean_all_nan_window_floating_artifacts(start, exp_values): 0.005, 0.102500, ] - expected = DataFrame( - values, - index=list(range(start, len(values) + start)), - ) + expected = DataFrame(values, index=list(range(start, len(values) + start)),) 
result = df.iloc[start:].rolling(5, min_periods=0).mean() tm.assert_frame_equal(result, expected) @@ -1552,8 +1508,7 @@ def test_rolling_float_dtype(float_numpy_dtype): # GH#42452 df = DataFrame({"A": range(5), "B": range(10, 15)}, dtype=float_numpy_dtype) expected = DataFrame( - {"A": [np.nan] * 5, "B": range(10, 20, 2)}, - dtype=float_numpy_dtype, + {"A": [np.nan] * 5, "B": range(10, 20, 2)}, dtype=float_numpy_dtype, ) result = df.rolling(2, axis=1).sum() tm.assert_frame_equal(result, expected, check_dtype=False) From 247e8f3831c93dfeb83e3f410ea375d0537ea989 Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sat, 16 Oct 2021 22:38:04 +0000 Subject: [PATCH 02/15] Fixes from pre-commit [automated commit] --- pandas/core/window/rolling.py | 72 ++++++++++++++++++---- pandas/tests/window/test_rolling.py | 96 +++++++++++++++++++++++------ 2 files changed, 135 insertions(+), 33 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 8e3460657f38b..24ac7c040a45d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -662,7 +662,13 @@ def _apply( numba_args: tuple[Any, ...] 
= (), **kwargs, ) -> DataFrame | Series: - result = super()._apply(func, name, numba_cache_key, numba_args, **kwargs,) + result = super()._apply( + func, + name, + numba_cache_key, + numba_args, + **kwargs, + ) # Reconstruct the resulting MultiIndex # 1st set of levels = group by labels # 2nd set of levels = original DataFrame/Series index @@ -1257,7 +1263,9 @@ def apply( raise ValueError("engine must be either 'numba' or 'cython'") return self._apply( - apply_func, numba_cache_key=numba_cache_key, numba_args=numba_args, + apply_func, + numba_cache_key=numba_cache_key, + numba_args=numba_args, ) def _generate_cython_apply_func( @@ -1299,7 +1307,10 @@ def sum( func = np.nansum return self.apply( - func, raw=True, engine=engine, engine_kwargs=engine_kwargs, + func, + raw=True, + engine=engine, + engine_kwargs=engine_kwargs, ) window_func = window_aggregations.roll_sum return self._apply(window_func, name="sum", **kwargs) @@ -1319,7 +1330,10 @@ def max( func = np.nanmax return self.apply( - func, raw=True, engine=engine, engine_kwargs=engine_kwargs, + func, + raw=True, + engine=engine, + engine_kwargs=engine_kwargs, ) window_func = window_aggregations.roll_max return self._apply(window_func, name="max", **kwargs) @@ -1339,7 +1353,10 @@ def min( func = np.nanmin return self.apply( - func, raw=True, engine=engine, engine_kwargs=engine_kwargs, + func, + raw=True, + engine=engine, + engine_kwargs=engine_kwargs, ) window_func = window_aggregations.roll_min return self._apply(window_func, name="min", **kwargs) @@ -1356,7 +1373,10 @@ def mean( if self.method == "table": func = generate_manual_numpy_nan_agg_with_axis(np.nanmean) return self.apply( - func, raw=True, engine=engine, engine_kwargs=engine_kwargs, + func, + raw=True, + engine=engine, + engine_kwargs=engine_kwargs, ) else: from pandas.core._numba.kernels import sliding_mean @@ -1378,7 +1398,10 @@ def median( func = np.nanmedian return self.apply( - func, raw=True, engine=engine, engine_kwargs=engine_kwargs, + func, 
+ raw=True, + engine=engine, + engine_kwargs=engine_kwargs, ) window_func = window_aggregations.roll_median_c return self._apply(window_func, name="median", **kwargs) @@ -1390,23 +1413,39 @@ def std(self, ddof: int = 1, *args, **kwargs): def zsqrt_func(values, begin, end, min_periods): return zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) - return self._apply(zsqrt_func, name="std", **kwargs,) + return self._apply( + zsqrt_func, + name="std", + **kwargs, + ) def var(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = partial(window_aggregations.roll_var, ddof=ddof) - return self._apply(window_func, name="var", **kwargs,) + return self._apply( + window_func, + name="var", + **kwargs, + ) def skew(self, **kwargs): window_func = window_aggregations.roll_skew - return self._apply(window_func, name="skew", **kwargs,) + return self._apply( + window_func, + name="skew", + **kwargs, + ) def sem(self, ddof: int = 1, *args, **kwargs): return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5) def kurt(self, **kwargs): window_func = window_aggregations.roll_kurt - return self._apply(window_func, name="kurt", **kwargs,) + return self._apply( + window_func, + name="kurt", + **kwargs, + ) def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): if quantile == 1.0: @@ -2184,7 +2223,9 @@ def kurt(self, **kwargs): ) def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): return super().quantile( - quantile=quantile, interpolation=interpolation, **kwargs, + quantile=quantile, + interpolation=interpolation, + **kwargs, ) @doc( @@ -2255,7 +2296,12 @@ def rank( pct: bool = False, **kwargs, ): - return super().rank(method=method, ascending=ascending, pct=pct, **kwargs,) + return super().rank( + method=method, + ascending=ascending, + pct=pct, + **kwargs, + ) @doc( template_header, diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 
17c1f7d0b21d5..0391dc56c5791 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -84,13 +84,15 @@ def test_constructor_with_timedelta_window(window): # GH 15440 n = 10 df = DataFrame( - {"value": np.arange(n)}, index=date_range("2015-12-24", periods=n, freq="D"), + {"value": np.arange(n)}, + index=date_range("2015-12-24", periods=n, freq="D"), ) expected_data = np.append([0.0, 1.0], np.arange(3.0, 27.0, 3)) result = df.rolling(window=window).sum() expected = DataFrame( - {"value": expected_data}, index=date_range("2015-12-24", periods=n, freq="D"), + {"value": expected_data}, + index=date_range("2015-12-24", periods=n, freq="D"), ) tm.assert_frame_equal(result, expected) expected = df.rolling("3D").sum() @@ -102,7 +104,8 @@ def test_constructor_timedelta_window_and_minperiods(window, raw): # GH 15305 n = 10 df = DataFrame( - {"value": np.arange(n)}, index=date_range("2017-08-08", periods=n, freq="D"), + {"value": np.arange(n)}, + index=date_range("2017-08-08", periods=n, freq="D"), ) expected = DataFrame( {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, @@ -133,9 +136,13 @@ def test_closed_fixed(closed, arithmetic_win_operators): df_fixed = DataFrame({"A": [0, 1, 2, 3, 4]}) df_time = DataFrame({"A": [0, 1, 2, 3, 4]}, index=date_range("2020", periods=5)) - result = getattr(df_fixed.rolling(2, closed=closed, min_periods=1), func_name,)() + result = getattr( + df_fixed.rolling(2, closed=closed, min_periods=1), + func_name, + )() expected = getattr( - df_time.rolling("2D", closed=closed, min_periods=1), func_name, + df_time.rolling("2D", closed=closed, min_periods=1), + func_name, )().reset_index(drop=True) tm.assert_frame_equal(result, expected) @@ -206,7 +213,8 @@ def test_datetimelike_centered_selections( kwargs = {} result = getattr( - df_time.rolling("2D", closed=closed, min_periods=1, center=True), func_name, + df_time.rolling("2D", closed=closed, min_periods=1, center=True), + func_name, )(**kwargs) 
tm.assert_frame_equal(result, expected, check_dtype=False) @@ -337,7 +345,8 @@ def test_closed_one_entry(func): def test_closed_one_entry_groupby(func): # GH24718 ser = DataFrame( - data={"A": [1, 1, 2], "B": [3, 2, 1]}, index=date_range("2000", periods=3), + data={"A": [1, 1, 2], "B": [3, 2, 1]}, + index=date_range("2000", periods=3), ) result = getattr( ser.groupby("A", sort=False)["B"].rolling("10D", closed="left"), func @@ -364,7 +373,8 @@ def test_closed_one_entry_groupby(func): def test_closed_min_max_datetime(input_dtype, func, closed, expected): # see gh-21704 ser = Series( - data=np.arange(10).astype(input_dtype), index=date_range("2000", periods=10), + data=np.arange(10).astype(input_dtype), + index=date_range("2000", periods=10), ) result = getattr(ser.rolling("3D", closed=closed), func)() @@ -839,8 +849,18 @@ def test_iter_rolling_on_dataframe_unordered(): 3, 1, ), - (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 1,), - (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], 2, 2,), + ( + Series([1, 2, 3]), + [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], + 2, + 1, + ), + ( + Series([1, 2, 3]), + [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])], + 2, + 2, + ), (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 0), (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 1), (Series([1, 2]), [([1], [0]), ([1, 2], [0, 1])], 2, 0), @@ -1012,7 +1032,18 @@ def test_rolling_numerical_too_large_numbers(): ds[2] = -9e33 result = ds.rolling(5).mean() expected = Series( - [np.nan, np.nan, np.nan, np.nan, -1.8e33, -1.8e33, -1.8e33, 5.0, 6.0, 7.0,], + [ + np.nan, + np.nan, + np.nan, + np.nan, + -1.8e33, + -1.8e33, + -1.8e33, + 5.0, + 6.0, + 7.0, + ], index=dates, ) tm.assert_series_equal(result, expected) @@ -1028,7 +1059,8 @@ def test_rolling_mixed_dtypes_axis_1(func, value): df["c"] = 1.0 result = getattr(df.rolling(window=2, min_periods=1, axis=1), func)() expected = DataFrame( - {"a": [1.0, 1.0], "b": 
[value, value], "c": [value, value]}, index=[1, 2], + {"a": [1.0, 1.0], "b": [value, value], "c": [value, value]}, + index=[1, 2], ) tm.assert_frame_equal(result, expected) @@ -1054,7 +1086,8 @@ def test_rolling_axis_one_with_nan(): @pytest.mark.parametrize( - "value", ["test", to_datetime("2019-12-31"), to_timedelta("1 days 06:05:01.00003")], + "value", + ["test", to_datetime("2019-12-31"), to_timedelta("1 days 06:05:01.00003")], ) def test_rolling_axis_1_non_numeric_dtypes(value): # GH: 20649 @@ -1244,7 +1277,11 @@ def test_rolling_decreasing_indices_centered(window, closed, expected, frame_or_ @pytest.mark.parametrize( - "window,expected", [("1ns", [1.0, 1.0, 1.0, 1.0]), ("3ns", [2.0, 3.0, 3.0, 2.0]),], + "window,expected", + [ + ("1ns", [1.0, 1.0, 1.0, 1.0]), + ("3ns", [2.0, 3.0, 3.0, 2.0]), + ], ) def test_rolling_center_nanosecond_resolution( window, closed, expected, frame_or_series @@ -1272,8 +1309,14 @@ def test_rolling_center_nanosecond_resolution( 318.0, ], ), - ("mean", [float("nan"), 7.5, float("nan"), 21.5, 6.0, 9.166667, 13.0, 17.5],), - ("sum", [float("nan"), 30.0, float("nan"), 86.0, 30.0, 55.0, 91.0, 140.0],), + ( + "mean", + [float("nan"), 7.5, float("nan"), 21.5, 6.0, 9.166667, 13.0, 17.5], + ), + ( + "sum", + [float("nan"), 30.0, float("nan"), 86.0, 30.0, 55.0, 91.0, 140.0], + ), ( "skew", [ @@ -1337,7 +1380,10 @@ def get_window_bounds(self, num_values, min_periods, center, closed): @pytest.mark.parametrize( ("index", "window"), - [([0, 1, 2, 3, 4], 2), (date_range("2001-01-01", freq="D", periods=5), "2D"),], + [ + ([0, 1, 2, 3, 4], 2), + (date_range("2001-01-01", freq="D", periods=5), "2D"), + ], ) def test_rolling_corr_timedelta_index(index, window): # GH: 31286 @@ -1439,7 +1485,13 @@ def test_rolling_var_floating_artifact_precision(): def test_rolling_std_small_values(): # GH 37051 - s = Series([0.00000054, 0.00000053, 0.00000054,]) + s = Series( + [ + 0.00000054, + 0.00000053, + 0.00000054, + ] + ) result = s.rolling(2).std() expected = 
Series([np.nan, 7.071068e-9, 7.071068e-9]) tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15) @@ -1483,7 +1535,10 @@ def test_rolling_mean_all_nan_window_floating_artifacts(start, exp_values): 0.005, 0.102500, ] - expected = DataFrame(values, index=list(range(start, len(values) + start)),) + expected = DataFrame( + values, + index=list(range(start, len(values) + start)), + ) result = df.iloc[start:].rolling(5, min_periods=0).mean() tm.assert_frame_equal(result, expected) @@ -1508,7 +1563,8 @@ def test_rolling_float_dtype(float_numpy_dtype): # GH#42452 df = DataFrame({"A": range(5), "B": range(10, 15)}, dtype=float_numpy_dtype) expected = DataFrame( - {"A": [np.nan] * 5, "B": range(10, 20, 2)}, dtype=float_numpy_dtype, + {"A": [np.nan] * 5, "B": range(10, 20, 2)}, + dtype=float_numpy_dtype, ) result = df.rolling(2, axis=1).sum() tm.assert_frame_equal(result, expected, check_dtype=False) From d6dd13e072c8e6ed42c42e19268e2d2a544964e1 Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sun, 17 Oct 2021 17:06:58 +0100 Subject: [PATCH 03/15] amended tests --- pandas/tests/window/test_timeseries_window.py | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 7cd319480083b..9a28b53e89add 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -648,6 +648,9 @@ def test_groupby_monotonic(self): # GH 15130 # we don't need to validate monotonicity when grouping + #GH 43909 we should raise an error here to match behaviour of non-groupby rolling. 
+ + data = [ ["David", "1/1/2015", 100], ["David", "1/5/2015", 500], @@ -664,17 +667,17 @@ def test_groupby_monotonic(self): df = DataFrame(data=data, columns=["name", "date", "amount"]) df["date"] = to_datetime(df["date"]) - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) - result = df.groupby("name").rolling("180D", on="date")["amount"].sum() - tm.assert_series_equal(result, expected) + + with pytest.raises(ValueError,match=r".* must be monotonic"): + df.groupby("name").rolling("180D", on="date") + - def test_non_monotonic(self): + def test_non_monotonic_raises(self): # GH 13966 (similar to #15130, closed by #15175) + #superceded by 43909 + + dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s") df = DataFrame( { @@ -684,15 +687,10 @@ def test_non_monotonic(self): } ) - result = df.groupby("A").rolling("4s", on="B").C.mean() - expected = ( - df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) - ) - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError,match=r".* must be monotonic"): + df.groupby("A").rolling("4s", on="B").C.mean() + - df2 = df.sort_values("B") - result = df2.groupby("A").rolling("4s", on="B").C.mean() - tm.assert_series_equal(result, expected) def test_rolling_cov_offset(self): # GH16058 From a5467ed64ce4e3b92d050ba11e10dade80102c45 Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sun, 17 Oct 2021 16:17:14 +0000 Subject: [PATCH 04/15] Fixes from pre-commit [automated commit] --- pandas/tests/window/test_timeseries_window.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 9a28b53e89add..6916fb2813222 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -648,8 +648,7 @@ def test_groupby_monotonic(self): # GH 15130 # we don't need to 
validate monotonicity when grouping - #GH 43909 we should raise an error here to match behaviour of non-groupby rolling. - + # GH 43909 we should raise an error here to match behaviour of non-groupby rolling. data = [ ["David", "1/1/2015", 100], @@ -667,17 +666,14 @@ def test_groupby_monotonic(self): df = DataFrame(data=data, columns=["name", "date", "amount"]) df["date"] = to_datetime(df["date"]) - - with pytest.raises(ValueError,match=r".* must be monotonic"): + with pytest.raises(ValueError, match=r".* must be monotonic"): df.groupby("name").rolling("180D", on="date") - def test_non_monotonic_raises(self): # GH 13966 (similar to #15130, closed by #15175) - #superceded by 43909 + # superceded by 43909 - dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s") df = DataFrame( { @@ -687,10 +683,8 @@ def test_non_monotonic_raises(self): } ) - with pytest.raises(ValueError,match=r".* must be monotonic"): + with pytest.raises(ValueError, match=r".* must be monotonic"): df.groupby("A").rolling("4s", on="B").C.mean() - - def test_rolling_cov_offset(self): # GH16058 From e234f9a4f0570ad1e4f0e4d5fa5c68de56bd2958 Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sun, 17 Oct 2021 19:00:03 +0100 Subject: [PATCH 05/15] commit to go through checks again --- pandas/tests/window/test_timeseries_window.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 9a28b53e89add..c75aa97aa5d55 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -667,7 +667,6 @@ def test_groupby_monotonic(self): df = DataFrame(data=data, columns=["name", "date", "amount"]) df["date"] = to_datetime(df["date"]) - with pytest.raises(ValueError,match=r".* must be monotonic"): df.groupby("name").rolling("180D", on="date") From a743ff2348a1b849d922546d9e4cb3835834095b Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sun, 17 Oct 2021 
18:20:27 +0000 Subject: [PATCH 06/15] Fixes from pre-commit [automated commit] --- pandas/tests/window/test_timeseries_window.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index a0837cf8a2678..6916fb2813222 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -666,7 +666,7 @@ def test_groupby_monotonic(self): df = DataFrame(data=data, columns=["name", "date", "amount"]) df["date"] = to_datetime(df["date"]) - with pytest.raises(ValueError,match=r".* must be monotonic"): + with pytest.raises(ValueError, match=r".* must be monotonic"): df.groupby("name").rolling("180D", on="date") def test_non_monotonic_raises(self): From 9f132a5820390f8194d2e9cf1587f73a43991f7e Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Mon, 18 Oct 2021 19:08:22 +0100 Subject: [PATCH 07/15] ammending as per comments --- pandas/tests/window/test_rolling.py | 2 +- pandas/tests/window/test_timeseries_window.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 0391dc56c5791..408231085228f 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1420,7 +1420,7 @@ def test_groupby_rolling_nan_included(): def test_groupby_rolling_non_monotonic(): # GH 43909 shuffled = [3, 0, 1, 2] - sec = 1_000_000_000 + sec = 1_000 df = DataFrame( [{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled] ) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index a0837cf8a2678..63e9706c147ec 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -683,6 +683,20 @@ def test_non_monotonic_raises(self): } ) + + result = df.sort_values("B").groupby("A").rolling("4s", 
on="B").C.mean() + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) + tm.assert_series_equal(result, expected) + + df2 = df.sort_values("B") + result = df2.groupby("A").rolling("4s", on="B").C.mean() + tm.assert_series_equal(result, expected) + + + + with pytest.raises(ValueError, match=r".* must be monotonic"): df.groupby("A").rolling("4s", on="B").C.mean() From a4c22d13be4df6a2d9330278bbdcd55f76a00f0a Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Mon, 18 Oct 2021 18:18:14 +0000 Subject: [PATCH 08/15] Fixes from pre-commit [automated commit] --- pandas/tests/window/test_timeseries_window.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 9feb3c7ad5124..6c3e37494ca58 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -683,7 +683,6 @@ def test_non_monotonic_raises(self): } ) - result = df.sort_values("B").groupby("A").rolling("4s", on="B").C.mean() expected = ( df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) @@ -694,9 +693,6 @@ def test_non_monotonic_raises(self): result = df2.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) - - - with pytest.raises(ValueError, match=r".* must be monotonic"): df.groupby("A").rolling("4s", on="B").C.mean() From 195b254af609a1aea369ac5eb39910f5db0c9fb5 Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sun, 24 Oct 2021 16:41:06 +0100 Subject: [PATCH 09/15] readded succesful groupby on rolling test --- pandas/tests/window/test_timeseries_window.py | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 9feb3c7ad5124..96c724a0f9764 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ 
b/pandas/tests/window/test_timeseries_window.py @@ -665,9 +665,18 @@ def test_groupby_monotonic(self): df = DataFrame(data=data, columns=["name", "date", "amount"]) df["date"] = to_datetime(df["date"]) + df = df.sort_values("date") - with pytest.raises(ValueError, match=r".* must be monotonic"): - df.groupby("name").rolling("180D", on="date") + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) + result = df.groupby("name").rolling("180D", on="date")["amount"].sum() + tm.assert_series_equal(result, expected) + + + def test_non_monotonic_raises(self): # GH 13966 (similar to #15130, closed by #15175) @@ -683,20 +692,6 @@ def test_non_monotonic_raises(self): } ) - - result = df.sort_values("B").groupby("A").rolling("4s", on="B").C.mean() - expected = ( - df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) - ) - tm.assert_series_equal(result, expected) - - df2 = df.sort_values("B") - result = df2.groupby("A").rolling("4s", on="B").C.mean() - tm.assert_series_equal(result, expected) - - - - with pytest.raises(ValueError, match=r".* must be monotonic"): df.groupby("A").rolling("4s", on="B").C.mean() From cf41892cd9dc03789d576b37f8bce0fcad2e688b Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sun, 24 Oct 2021 16:34:06 +0000 Subject: [PATCH 10/15] Fixes from pre-commit [automated commit] --- pandas/tests/window/test_timeseries_window.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 96c724a0f9764..b265e9be9e8a6 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -675,9 +675,6 @@ def test_groupby_monotonic(self): result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) - - - def test_non_monotonic_raises(self): # GH 13966 (similar to #15130, closed by #15175) From 
ebae6552055b6f0269fc6b445a4e6116ac50a4ee Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sun, 31 Oct 2021 11:10:58 +0000 Subject: [PATCH 11/15] added back successful gby test --- pandas/tests/window/test_timeseries_window.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 96c724a0f9764..8ef008f9d9cc2 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -691,9 +691,17 @@ def test_non_monotonic_raises(self): "C": np.arange(40), } ) - + + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) with pytest.raises(ValueError, match=r".* must be monotonic"): - df.groupby("A").rolling("4s", on="B").C.mean() + df.groupby("A").rolling("4s", on="B").C.mean() #should raise for non-monotonic t series + + df2 = df.sort_values("B") + result = df2.groupby("A").rolling("4s", on="B").C.mean() + tm.assert_series_equal(result, expected) + def test_rolling_cov_offset(self): # GH16058 From 08ee1ec68897043834b51f0a8a17da0565cbf160 Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sun, 31 Oct 2021 11:24:12 +0000 Subject: [PATCH 12/15] Fixes from pre-commit [automated commit] --- pandas/tests/window/test_timeseries_window.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 8a5eac80b4961..84a780c6f000e 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -688,18 +688,19 @@ def test_non_monotonic_raises(self): "C": np.arange(40), } ) - + expected = ( df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) ) with pytest.raises(ValueError, match=r".* must be monotonic"): - df.groupby("A").rolling("4s", on="B").C.mean() #should raise for non-monotonic t series 
+ df.groupby("A").rolling( + "4s", on="B" + ).C.mean() # should raise for non-monotonic t series df2 = df.sort_values("B") result = df2.groupby("A").rolling("4s", on="B").C.mean() tm.assert_series_equal(result, expected) - def test_rolling_cov_offset(self): # GH16058 From 95975c3e4cafe7bcd8d962c6ecd9733571aa3a50 Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sat, 11 Dec 2021 20:40:15 +0000 Subject: [PATCH 13/15] code style --- pandas/tests/window/test_rolling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 9790d7cb195e8..814bd6b998182 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1421,6 +1421,7 @@ def test_groupby_rolling_nan_included(): def test_groupby_rolling_non_monotonic(): # GH 43909 + shuffled = [3, 0, 1, 2] sec = 1_000 df = DataFrame( From 52ea34b7b32e2a1d1acef8223aaa4c6fb44bda31 Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Sun, 12 Dec 2021 16:44:28 +0000 Subject: [PATCH 14/15] DOC: add whatsnew note for issue 43909 --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 7cf8c07683514..ef0be82fc0a71 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -788,6 +788,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`3944`) - Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` when using a :class:`pandas.api.indexers.BaseIndexer` subclass that returned unequal start and end arrays would segfault instead of raising a ``ValueError`` (:issue:`44470`) +- Bug in :meth:`GroupBy.rolling` failing to raise a ``ValueError`` when passed non-monotonic data (:issue:`43909`) 
Reshaping ^^^^^^^^^ From a3357a50f3a9ea1605037218265e94855eed4570 Mon Sep 17 00:00:00 2001 From: jamesholcombe Date: Wed, 22 Dec 2021 13:39:34 +0000 Subject: [PATCH 15/15] fix typo --- pandas/tests/window/test_timeseries_window.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 84a780c6f000e..f2cf7bd47e15b 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -648,7 +648,8 @@ def test_groupby_monotonic(self): # GH 15130 # we don't need to validate monotonicity when grouping - # GH 43909 we should raise an error here to match behaviour of non-groupby rolling. + # GH 43909 we should raise an error here to match + # behaviour of non-groupby rolling. data = [ ["David", "1/1/2015", 100], @@ -678,7 +679,7 @@ def test_groupby_monotonic(self): def test_non_monotonic_raises(self): # GH 13966 (similar to #15130, closed by #15175) - # superceded by 43909 + # superseded by 43909 dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s") df = DataFrame(