From d421a7a6c347f26cb8439fdc59e85a2316fd550f Mon Sep 17 00:00:00 2001 From: Samuel Scherrer Date: Tue, 21 Apr 2020 09:30:20 +0200 Subject: [PATCH 1/8] BUG: Fix memory issues in rolling.min/max This fixes at least the reproducible part of #30726, however, I am not totally sure what is going on here. Tests have shown that there are two solutions that avoid growing memory usage: - pass memoryviews (float64_t[:]) instead of ndarray[float64_t] - remove starti and endi as arguments to _roll_min_max_fixed This commit implements both. --- pandas/_libs/window/aggregations.pyx | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 673820fd8464a..afa0539014041 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -971,8 +971,8 @@ cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, return result -def roll_max_fixed(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win): +def roll_max_fixed(float64_t[:] values, int64_t[:] start, + int64_t[:] end, int64_t minp, int64_t win): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. @@ -988,7 +988,7 @@ def roll_max_fixed(ndarray[float64_t] values, ndarray[int64_t] start, make the interval closed on the right, left, both or neither endpoints """ - return _roll_min_max_fixed(values, start, end, minp, win, is_max=1) + return _roll_min_max_fixed(values, minp, win, is_max=1) def roll_max_variable(ndarray[float64_t] values, ndarray[int64_t] start, @@ -1011,8 +1011,8 @@ def roll_max_variable(ndarray[float64_t] values, ndarray[int64_t] start, return _roll_min_max_variable(values, start, end, minp, is_max=1) -def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win): +def roll_min_fixed(float64_t[:] values, int64_t[:] start, + int64_t[:] end, int64_t minp, int64_t win): """ Moving min of 1d array of any numeric type along axis=0 ignoring NaNs. @@ -1025,7 +1025,7 @@ def roll_min_fixed(ndarray[float64_t] values, ndarray[int64_t] start, index : ndarray, optional index for window computation """ - return _roll_min_max_fixed(values, start, end, minp, win, is_max=0) + return _roll_min_max_fixed(values, minp, win, is_max=0) def roll_min_variable(ndarray[float64_t] values, ndarray[int64_t] start, @@ -1112,9 +1112,7 @@ cdef _roll_min_max_variable(ndarray[numeric] values, return output -cdef _roll_min_max_fixed(ndarray[numeric] values, - ndarray[int64_t] starti, - ndarray[int64_t] endi, +cdef _roll_min_max_fixed(numeric[:] values, int64_t minp, int64_t win, bint is_max): From 4709208e062d8931cdbbcba373e277bdfd019491 Mon Sep 17 00:00:00 2001 From: Samuel Scherrer Date: Thu, 23 Apr 2020 16:47:38 +0200 Subject: [PATCH 2/8] TST: Improved fixed window rolling min/max benchmark --- asv_bench/benchmarks/rolling.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index f85dc83ab8605..dc9752fbf5888 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -150,18 +150,16 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation): self.roll.quantile(percentile, interpolation=interpolation) -class PeakMemFixed: +class PeakMemFixedWindowMinMax: def setup(self): - N = 10 + N = int(1e5) arr = 100 * np.random.random(N) - self.roll = pd.Series(arr).rolling(10) + self.roll = pd.Series(arr).rolling(1000) def peakmem_fixed(self): - # GH 25926 - # This is to detect memory leaks in rolling operations. - # To save time this is only ran on one method. - # 6000 iterations is enough for most types of leaks to be detected - for x in range(6000): + # GH 33693 + # increased size of array so we see a clearer picture + for x in range(1000): self.roll.max() From 959a71141742af90ff822ae04022eb4e1fd33fa6 Mon Sep 17 00:00:00 2001 From: Samuel Scherrer Date: Thu, 23 Apr 2020 16:48:35 +0200 Subject: [PATCH 3/8] DOC: Added whatsnew note --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 07849702c646d..3eeec16c44cf3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -608,6 +608,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) +- Bug in :meth:`Rolling.min` and :meth:`Rolling.max` when using a fixed window (:issue:`30726`) Reshaping ^^^^^^^^^ From 814b7e146c2630416fac1cc6d9c855970595532f Mon Sep 17 00:00:00 2001 From: Samuel Scherrer Date: Sun, 26 Apr 2020 11:34:08 +0200 Subject: [PATCH 4/8] DOC/TST: Parametrized asv benchmark and improved whatsnew note --- asv_bench/benchmarks/rolling.py | 12 +++++++++--- doc/source/whatsnew/v1.1.0.rst | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index dc9752fbf5888..de08f1bf46af4 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -151,16 +151,22 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation): class PeakMemFixedWindowMinMax: - def setup(self): + + params = [ + pd.core.window.rolling.Rolling.min, + pd.core.window.rolling.Rolling.max, + ] + + def setup(self, f): N = int(1e5) arr = 100 * np.random.random(N) self.roll = pd.Series(arr).rolling(1000) - def peakmem_fixed(self): + def peakmem_fixed(self, f): # GH 33693 # increased size of array so we see a clearer picture for x in range(1000): - self.roll.max() + f(self.roll) class ForwardWindowMethods: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 3eeec16c44cf3..43bf0e82490e2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -608,7 +608,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) -- Bug in :meth:`Rolling.min` and :meth:`Rolling.max` when using a fixed window (:issue:`30726`) +- Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`) Reshaping ^^^^^^^^^ From e77e02364ad3ad1206b59f27581a0e7c6ef73f9d Mon Sep 17 00:00:00 2001 From: Samuel Scherrer Date: Sun, 26 Apr 2020 11:37:09 +0200 Subject: [PATCH 5/8] CLN: removed blank line --- asv_bench/benchmarks/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index de08f1bf46af4..e16f0965d5d12 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -151,7 +151,7 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation): class PeakMemFixedWindowMinMax: - + params = [ pd.core.window.rolling.Rolling.min, pd.core.window.rolling.Rolling.max, From 3c5cda2cc52f1968799eb6c7953bea18ed51b2eb Mon Sep 17 00:00:00 2001 From: Samuel Scherrer Date: Sun, 26 Apr 2020 15:12:38 +0200 Subject: [PATCH 6/8] TST: Faster benchmark for rolling min/max peakmem --- asv_bench/benchmarks/rolling.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index e16f0965d5d12..f1d8c0f268c31 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -158,14 +158,14 @@ class PeakMemFixedWindowMinMax: ] def setup(self, f): - N = int(1e5) - arr = 100 * np.random.random(N) - self.roll = pd.Series(arr).rolling(1000) + N = int(1e6) + arr = np.random.random(N) + self.roll = pd.Series(arr).rolling(2) def peakmem_fixed(self, f): # GH 33693 # increased size of array so we see a clearer picture - for x in range(1000): + for x in range(5): f(self.roll) From 2a5348c2d90d64f3b2fba74297b7473745f34fa9 Mon Sep 17 00:00:00 2001 From: Samuel Scherrer Date: Mon, 27 Apr 2020 10:15:03 +0200 Subject: [PATCH 7/8] TST: Benchmarks parametrized with string + getattr --- asv_bench/benchmarks/rolling.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index f1d8c0f268c31..8be353461130e 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -152,21 +152,18 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation): class PeakMemFixedWindowMinMax: - params = [ - pd.core.window.rolling.Rolling.min, - pd.core.window.rolling.Rolling.max, - ] + params = ['min', 'max'] def setup(self, f): N = int(1e6) arr = np.random.random(N) self.roll = pd.Series(arr).rolling(2) - def peakmem_fixed(self, f): + def peakmem_fixed(self, operation): # GH 33693 # increased size of array so we see a clearer picture for x in range(5): - f(self.roll) + getattr(self.roll, operation)() class ForwardWindowMethods: From 9aacc98f4feb760ae90e704fa92eba7dda805089 Mon Sep 17 00:00:00 2001 From: Samuel Scherrer Date: Tue, 28 Apr 2020 11:35:30 +0200 Subject: [PATCH 8/8] CLN: blacked benchmark --- asv_bench/benchmarks/rolling.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 8be353461130e..e3c9c7ccdc51c 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -152,16 +152,14 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation): class PeakMemFixedWindowMinMax: - params = ['min', 'max'] + params = ["min", "max"] - def setup(self, f): + def setup(self, operation): N = int(1e6) arr = np.random.random(N) self.roll = pd.Series(arr).rolling(2) def peakmem_fixed(self, operation): - # GH 33693 - # increased size of array so we see a clearer picture for x in range(5): getattr(self.roll, operation)()