Skip to content

Commit a78e0f6

Browse files
authored
Fix rolling window operations with dask when bottleneck is installed (pydata#3040)
xref GH2940, GH2942. Previously, these operations could silently return incorrect results (dask 2.0), or use unbounded amounts of memory (older versions of dask). This requires a fairly large refactoring, because deciding when to use bottleneck now needs to be done at runtime rather than at import-time. These methods are now constructed as methods rather than being injected afterwards into the class, which should also be a much more standard and understandable design.
1 parent 3b622b0 commit a78e0f6

File tree

4 files changed

+130
-176
lines changed

4 files changed

+130
-176
lines changed

doc/whats-new.rst

+3
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ Enhancements
9797
Bug fixes
9898
~~~~~~~~~
9999

100+
- Rolling operations on xarray objects containing dask arrays could silently
101+
compute the incorrect result or use large amounts of memory (:issue:`2940`).
102+
By `Stephan Hoyer <https://github.com/shoyer>`_.
100103
- Don't set encoding attributes on bounds variables when writing to netCDF.
101104
(:issue:`2921`)
102105
By `Deepak Cherian <https://github.com/dcherian>`_.

xarray/core/ops.py

-75
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,6 @@
3838
NAN_REDUCE_METHODS = ['argmax', 'argmin', 'max', 'min', 'mean', 'prod', 'sum',
3939
'std', 'var', 'median']
4040
NAN_CUM_METHODS = ['cumsum', 'cumprod']
41-
BOTTLENECK_ROLLING_METHODS = {'move_sum': 'sum', 'move_mean': 'mean',
42-
'move_std': 'std', 'move_min': 'min',
43-
'move_max': 'max', 'move_var': 'var',
44-
'move_argmin': 'argmin', 'move_argmax': 'argmax',
45-
'move_median': 'median'}
4641
# TODO: wrap take, dot, sort
4742

4843

@@ -103,20 +98,6 @@
10398
If fewer than min_count non-NA values are present the result will
10499
be NA. New in version 0.10.8: Added with the default being None."""
105100

106-
_ROLLING_REDUCE_DOCSTRING_TEMPLATE = """\
107-
Reduce this {da_or_ds}'s data windows by applying `{name}` along its dimension.
108-
109-
Parameters
110-
----------
111-
**kwargs : dict
112-
Additional keyword arguments passed on to `{name}`.
113-
114-
Returns
115-
-------
116-
reduced : {da_or_ds}
117-
New {da_or_ds} object with `{name}` applied along its rolling dimnension.
118-
"""
119-
120101
_COARSEN_REDUCE_DOCSTRING_TEMPLATE = """\
121102
Coarsen this object by applying `{name}` along its dimensions.
122103
@@ -236,13 +217,6 @@ def func(self, *args, **kwargs):
236217
return func
237218

238219

239-
def rolling_count(rolling):
240-
241-
rolling_count = rolling._counts()
242-
enough_periods = rolling_count >= rolling._min_periods
243-
return rolling_count.where(enough_periods)
244-
245-
246220
def inject_reduce_methods(cls):
247221
methods = ([(name, getattr(duck_array_ops, 'array_%s' % name), False)
248222
for name in REDUCE_METHODS] +
@@ -340,55 +314,6 @@ def inject_all_ops_and_reduce_methods(cls, priority=50, array_only=True):
340314
inject_cum_methods(cls)
341315

342316

343-
def inject_bottleneck_rolling_methods(cls):
344-
# standard numpy reduce methods
345-
methods = [(name, getattr(duck_array_ops, name))
346-
for name in NAN_REDUCE_METHODS]
347-
for name, f in methods:
348-
func = cls._reduce_method(f)
349-
func.__name__ = name
350-
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
351-
name=func.__name__, da_or_ds='DataArray')
352-
setattr(cls, name, func)
353-
354-
# bottleneck doesn't offer rolling_count, so we construct it ourselves
355-
func = rolling_count
356-
func.__name__ = 'count'
357-
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
358-
name=func.__name__, da_or_ds='DataArray')
359-
setattr(cls, 'count', func)
360-
361-
# bottleneck rolling methods
362-
if not has_bottleneck:
363-
return
364-
365-
for bn_name, method_name in BOTTLENECK_ROLLING_METHODS.items():
366-
f = getattr(bn, bn_name)
367-
func = cls._bottleneck_reduce(f)
368-
func.__name__ = method_name
369-
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
370-
name=func.__name__, da_or_ds='DataArray')
371-
setattr(cls, method_name, func)
372-
373-
374-
def inject_datasetrolling_methods(cls):
375-
# standard numpy reduce methods
376-
methods = [(name, getattr(duck_array_ops, name))
377-
for name in NAN_REDUCE_METHODS]
378-
for name, f in methods:
379-
func = cls._reduce_method(f)
380-
func.__name__ = name
381-
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
382-
name=func.__name__, da_or_ds='Dataset')
383-
setattr(cls, name, func)
384-
# bottleneck doesn't offer rolling_count, so we construct it ourselves
385-
func = rolling_count
386-
func.__name__ = 'count'
387-
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
388-
name=func.__name__, da_or_ds='Dataset')
389-
setattr(cls, 'count', func)
390-
391-
392317
def inject_coarsen_methods(cls):
393318
# standard numpy reduce methods
394319
methods = [(name, getattr(duck_array_ops, name))

0 commit comments

Comments
 (0)