|
17 | 17 | from .nputils import nanfirst, nanlast
|
18 | 18 | from .pycompat import dask_array_type
|
19 | 19 |
|
20 |
| -try: |
21 |
| - import bottleneck as bn |
22 |
| - has_bottleneck = True |
23 |
| -except ImportError: |
24 |
| - # use numpy methods instead |
25 |
| - bn = np |
26 |
| - has_bottleneck = False |
27 |
| - |
28 | 20 | try:
|
29 | 21 | import dask.array as dask_array
|
30 | 22 | from . import dask_array_compat
|
@@ -175,7 +167,7 @@ def array_notnull_equiv(arr1, arr2):
|
175 | 167 | def count(data, axis=None):
|
176 | 168 | """Count the number of non-NA in this array along the given axis or axes
|
177 | 169 | """
|
178 |
| - return sum(~isnull(data), axis=axis) |
| 170 | + return np.sum(~isnull(data), axis=axis) |
179 | 171 |
|
180 | 172 |
|
181 | 173 | def where(condition, x, y):
|
@@ -213,159 +205,69 @@ def _ignore_warnings_if(condition):
|
213 | 205 | yield
|
214 | 206 |
|
215 | 207 |
|
216 |
| -def _nansum_object(value, axis=None, **kwargs): |
217 |
| - """ In house nansum for object array """ |
218 |
| - value = fillna(value, 0) |
219 |
| - return _dask_or_eager_func('sum')(value, axis=axis, **kwargs) |
220 |
| - |
221 |
| - |
222 |
| -def _nan_minmax_object(func, get_fill_value, value, axis=None, **kwargs): |
223 |
| - """ In house nanmin and nanmax for object array """ |
224 |
| - fill_value = get_fill_value(value.dtype) |
225 |
| - valid_count = count(value, axis=axis) |
226 |
| - filled_value = fillna(value, fill_value) |
227 |
| - data = _dask_or_eager_func(func)(filled_value, axis=axis, **kwargs) |
228 |
| - if not hasattr(data, 'dtype'): # scalar case |
229 |
| - data = dtypes.fill_value(value.dtype) if valid_count == 0 else data |
230 |
| - return np.array(data, dtype=value.dtype) |
231 |
| - return where_method(data, valid_count != 0) |
232 |
| - |
233 |
| - |
234 |
| -def _nan_argminmax_object(func, get_fill_value, value, axis=None, **kwargs): |
235 |
| - """ In house nanargmin, nanargmax for object arrays. Always return integer |
236 |
| - type """ |
237 |
| - fill_value = get_fill_value(value.dtype) |
238 |
| - valid_count = count(value, axis=axis) |
239 |
| - value = fillna(value, fill_value) |
240 |
| - data = _dask_or_eager_func(func)(value, axis=axis, **kwargs) |
241 |
| - # dask seems return non-integer type |
242 |
| - if isinstance(value, dask_array_type): |
243 |
| - data = data.astype(int) |
244 |
| - |
245 |
| - if (valid_count == 0).any(): |
246 |
| - raise ValueError('All-NaN slice encountered') |
247 |
| - |
248 |
| - return np.array(data, dtype=int) |
249 |
| - |
250 |
| - |
251 |
| -def _nanmean_ddof_object(ddof, value, axis=None, **kwargs): |
252 |
| - """ In house nanmean. ddof argument will be used in _nanvar method """ |
253 |
| - valid_count = count(value, axis=axis) |
254 |
| - value = fillna(value, 0) |
255 |
| - # As dtype inference is impossible for object dtype, we assume float |
256 |
| - # https://github.com/dask/dask/issues/3162 |
257 |
| - dtype = kwargs.pop('dtype', None) |
258 |
| - if dtype is None and value.dtype.kind == 'O': |
259 |
| - dtype = value.dtype if value.dtype.kind in ['cf'] else float |
260 |
| - |
261 |
| - data = _dask_or_eager_func('sum')(value, axis=axis, dtype=dtype, **kwargs) |
262 |
| - data = data / (valid_count - ddof) |
263 |
| - return where_method(data, valid_count != 0) |
264 |
| - |
265 |
| - |
266 |
| -def _nanvar_object(value, axis=None, **kwargs): |
267 |
| - ddof = kwargs.pop('ddof', 0) |
268 |
| - kwargs_mean = kwargs.copy() |
269 |
| - kwargs_mean.pop('keepdims', None) |
270 |
| - value_mean = _nanmean_ddof_object(ddof=0, value=value, axis=axis, |
271 |
| - keepdims=True, **kwargs_mean) |
272 |
| - squared = (value.astype(value_mean.dtype) - value_mean)**2 |
273 |
| - return _nanmean_ddof_object(ddof, squared, axis=axis, **kwargs) |
274 |
| - |
275 |
| - |
276 |
| -_nan_object_funcs = { |
277 |
| - 'sum': _nansum_object, |
278 |
| - 'min': partial(_nan_minmax_object, 'min', dtypes.get_pos_infinity), |
279 |
| - 'max': partial(_nan_minmax_object, 'max', dtypes.get_neg_infinity), |
280 |
| - 'argmin': partial(_nan_argminmax_object, 'argmin', |
281 |
| - dtypes.get_pos_infinity), |
282 |
| - 'argmax': partial(_nan_argminmax_object, 'argmax', |
283 |
| - dtypes.get_neg_infinity), |
284 |
| - 'mean': partial(_nanmean_ddof_object, 0), |
285 |
| - 'var': _nanvar_object, |
286 |
| -} |
287 |
| - |
288 |
| - |
289 |
| -def _create_nan_agg_method(name, numeric_only=False, np_compat=False, |
290 |
| - no_bottleneck=False, coerce_strings=False): |
| 208 | +def _create_nan_agg_method(name, coerce_strings=False): |
| 209 | + from . import nanops |
| 210 | + |
291 | 211 | def f(values, axis=None, skipna=None, **kwargs):
|
292 | 212 | if kwargs.pop('out', None) is not None:
|
293 | 213 | raise TypeError('`out` is not valid for {}'.format(name))
|
294 | 214 |
|
295 |
| - # If dtype is supplied, we use numpy's method. |
296 |
| - dtype = kwargs.get('dtype', None) |
297 | 215 | values = asarray(values)
|
298 | 216 |
|
299 |
| - # dask requires dtype argument for object dtype |
300 |
| - if (values.dtype == 'object' and name in ['sum', ]): |
301 |
| - kwargs['dtype'] = values.dtype if dtype is None else dtype |
302 |
| - |
303 | 217 | if coerce_strings and values.dtype.kind in 'SU':
|
304 | 218 | values = values.astype(object)
|
305 | 219 |
|
| 220 | + func = None |
306 | 221 | if skipna or (skipna is None and values.dtype.kind in 'cfO'):
|
307 |
| - if values.dtype.kind not in ['u', 'i', 'f', 'c']: |
308 |
| - func = _nan_object_funcs.get(name, None) |
309 |
| - using_numpy_nan_func = True |
310 |
| - if func is None or values.dtype.kind not in 'Ob': |
311 |
| - raise NotImplementedError( |
312 |
| - 'skipna=True not yet implemented for %s with dtype %s' |
313 |
| - % (name, values.dtype)) |
314 |
| - else: |
315 |
| - nanname = 'nan' + name |
316 |
| - if (isinstance(axis, tuple) or not values.dtype.isnative or |
317 |
| - no_bottleneck or (dtype is not None and |
318 |
| - np.dtype(dtype) != values.dtype)): |
319 |
| - # bottleneck can't handle multiple axis arguments or |
320 |
| - # non-native endianness |
321 |
| - if np_compat: |
322 |
| - eager_module = npcompat |
323 |
| - else: |
324 |
| - eager_module = np |
325 |
| - else: |
326 |
| - kwargs.pop('dtype', None) |
327 |
| - eager_module = bn |
328 |
| - func = _dask_or_eager_func(nanname, eager_module) |
329 |
| - using_numpy_nan_func = (eager_module is np or |
330 |
| - eager_module is npcompat) |
| 222 | + nanname = 'nan' + name |
| 223 | + func = getattr(nanops, nanname) |
331 | 224 | else:
|
332 | 225 | func = _dask_or_eager_func(name)
|
333 |
| - using_numpy_nan_func = False |
334 |
| - with _ignore_warnings_if(using_numpy_nan_func): |
335 |
| - try: |
336 |
| - return func(values, axis=axis, **kwargs) |
337 |
| - except AttributeError: |
338 |
| - if isinstance(values, dask_array_type): |
339 |
| - try: # dask/dask#3133 dask sometimes needs dtype argument |
340 |
| - return func(values, axis=axis, dtype=values.dtype, |
341 |
| - **kwargs) |
342 |
| - except AttributeError: |
343 |
| - msg = '%s is not yet implemented on dask arrays' % name |
344 |
| - else: |
345 |
| - assert using_numpy_nan_func |
346 |
| - msg = ('%s is not available with skipna=False with the ' |
347 |
| - 'installed version of numpy; upgrade to numpy 1.12 ' |
348 |
| - 'or newer to use skipna=True or skipna=None' % name) |
349 |
| - raise NotImplementedError(msg) |
350 |
| - f.numeric_only = numeric_only |
| 226 | + |
| 227 | + try: |
| 228 | + return func(values, axis=axis, **kwargs) |
| 229 | + except AttributeError: |
| 230 | + if isinstance(values, dask_array_type): |
| 231 | + try: # dask/dask#3133 dask sometimes needs dtype argument |
| 232 | + # if func does not accept dtype, then raises TypeError |
| 233 | + return func(values, axis=axis, dtype=values.dtype, |
| 234 | + **kwargs) |
| 235 | + except (AttributeError, TypeError): |
| 236 | + msg = '%s is not yet implemented on dask arrays' % name |
| 237 | + else: |
| 238 | + msg = ('%s is not available with skipna=False with the ' |
| 239 | + 'installed version of numpy; upgrade to numpy 1.12 ' |
| 240 | + 'or newer to use skipna=True or skipna=None' % name) |
| 241 | + raise NotImplementedError(msg) |
| 242 | + |
351 | 243 | f.__name__ = name
|
352 | 244 | return f
|
353 | 245 |
|
354 | 246 |
|
| 247 | +# Attributes `numeric_only` and `available_min_count` are used for docs. |
| 248 | +# See ops.inject_reduce_methods |
355 | 249 | argmax = _create_nan_agg_method('argmax', coerce_strings=True)
|
356 | 250 | argmin = _create_nan_agg_method('argmin', coerce_strings=True)
|
357 | 251 | max = _create_nan_agg_method('max', coerce_strings=True)
|
358 | 252 | min = _create_nan_agg_method('min', coerce_strings=True)
|
359 |
| -sum = _create_nan_agg_method('sum', numeric_only=True) |
360 |
| -mean = _create_nan_agg_method('mean', numeric_only=True) |
361 |
| -std = _create_nan_agg_method('std', numeric_only=True) |
362 |
| -var = _create_nan_agg_method('var', numeric_only=True) |
363 |
| -median = _create_nan_agg_method('median', numeric_only=True) |
364 |
| -prod = _create_nan_agg_method('prod', numeric_only=True, no_bottleneck=True) |
365 |
| -cumprod_1d = _create_nan_agg_method( |
366 |
| - 'cumprod', numeric_only=True, no_bottleneck=True) |
367 |
| -cumsum_1d = _create_nan_agg_method( |
368 |
| - 'cumsum', numeric_only=True, no_bottleneck=True) |
| 253 | +sum = _create_nan_agg_method('sum') |
| 254 | +sum.numeric_only = True |
| 255 | +sum.available_min_count = True |
| 256 | +mean = _create_nan_agg_method('mean') |
| 257 | +mean.numeric_only = True |
| 258 | +std = _create_nan_agg_method('std') |
| 259 | +std.numeric_only = True |
| 260 | +var = _create_nan_agg_method('var') |
| 261 | +var.numeric_only = True |
| 262 | +median = _create_nan_agg_method('median') |
| 263 | +median.numeric_only = True |
| 264 | +prod = _create_nan_agg_method('prod') |
| 265 | +prod.numeric_only = True |
| 266 | +sum.available_min_count = True |
| 267 | +cumprod_1d = _create_nan_agg_method('cumprod') |
| 268 | +cumprod_1d.numeric_only = True |
| 269 | +cumsum_1d = _create_nan_agg_method('cumsum') |
| 270 | +cumsum_1d.numeric_only = True |
369 | 271 |
|
370 | 272 |
|
371 | 273 | def _nd_cum_func(cum_func, array, axis, **kwargs):
|
|
0 commit comments