From 1b94cfb3e038c3b6ef0a6a496edc07f9cc693908 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 30 May 2013 23:13:26 -0400 Subject: [PATCH] API: disallow dates in reduction operations CLN: refactor into a class decorator DOC: add starter rls and whatsnew notes TST: add tests CLN: refactor bottleneck into class decorator BUG: fix median compatibility issues DOC: comment up some possibly obtuse looking code CLN: fix up comments ENH: add functools.wraps to preserve operation name in error message DOC: move to 11.1 DOC: minor doc note --- RELEASE.rst | 7 ++ doc/source/v0.11.1.txt | 13 ++++ pandas/core/nanops.py | 149 ++++++++++++++++++++++++------------ pandas/core/series.py | 18 ++--- pandas/tests/test_frame.py | 9 +++ pandas/tests/test_series.py | 10 ++- 6 files changed, 142 insertions(+), 64 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 3a347246be8dd..8da3b4760c303 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -97,6 +97,12 @@ pandas 0.11.1 in your calls. - Do not allow astypes on ``datetime64[ns]`` except to ``object``, and ``timedelta64[ns]`` to ``object/int`` (GH3425_) + - The behavior of ``datetime64`` dtypes has changed with respect to certain + so-called reduction operations (GH3726_). The following operations now + raise a ``TypeError`` when performed on a ``Series`` and return an *empty* + ``Series`` when performed on a ``DataFrame`` similar to performing these + operations on, for example, a ``DataFrame`` of ``slice`` objects: + - sum, prod, mean, std, var, skew, kurt, corr, and cov - Do not allow datetimelike/timedeltalike creation except with valid types (e.g. cannot pass ``datetime64[ms]``) (GH3423_) - Add ``squeeze`` keyword to ``groupby`` to allow reduction from @@ -294,6 +300,7 @@ pandas 0.11.1 .. _GH3748: https://github.com/pydata/pandas/issues/3748 .. _GH3741: https://github.com/pydata/pandas/issues/3741 .. _GH3750: https://github.com/pydata/pandas/issues/3750 +.. 
_GH3726: https://github.com/pydata/pandas/issues/3726 pandas 0.11.0 ============= diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index badb364d214d1..982b2f9f2eb3b 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -128,6 +128,17 @@ API changes - ``DataFrame.to_html`` and ``DataFrame.to_latex`` now accept a path for their first argument (GH3702_) + - Do not allow astypes on ``datetime64[ns]`` except to ``object``, and + ``timedelta64[ns]`` to ``object/int`` (GH3425_) + + - The behavior of ``datetime64`` dtypes has changed with respect to certain + so-called reduction operations (GH3726_). The following operations now + raise a ``TypeError`` when performed on a ``Series`` and return an *empty* + ``Series`` when performed on a ``DataFrame`` similar to performing these + operations on, for example, a ``DataFrame`` of ``slice`` objects: + + - sum, prod, mean, std, var, skew, kurt, corr, and cov + Enhancements ~~~~~~~~~~~~ @@ -345,3 +356,5 @@ on GitHub for a complete list. .. _GH3696: https://github.com/pydata/pandas/issues/3696 .. _GH3667: https://github.com/pydata/pandas/issues/3667 .. _GH3741: https://github.com/pydata/pandas/issues/3741 +.. _GH3726: https://github.com/pydata/pandas/issues/3726 +.. 
_GH3425: https://github.com/pydata/pandas/issues/3425 diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f841c0dbecd8e..0d940dc348dc1 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1,10 +1,11 @@ import sys +import itertools +import functools import numpy as np from pandas.core.common import isnull, notnull import pandas.core.common as com -import pandas.core.config as cf import pandas.lib as lib import pandas.algos as algos import pandas.hashtable as _hash @@ -17,41 +18,70 @@ _USE_BOTTLENECK = False -def _bottleneck_switch(bn_name, alt, zero_value=None, **kwargs): - try: - bn_func = getattr(bn, bn_name) - except (AttributeError, NameError): # pragma: no cover - bn_func = None +class disallow(object): + def __init__(self, *dtypes): + super(disallow, self).__init__() + self.dtypes = tuple(np.dtype(dtype).type for dtype in dtypes) + + def check(self, obj): + return hasattr(obj, 'dtype') and issubclass(obj.dtype.type, + self.dtypes) + + def __call__(self, f): + @functools.wraps(f) + def _f(*args, **kwargs): + obj_iter = itertools.chain(args, kwargs.itervalues()) + if any(self.check(obj) for obj in obj_iter): + raise TypeError('reduction operation {0!r} not allowed for ' + 'this dtype'.format(f.__name__.replace('nan', + ''))) + return f(*args, **kwargs) + return _f + + +class bottleneck_switch(object): + def __init__(self, zero_value=None, **kwargs): + self.zero_value = zero_value + self.kwargs = kwargs + + def __call__(self, alt): + bn_name = alt.__name__ - def f(values, axis=None, skipna=True, **kwds): - if len(kwargs) > 0: - for k, v in kwargs.iteritems(): - if k not in kwds: - kwds[k] = v try: - if zero_value is not None and values.size == 0: - if values.ndim == 1: - return 0 + bn_func = getattr(bn, bn_name) + except (AttributeError, NameError): # pragma: no cover + bn_func = None + + @functools.wraps(alt) + def f(values, axis=None, skipna=True, **kwds): + if len(self.kwargs) > 0: + for k, v in self.kwargs.iteritems(): + 
if k not in kwds: + kwds[k] = v + try: + if self.zero_value is not None and values.size == 0: + if values.ndim == 1: + return 0 + else: + result_shape = (values.shape[:axis] + + values.shape[axis + 1:]) + result = np.empty(result_shape) + result.fill(0) + return result + + if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype): + result = bn_func(values, axis=axis, **kwds) + # prefer to treat inf/-inf as NA + if _has_infs(result): + result = alt(values, axis=axis, skipna=skipna, **kwds) else: - result_shape = values.shape[: - axis] + values.shape[axis + 1:] - result = np.empty(result_shape) - result.fill(0) - return result - - if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype): - result = bn_func(values, axis=axis, **kwds) - # prefer to treat inf/-inf as NA - if _has_infs(result): result = alt(values, axis=axis, skipna=skipna, **kwds) - else: + except Exception: result = alt(values, axis=axis, skipna=skipna, **kwds) - except Exception: - result = alt(values, axis=axis, skipna=skipna, **kwds) - return result + return result - return f + return f def _bn_ok_dtype(dt): @@ -166,13 +196,17 @@ def nanall(values, axis=None, skipna=True): values, mask, dtype = _get_values(values, skipna, True, copy=skipna) return values.all(axis) -def _nansum(values, axis=None, skipna=True): +@disallow('M8') +@bottleneck_switch(zero_value=0) +def nansum(values, axis=None, skipna=True): values, mask, dtype = _get_values(values, skipna, 0) the_sum = values.sum(axis) the_sum = _maybe_null_out(the_sum, axis, mask) return the_sum -def _nanmean(values, axis=None, skipna=True): +@disallow('M8') +@bottleneck_switch() +def nanmean(values, axis=None, skipna=True): values, mask, dtype = _get_values(values, skipna, 0) the_sum = _ensure_numeric(values.sum(axis)) count = _get_counts(mask, axis) @@ -186,8 +220,9 @@ def _nanmean(values, axis=None, skipna=True): the_mean = the_sum / count if count > 0 else np.nan return the_mean - -def _nanmedian(values, axis=None, skipna=True): 
+@disallow('M8') +@bottleneck_switch() +def nanmedian(values, axis=None, skipna=True): def get_median(x): mask = notnull(x) if not skipna and not mask.all(): @@ -197,13 +232,31 @@ def get_median(x): if values.dtype != np.float64: values = values.astype('f8') - if values.ndim > 1: - return np.apply_along_axis(get_median, axis, values) - else: - return get_median(values) + notempty = values.size - -def _nanvar(values, axis=None, skipna=True, ddof=1): + # an array from a frame + if values.ndim > 1: + # there's a non-empty array to apply over otherwise numpy raises + if notempty: + return np.apply_along_axis(get_median, axis, values) + + # must return the correct shape, but median is not defined for the + # empty set so return nans of shape "everything but the passed axis" + # since "axis" is where the reduction would occur if we had a nonempty + # array + shp = np.array(values.shape) + dims = np.arange(values.ndim) + ret = np.empty(shp[dims != axis]) + ret.fill(np.nan) + return ret + + # otherwise return a scalar value + return get_median(values) if notempty else np.nan + + +@disallow('M8') +@bottleneck_switch(ddof=1) +def nanvar(values, axis=None, skipna=True, ddof=1): if not isinstance(values.dtype.type, np.floating): values = values.astype('f8') @@ -223,7 +276,8 @@ def _nanvar(values, axis=None, skipna=True, ddof=1): return np.fabs((XX - X ** 2 / count) / (count - ddof)) -def _nanmin(values, axis=None, skipna=True): +@bottleneck_switch() +def nanmin(values, axis=None, skipna=True): values, mask, dtype = _get_values(values, skipna, fill_value_typ = '+inf') # numpy 1.6.1 workaround in Python 3.x @@ -247,7 +301,8 @@ def _nanmin(values, axis=None, skipna=True): return _maybe_null_out(result, axis, mask) -def _nanmax(values, axis=None, skipna=True): +@bottleneck_switch() +def nanmax(values, axis=None, skipna=True): values, mask, dtype = _get_values(values, skipna, fill_value_typ ='-inf') # numpy 1.6.1 workaround in Python 3.x @@ -291,14 +346,8 @@ def nanargmin(values, 
axis=None, skipna=True): result = _maybe_arg_null_out(result, axis, mask, skipna) return result -nansum = _bottleneck_switch('nansum', _nansum, zero_value=0) -nanmean = _bottleneck_switch('nanmean', _nanmean) -nanmedian = _bottleneck_switch('nanmedian', _nanmedian) -nanvar = _bottleneck_switch('nanvar', _nanvar, ddof=1) -nanmin = _bottleneck_switch('nanmin', _nanmin) -nanmax = _bottleneck_switch('nanmax', _nanmax) - +@disallow('M8') def nanskew(values, axis=None, skipna=True): if not isinstance(values.dtype.type, np.floating): values = values.astype('f8') @@ -332,6 +381,7 @@ def nanskew(values, axis=None, skipna=True): return result +@disallow('M8') def nankurt(values, axis=None, skipna=True): if not isinstance(values.dtype.type, np.floating): values = values.astype('f8') @@ -365,6 +415,7 @@ def nankurt(values, axis=None, skipna=True): return result +@disallow('M8') def nanprod(values, axis=None, skipna=True): mask = isnull(values) if skipna and not issubclass(values.dtype.type, np.integer): @@ -423,6 +474,7 @@ def _zero_out_fperr(arg): return 0 if np.abs(arg) < 1e-14 else arg +@disallow('M8') def nancorr(a, b, method='pearson', min_periods=None): """ a, b: ndarrays @@ -469,6 +521,7 @@ def _spearman(a, b): return _cor_methods[method] +@disallow('M8') def nancov(a, b, min_periods=None): if len(a) != len(b): raise AssertionError('Operands to nancov must have same size') diff --git a/pandas/core/series.py b/pandas/core/series.py index 64a6e9d3bcaaf..3a7a7d0f49b66 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -97,21 +97,15 @@ def convert_to_array(values): values = np.array([values]) inferred_type = lib.infer_dtype(values) if inferred_type in set(['datetime64','datetime','date','time']): - if isinstance(values, pa.Array) and com.is_datetime64_dtype(values): - pass - else: + if not (isinstance(values, pa.Array) and com.is_datetime64_dtype(values)): values = tslib.array_to_datetime(values) elif inferred_type in set(['timedelta','timedelta64']): # need 
to convert timedelta to ns here # safest to convert it to an object arrany to process - if isinstance(values, pa.Array) and com.is_timedelta64_dtype(values): - pass - else: + if not (isinstance(values, pa.Array) and com.is_timedelta64_dtype(values)): values = com._possibly_cast_to_timedelta(values) elif inferred_type in set(['integer']): - if values.dtype == 'timedelta64[ns]': - pass - elif values.dtype.kind == 'm': + if values.dtype.kind == 'm': values = values.astype('timedelta64[ns]') else: values = pa.array(values) @@ -125,9 +119,9 @@ def convert_to_array(values): is_datetime_rhs = com.is_datetime64_dtype(rvalues) # 2 datetimes or 2 timedeltas - if (is_timedelta_lhs and is_timedelta_rhs) or (is_datetime_lhs and is_datetime_rhs): - - if is_datetime_lhs and name not in ['__sub__']: + if (is_timedelta_lhs and is_timedelta_rhs) or (is_datetime_lhs and + is_datetime_rhs): + if is_datetime_lhs and name != '__sub__': raise TypeError("can only operate on a datetimes for subtraction, " "but the operator [%s] was passed" % name) elif is_timedelta_lhs and name not in ['__add__','__sub__']: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 0b34d4dc46494..d674a2f44ebe1 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -9167,6 +9167,15 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, f = getattr(frame, name) + if not ('max' in name or 'min' in name or 'count' in name): + df = DataFrame({'b': date_range('1/1/2001', periods=2)}) + _f = getattr(df, name) + print df + self.assertFalse(len(_f())) + + df['a'] = range(len(df)) + self.assert_(len(getattr(df, name)())) + if has_skipna: def skipna_wrapper(x): nona = x.dropna().values diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index cba908f7136a9..e1589b9499757 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1460,10 +1460,6 @@ def test_sum_inf(self): with cf.option_context("mode.use_inf_as_null", 
True): assert_almost_equal(s.sum(), s2.sum()) - res = nanops.nansum(arr, axis=1) - expected = nanops._nansum(arr, axis=1) - assert_almost_equal(res, expected) - res = nanops.nansum(arr, axis=1) self.assertTrue(np.isinf(res).all()) @@ -1594,6 +1590,12 @@ def testit(): # add some NaNs self.series[5:15] = np.NaN + + # idxmax, idxmin, min, and max are valid for dates + if not ('max' in name or 'min' in name): + ds = Series(date_range('1/1/2001', periods=10)) + self.assertRaises(TypeError, f, ds) + # skipna or no self.assert_(notnull(f(self.series))) self.assert_(isnull(f(self.series, skipna=False)))