From 790330d7a4a7c9cc37057c48301c2b53d872ae64 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 21 Dec 2017 11:27:11 -0600 Subject: [PATCH 01/14] API: Change the sum of all-NA / all-Empty sum / prod --- doc/source/whatsnew/v0.22.0.txt | 194 ++++++++++++++++++++++- pandas/_libs/groupby_helper.pxi.in | 4 +- pandas/_libs/window.pyx | 14 +- pandas/core/generic.py | 34 ++-- pandas/core/groupby.py | 4 +- pandas/core/nanops.py | 18 ++- pandas/core/resample.py | 2 +- pandas/tests/frame/test_analytics.py | 52 ++++-- pandas/tests/groupby/test_aggregate.py | 43 ++++- pandas/tests/groupby/test_categorical.py | 10 +- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_timegrouper.py | 18 ++- pandas/tests/series/test_analytics.py | 20 +-- pandas/tests/series/test_quantile.py | 2 +- pandas/tests/test_nanops.py | 23 ++- pandas/tests/test_panel.py | 21 ++- pandas/tests/test_panel4d.py | 22 ++- pandas/tests/test_resample.py | 53 +++---- pandas/tests/test_window.py | 55 ++++++- 19 files changed, 456 insertions(+), 135 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 2d30e00142846..6305829b99fdd 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -3,12 +3,198 @@ v0.22.0 ------- -This is a major release from 0.21.1 and includes a number of API changes, -deprecations, new features, enhancements, and performance improvements along -with a large number of bug fixes. We recommend that all users upgrade to this -version. +This is a major release from 0.21.1 and includes a single, API-breaking change. +We recommend that all users upgrade to this version after carefully reading the +release note (singular!). .. _whatsnew_0220.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas 0.22.0 changes the handling of empty and all-*NA* sums and products. 
The +summary is that + +* The sum of an all-*NA* or empty ``Series`` is now 0 +* The product of an all-*NA* or empty series is now 1 +* We've added a ``min_count`` parameter to ``.sum`` and ``.prod`` to control + the minimum number of valid values for the result to be valid. If fewer than + ``min_count`` valid values are present, the result is NA. The default is + ``0``. To return ``NaN``, the 0.21 behavior, use ``min_count=1``. + +Some background: In pandas 0.21, we fixed a long-standing inconsistency +in the return value of all-*NA* series depending on whether or not bottleneck +was installed. See :ref:`whatsnew_0210.api_breaking.bottleneck`. At the same +time, we changed the sum and prod of an empty Series to also be ``NaN``. + +Based on feedback, we've partially reverted those changes. The default sum for +all-*NA* and empty series is now 0 (1 for ``prod``). + +*pandas 0.21.x* + +.. code-block:: ipython + + In [3]: pd.Series([]).sum() + Out[3]: nan + + In [4]: pd.Series([np.nan]).sum() + Out[4]: nan + +*pandas 0.22.0* + +.. ipython:: python + + pd.Series([]).sum() + pd.Series([np.nan]).sum() + +The default behavior is the same as pandas 0.20.3 with bottleneck installed. It +also matches the behavior of ``np.nansum`` and ``np.nanprod`` on empty and +all-*NA* arrays. + +To have the sum of an empty series return ``NaN``, use the ``min_count`` +keyword. Thanks to the ``skipna`` parameter, the ``.sum`` on an all-*NA* +series is conceptually the same as on an empty series. The ``min_count`` parameter +refers to the minimum number of *valid* values required for a non-NA sum +or product. + +.. ipython:: python + + pd.Series([]).sum(min_count=1) + pd.Series([np.nan]).sum(min_count=1) + +Returning ``NaN`` was the default behavior for pandas 0.20.3 without bottleneck +installed. 
+ +Note that this affects some other places in the library: + +Grouping by a Categorical +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Grouping by a ``Categorical`` with some unobserved categories and computing the +``sum`` / ``prod`` will behave differently. + +*pandas 0.21.x* + +.. code-block:: ipython + + In [5]: grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) + + In [6]: pd.Series([1, 2]).groupby(grouper).sum() + Out[6]: + a 3.0 + b NaN + dtype: float64 + +*pandas 0.22.0* + +.. ipython:: python + + grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) + pd.Series([1, 2]).groupby(grouper).sum() + +To restore the 0.21 behavior of returning ``NaN`` for unobserved groups, +use ``min_count>=1``. + +.. ipython:: python + + pd.Series([1, 2]).groupby(grouper).sum(min_count=1) + +Resample +^^^^^^^^ + +The sum and product of all-*NA* bins will change: + +*pandas 0.21.x* + +.. code-block:: ipython + + In [7]: s = pd.Series([1, 1, np.nan, np.nan], + ...: index=pd.date_range('2017', periods=4)) + ...: + + In [8]: s + Out[8]: + 2017-01-01 1.0 + 2017-01-02 1.0 + 2017-01-03 NaN + 2017-01-04 NaN + Freq: D, dtype: float64 + + In [9]: s.resample('2d').sum() + Out[9]: + 2017-01-01 2.0 + 2017-01-03 NaN + Freq: 2D, dtype: float64 + +*pandas 0.22.0* + +.. ipython:: python + + s = pd.Series([1, 1, np.nan, np.nan], + index=pd.date_range('2017', periods=4)) + s.resample('2d').sum() + +To restore the 0.21 behavior of returning ``NaN``, use ``min_count>=1``. + +.. ipython:: python + + s.resample('2d').sum(min_count=1) + +In particular, upsampling and taking the sum or product is affected, as +upsampling introduces missing values even if the original series was entirely valid. + +*pandas 0.21.x* + +.. code-block:: ipython + + In [10]: idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) + + In [10]: pd.Series([1, 2], index=idx).resample('12H').sum() + Out[10]: + 2017-01-01 00:00:00 1.0 + 2017-01-01 12:00:00 NaN + 2017-01-02 00:00:00 2.0 + Freq: 12H, dtype: float64 + +*pandas 0.22.0* + +.. 
ipython:: python + + idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) + pd.Series([1, 2], index=idx).resample("12H").sum() + +Once again, the ``min_count`` keyword is available to restore the 0.21 behavior. + +.. ipython:: python + + pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1) + +Rolling and Expanding +^^^^^^^^^^^^^^^^^^^^^ + +Rolling and expanding already have a ``min_periods`` keyword that behaves +similarly to ``min_count``. The only case that changes is when doing a rolling +or expanding sum on an all-*NA* series with ``min_periods=0``. Previously this +returned ``NaN``; now it will return ``0``. + +*pandas 0.21.x* + +.. code-block:: ipython + + In [11]: s = pd.Series([np.nan, np.nan]) + + In [12]: s.rolling(2, min_periods=0).sum() + Out[12]: + 0 NaN + 1 NaN + dtype: float64 + +*pandas 0.22.0* + +.. ipython:: python + + s = pd.Series([np.nan, np.nan]) + s.rolling(2, min_periods=0).sum() + +The default behavior of ``min_periods=None``, implying that ``min_periods`` +equals the window size, is unchanged. 
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 16b7cbff44e03..14d47398ac1df 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -37,7 +37,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, - Py_ssize_t min_count=1): + Py_ssize_t min_count=0): """ Only aggregates on axis=0 """ @@ -101,7 +101,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, ndarray[int64_t] labels, - Py_ssize_t min_count=1): + Py_ssize_t min_count=0): """ Only aggregates on axis=0 """ diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index ecce45742afa7..cbc9592e9f9b7 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -223,11 +223,12 @@ cdef class VariableWindowIndexer(WindowIndexer): """ def __init__(self, ndarray input, int64_t win, int64_t minp, - bint left_closed, bint right_closed, ndarray index): + bint left_closed, bint right_closed, ndarray index, + object floor=None): self.is_variable = 1 self.N = len(index) - self.minp = _check_minp(win, minp, self.N) + self.minp = _check_minp(win, minp, self.N, floor=floor) self.start = np.empty(self.N, dtype='int64') self.start.fill(-1) @@ -342,7 +343,7 @@ def get_window_indexer(input, win, minp, index, closed, if index is not None: indexer = VariableWindowIndexer(input, win, minp, left_closed, - right_closed, index) + right_closed, index, floor=floor) elif use_mock: indexer = MockFixedWindowIndexer(input, win, minp, left_closed, right_closed, index, floor) @@ -449,7 +450,8 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, start, end, N, win, minp, is_variable = get_window_indexer(input, win, minp, index, - closed) + closed, + floor=0) output = np.empty(N, dtype=float) # for performance we are going to iterate @@ -491,11 +493,11 @@ def 
roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, with nogil: - for i in range(0, minp - 1): + for i in range(0, min(1, minp) - 1): add_sum(input[i], &nobs, &sum_x) output[i] = NaN - for i in range(minp - 1, N): + for i in range(min(1, minp) - 1, N): val = input[i] add_sum(val, &nobs, &sum_x) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2acf64f1d9f74..c5359ba2c5ea1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7619,48 +7619,48 @@ def _doc_parms(cls): _sum_examples = """\ Examples -------- -By default, the sum of an empty series is ``NaN``. +By default, the sum of an empty or all-NA Series is ``0``. ->>> pd.Series([]).sum() # min_count=1 is the default -nan +>>> pd.Series([]).sum() # min_count=0 is the default +0.0 This can be controlled with the ``min_count`` parameter. For example, if -you'd like the sum of an empty series to be 0, pass ``min_count=0``. +you'd like the sum of an empty series to be NaN, pass ``min_count=1``. ->>> pd.Series([]).sum(min_count=0) -0.0 +>>> pd.Series([]).sum(min_count=1) +nan Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and empty series identically. >>> pd.Series([np.nan]).sum() -nan - ->>> pd.Series([np.nan]).sum(min_count=0) 0.0 + +>>> pd.Series([np.nan]).sum(min_count=1) +nan """ _prod_examples = """\ Examples -------- -By default, the product of an empty series is ``NaN`` +By default, the product of an empty or all-NA Series is ``1`` >>> pd.Series([]).prod() -nan +1.0 This can be controlled with the ``min_count`` parameter ->>> pd.Series([]).prod(min_count=0) -1.0 +>>> pd.Series([]).prod(min_count=1) +nan Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and empty series identically. 
>>> pd.Series([np.nan]).prod() -nan - ->>> pd.Series([np.nan]).sum(min_count=0) 1.0 + +>>> pd.Series([np.nan]).sum(min_count=1) +nan """ @@ -7683,7 +7683,7 @@ def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc, examples=examples) @Appender(_num_doc) def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, - min_count=1, + min_count=0, **kwargs): nv.validate_stat_func(tuple(), kwargs, fname=name) if skipna is None: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 041239ed06d88..06b7dbb4ecf7b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1363,8 +1363,8 @@ def last(x): else: return last(x) - cls.sum = groupby_function('sum', 'add', np.sum, min_count=1) - cls.prod = groupby_function('prod', 'prod', np.prod, min_count=1) + cls.sum = groupby_function('sum', 'add', np.sum, min_count=0) + cls.prod = groupby_function('prod', 'prod', np.prod, min_count=0) cls.min = groupby_function('min', 'min', np.min, numeric_only=False) cls.max = groupby_function('max', 'max', np.max, numeric_only=False) cls.first = groupby_function('first', 'first', first_compat, diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 88f69f6ff2e14..f51ceefd6bdc0 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -109,6 +109,8 @@ def f(values, axis=None, skipna=True, **kwds): try: if values.size == 0 and kwds.get('min_count') is None: # We are empty, returning NA for our type + # Only applies for the default `min_count` of None + # since that affects how empty arrays are handled. return _na_for_min_count(values, axis) if (_USE_BOTTLENECK and skipna and @@ -281,6 +283,18 @@ def _wrap_results(result, dtype): def _na_for_min_count(values, axis): + """Return the missing value for `values` + + values : ndarray + axis : int, optional + axis for the reduction + + Returns + ------- + result : scalar or ndarray + For 1-D values, returns a scalar of the correct missing type. 
+ For 2-D values, returns a 1-D array where each element is missing. + """ # we either return np.nan or pd.NaT if is_numeric_dtype(values): values = values.astype('float64') @@ -308,7 +322,7 @@ def nanall(values, axis=None, skipna=True): @disallow('M8') @bottleneck_switch() -def nansum(values, axis=None, skipna=True, min_count=1): +def nansum(values, axis=None, skipna=True, min_count=0): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) dtype_sum = dtype_max if is_float_dtype(dtype): @@ -645,7 +659,7 @@ def nankurt(values, axis=None, skipna=True): @disallow('M8', 'm8') -def nanprod(values, axis=None, skipna=True, min_count=1): +def nanprod(values, axis=None, skipna=True, min_count=0): mask = isna(values) if skipna and not is_any_int_dtype(values): values = values.copy() diff --git a/pandas/core/resample.py b/pandas/core/resample.py index a30c727ecb87c..5447ce7470b9d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -629,7 +629,7 @@ def size(self): # downsample methods for method in ['sum', 'prod']: - def f(self, _method=method, min_count=1, *args, **kwargs): + def f(self, _method=method, min_count=0, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) return self._downsample(_method, min_count=min_count) f.__doc__ = getattr(GroupBy, method).__doc__ diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 80e9acd0d2281..5c1d1686b1a19 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -478,7 +478,8 @@ def test_nunique(self): Series({0: 1, 1: 3, 2: 2})) def test_sum(self): - self._check_stat_op('sum', np.sum, has_numeric_only=True) + self._check_stat_op('sum', np.sum, has_numeric_only=True, + skipna_alternative=np.nansum) # mixed types (with upcasting happening) self._check_stat_op('sum', np.sum, @@ -753,7 +754,8 @@ def alt(x): def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, has_numeric_only=False, 
check_dtype=True, - check_dates=False, check_less_precise=False): + check_dates=False, check_less_precise=False, + skipna_alternative=None): if frame is None: frame = self.frame # set some NAs @@ -774,15 +776,19 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, assert len(result) if has_skipna: - def skipna_wrapper(x): - nona = x.dropna() - if len(nona) == 0: - return np.nan - return alternative(nona) - def wrapper(x): return alternative(x.values) + if skipna_alternative: + def skipna_wrapper(x): + return skipna_alternative(x.values) + else: + def skipna_wrapper(x): + nona = x.dropna() + if len(nona) == 0: + return np.nan + return alternative(nona) + result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) tm.assert_series_equal(result0, frame.apply(wrapper), @@ -834,8 +840,11 @@ def wrapper(x): r0 = getattr(all_na, name)(axis=0) r1 = getattr(all_na, name)(axis=1) if name in ['sum', 'prod']: - assert np.isnan(r0).all() - assert np.isnan(r1).all() + unit = int(name == 'prod') + expected = pd.Series(unit, index=r0.index, dtype=r0.dtype) + tm.assert_series_equal(r0, expected) + expected = pd.Series(unit, index=r1.index, dtype=r1.dtype) + tm.assert_series_equal(r1, expected) def test_mode(self): df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11], @@ -1004,6 +1013,29 @@ def test_sum_prod_nanops(self, method, unit): expected = pd.Series(result, index=['A', 'B']) tm.assert_series_equal(result, expected) + if method == 'sum': + # prod isn't defined on timedeltas + df = pd.DataFrame({"a": [unit, unit], + "b": [unit, np.nan], + "c": [np.nan, np.nan]}) + + df2 = df.apply(pd.to_timedelta) + + # 0 by default + result = getattr(df2, method)() + expected = pd.Series([0, 0, 0], dtype='m8[ns]', index=idx) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = getattr(df2, method)(min_count=0) + tm.assert_series_equal(result, expected) + + # min_count=1 + result = getattr(df2, method)(min_count=1) + expected = pd.Series([0, 0, 
np.nan], dtype='m8[ns]', + index=idx) + tm.assert_series_equal(result, expected) + def test_sum_object(self): values = self.frame.values.astype(int) frame = DataFrame(values, index=self.frame.index, diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 07ecc085098bf..cca21fddd116e 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -813,8 +813,6 @@ def test__cython_agg_general(self): ('mean', np.mean), ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), ('var', lambda x: np.var(x, ddof=1)), - ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan), - ('prod', np.prod), ('min', np.min), ('max', np.max), ] ) @@ -824,12 +822,7 @@ def test_cython_agg_empty_buckets(self, op, targop): # calling _cython_agg_general directly, instead of via the user API # which sets different values for min_count, so do that here. - if op in ('add', 'prod'): - min_count = 1 - else: - min_count = -1 - result = df.groupby(pd.cut(df[0], grps))._cython_agg_general( - op, min_count=min_count) + result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) try: tm.assert_frame_equal(result, expected) @@ -837,6 +830,40 @@ def test_cython_agg_empty_buckets(self, op, targop): exc.args += ('operation: %s' % op,) raise + def test_cython_agg_empty_buckets_nanops(self): + # GH-18869 can't call nanops on empty groups, so hardcode expected + # for these + df = pd.DataFrame([11, 12, 13], columns=['a']) + grps = range(0, 25, 5) + # add / sum + result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add') + intervals = pd.interval_range(0, 20, freq=5) + expected = pd.DataFrame( + {"a": [0, 0, 36, 0]}, + index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + tm.assert_frame_equal(result, expected) + + # prod + result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod') + expected = pd.DataFrame( + {"a": [1, 1, 
1716, 1]}, + index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.") + def test_agg_category_nansum(self): + categories = ['a', 'b', 'c'] + df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], + categories=categories), + 'B': [1, 2, 3]}) + result = df.groupby("A").B.agg(np.nansum) + expected = pd.Series([3, 3, 0], + index=pd.CategoricalIndex(['a', 'b', 'c'], + categories=categories, + name='A'), + name='B') + tm.assert_series_equal(result, expected) + def test_agg_over_numpy_arrays(self): # GH 3788 df = pd.DataFrame([[1, np.array([10, 20, 30])], diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 5e3d2bb9cf091..1713b2d3015ad 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -37,7 +37,7 @@ def test_groupby(self): # single grouper gb = df.groupby("A") exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) - expected = DataFrame({'values': Series([3, 7, np.nan], index=exp_idx)}) + expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) @@ -670,9 +670,9 @@ def test_empty_sum(self): 'B': [1, 2, 1]}) expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') - # NA by default + # 0 by default result = df.groupby("A").B.sum() - expected = pd.Series([3, 1, np.nan], expected_idx, name='B') + expected = pd.Series([3, 1, 0], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 @@ -693,9 +693,9 @@ def test_empty_prod(self): expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') - # NA by default + # 1 by default result = df.groupby("A").B.prod() - expected = pd.Series([2, 1, np.nan], expected_idx, name='B') + expected = pd.Series([2, 1, 1], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 diff --git 
a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index cf4a6ec1c932a..a13d985ab6974 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2704,7 +2704,7 @@ def h(df, arg3): # Assert the results here index = pd.Index(['A', 'B', 'C'], name='group') - expected = pd.Series([-79.5160891089, -78.4839108911, None], + expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index c8503b16a0e16..c0f4470222c81 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -41,12 +41,11 @@ def test_groupby_with_timegrouper(self): df = df.set_index(['Date']) expected = DataFrame( - {'Quantity': np.nan}, + {'Quantity': 0}, index=date_range('20130901 13:00:00', '20131205 13:00:00', freq='5D', name='Date', closed='left')) - expected.iloc[[0, 6, 18], 0] = np.array( - [24., 6., 9.], dtype='float64') + expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64') result1 = df.resample('5D') .sum() assert_frame_equal(result1, expected) @@ -259,10 +258,15 @@ def test_timegrouper_with_reg_groups(self): }).set_index('date') for freq in ['D', 'M', 'A', 'Q-APR']: - expected = df.groupby('user_id')[ - 'whole_cost'].resample( - freq).sum().dropna().reorder_levels( - ['date', 'user_id']).sort_index().astype('int64') + expected = ( + df.groupby('user_id')['whole_cost'] + .resample(freq) + .sum(min_count=1) # XXX + .dropna() + .reorder_levels(['date', 'user_id']) + .sort_index() + .astype('int64') + ) expected.name = 'whole_cost' result1 = df.sort_index().groupby([pd.Grouper(freq=freq), diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index cd92edc927173..14bf194ba5ee4 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -36,12 +36,12 @@ class 
TestSeriesAnalytics(TestData): ]) def test_empty(self, method, unit, use_bottleneck): with pd.option_context("use_bottleneck", use_bottleneck): - # GH 9422 + # GH 9422 / 18921 # Entirely empty s = Series([]) # NA by default result = getattr(s, method)() - assert isna(result) + assert result == unit # Explict result = getattr(s, method)(min_count=0) @@ -52,7 +52,7 @@ def test_empty(self, method, unit, use_bottleneck): # Skipna, default result = getattr(s, method)(skipna=True) - assert isna(result) + result == unit # Skipna, explicit result = getattr(s, method)(skipna=True, min_count=0) @@ -65,7 +65,7 @@ def test_empty(self, method, unit, use_bottleneck): s = Series([np.nan]) # NA by default result = getattr(s, method)() - assert isna(result) + assert result == unit # Explicit result = getattr(s, method)(min_count=0) @@ -76,7 +76,7 @@ def test_empty(self, method, unit, use_bottleneck): # Skipna, default result = getattr(s, method)(skipna=True) - assert isna(result) + result == unit # skipna, explicit result = getattr(s, method)(skipna=True, min_count=0) @@ -110,7 +110,7 @@ def test_empty(self, method, unit, use_bottleneck): # GH #844 (changed in 9422) df = DataFrame(np.empty((10, 0))) - assert (df.sum(1).isnull()).all() + assert (getattr(df, method)(1) == unit).all() s = pd.Series([1]) result = getattr(s, method)(min_count=2) @@ -131,9 +131,9 @@ def test_empty(self, method, unit, use_bottleneck): def test_empty_multi(self, method, unit): s = pd.Series([1, np.nan, np.nan, np.nan], index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)])) - # NaN by default + # 1 / 0 by default result = getattr(s, method)(level=0) - expected = pd.Series([1, np.nan], index=['a', 'b']) + expected = pd.Series([1, unit], index=['a', 'b']) tm.assert_series_equal(result, expected) # min_count=0 @@ -147,7 +147,7 @@ def test_empty_multi(self, method, unit): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "method", ['sum', 'mean', 'median', 'std', 'var']) + "method", ['mean', 
'median', 'std', 'var']) def test_ops_consistency_on_empty(self, method): # GH 7869 @@ -195,7 +195,7 @@ def test_sum_overflow(self, use_bottleneck): assert np.allclose(float(result), v[-1]) def test_sum(self): - self._check_stat_op('sum', np.sum, check_allna=True) + self._check_stat_op('sum', np.sum, check_allna=False) def test_sum_inf(self): s = Series(np.random.randn(10)) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index 14a44c36c6a0c..3c93ff1d3f31e 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -38,7 +38,7 @@ def test_quantile(self): # GH7661 result = Series([np.timedelta64('NaT')]).sum() - assert result is pd.NaT + assert result == pd.Timedelta(0) msg = 'percentiles should all be in the interval \\[0, 1\\]' for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index d03ecb9f9b5b7..306d063b364fe 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -182,12 +182,17 @@ def _coerce_tds(targ, res): check_dtype=check_dtype) def check_fun_data(self, testfunc, targfunc, testarval, targarval, - targarnanval, check_dtype=True, **kwargs): + targarnanval, check_dtype=True, empty_targfunc=None, + **kwargs): for axis in list(range(targarval.ndim)) + [None]: for skipna in [False, True]: targartempval = targarval if skipna else targarnanval - try: + if skipna and empty_targfunc and pd.isna(targartempval).all(): + targ = empty_targfunc(targartempval, axis=axis, **kwargs) + else: targ = targfunc(targartempval, axis=axis, **kwargs) + + try: res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) self.check_results(targ, res, axis, @@ -219,10 +224,11 @@ def check_fun_data(self, testfunc, targfunc, testarval, targarval, except ValueError: return self.check_fun_data(testfunc, targfunc, testarval2, targarval2, - targarnanval2, check_dtype=check_dtype, **kwargs) + targarnanval2, 
check_dtype=check_dtype, + empty_targfunc=empty_targfunc, **kwargs) def check_fun(self, testfunc, targfunc, testar, targar=None, - targarnan=None, **kwargs): + targarnan=None, empty_targfunc=None, **kwargs): if targar is None: targar = testar if targarnan is None: @@ -232,7 +238,8 @@ def check_fun(self, testfunc, targfunc, testar, targar=None, targarnanval = getattr(self, targarnan) try: self.check_fun_data(testfunc, targfunc, testarval, targarval, - targarnanval, **kwargs) + targarnanval, empty_targfunc=empty_targfunc, + **kwargs) except BaseException as exc: exc.args += ('testar: %s' % testar, 'targar: %s' % targar, 'targarnan: %s' % targarnan) @@ -329,7 +336,8 @@ def test_nanall(self): def test_nansum(self): self.check_funs(nanops.nansum, np.sum, allow_str=False, - allow_date=False, allow_tdelta=True, check_dtype=False) + allow_date=False, allow_tdelta=True, check_dtype=False, + empty_targfunc=np.nansum) def test_nanmean(self): self.check_funs(nanops.nanmean, np.mean, allow_complex=False, @@ -463,7 +471,8 @@ def test_nankurt(self): def test_nanprod(self): self.check_funs(nanops.nanprod, np.prod, allow_str=False, - allow_date=False, allow_tdelta=False) + allow_date=False, allow_tdelta=False, + empty_targfunc=np.nanprod) def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 34c1ee5683183..6ea9fdff724aa 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -83,13 +83,13 @@ def test_count(self): self._check_stat_op('count', f, obj=self.panel, has_skipna=False) def test_sum(self): - self._check_stat_op('sum', np.sum) + self._check_stat_op('sum', np.sum, skipna_alternative=np.nansum) def test_mean(self): self._check_stat_op('mean', np.mean) def test_prod(self): - self._check_stat_op('prod', np.prod) + self._check_stat_op('prod', np.prod, skipna_alternative=np.nanprod) def test_median(self): 
def wrapper(x): @@ -140,7 +140,8 @@ def alt(x): self._check_stat_op('sem', alt) - def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): + def _check_stat_op(self, name, alternative, obj=None, has_skipna=True, + skipna_alternative=None): if obj is None: obj = self.panel @@ -152,11 +153,15 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): if has_skipna: - def skipna_wrapper(x): - nona = remove_na_arraylike(x) - if len(nona) == 0: - return np.nan - return alternative(nona) + if skipna_alternative: + def skipna_wrapper(x): + return skipna_alternative(np.asarray(x)) + else: + def skipna_wrapper(x): + nona = remove_na_arraylike(x) + if len(nona) == 0: + return np.nan + return alternative(nona) def wrapper(x): return alternative(np.asarray(x)) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index e194136ec716d..89876c3215b0d 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -38,13 +38,13 @@ def test_count(self): self._check_stat_op('count', f, obj=self.panel4d, has_skipna=False) def test_sum(self): - self._check_stat_op('sum', np.sum) + self._check_stat_op('sum', np.sum, skipna_alternative=np.nansum) def test_mean(self): self._check_stat_op('mean', np.mean) def test_prod(self): - self._check_stat_op('prod', np.prod) + self._check_stat_op('prod', np.prod, skipna_alternative=np.nanprod) def test_median(self): def wrapper(x): @@ -105,7 +105,8 @@ def alt(x): # self._check_stat_op('skew', alt) - def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): + def _check_stat_op(self, name, alternative, obj=None, has_skipna=True, + skipna_alternative=None): if obj is None: obj = self.panel4d @@ -116,11 +117,16 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): f = getattr(obj, name) if has_skipna: - def skipna_wrapper(x): - nona = remove_na_arraylike(x) - if len(nona) == 0: - return np.nan - return alternative(nona) + + if skipna_alternative: + def 
skipna_wrapper(x): + return skipna_alternative(np.asarray(x)) + else: + def skipna_wrapper(x): + nona = remove_na_arraylike(x) + if len(nona) == 0: + return np.nan + return alternative(nona) def wrapper(x): return alternative(np.asarray(x)) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 4a3c4eff9f8c3..d0cfd839d19c4 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3390,9 +3390,9 @@ def test_aggregate_normal(self): def test_resample_entirly_nat_window(self, method, unit): s = pd.Series([0] * 2 + [np.nan] * 2, index=pd.date_range('2017', periods=4)) - # nan by default + # 0 / 1 by default result = methodcaller(method)(s.resample("2d")) - expected = pd.Series([0.0, np.nan], + expected = pd.Series([0.0, unit], index=pd.to_datetime(['2017-01-01', '2017-01-03'])) tm.assert_series_equal(result, expected) @@ -3411,7 +3411,14 @@ def test_resample_entirly_nat_window(self, method, unit): '2017-01-03'])) tm.assert_series_equal(result, expected) - def test_aggregate_with_nat(self): + @pytest.mark.parametrize('func, fill_value', [ + ('min', np.nan), + ('max', np.nan), + ('sum', 0), + ('prod', 1), + ('count', 0), + ]) + def test_aggregate_with_nat(self, func, fill_value): # check TimeGrouper's aggregation is identical as normal groupby n = 20 @@ -3426,42 +3433,32 @@ def test_aggregate_with_nat(self): normal_grouped = normal_df.groupby('key') dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D')) - for func in ['min', 'max', 'sum', 'prod']: - normal_result = getattr(normal_grouped, func)() - dt_result = getattr(dt_grouped, func)() - pad = DataFrame([[np.nan, np.nan, np.nan, np.nan]], index=[3], - columns=['A', 'B', 'C', 'D']) + normal_result = getattr(normal_grouped, func)() + dt_result = getattr(dt_grouped, func)() + + if func == 'size': + pad = Series([fill_value], index=[3]) expected = normal_result.append(pad) expected = expected.sort_index() expected.index = date_range(start='2013-01-01', freq='D', 
periods=5, name='key') - assert_frame_equal(expected, dt_result) - - for func in ['count']: - normal_result = getattr(normal_grouped, func)() - pad = DataFrame([[0, 0, 0, 0]], index=[3], + dt_result = getattr(dt_grouped, func)() + assert_series_equal(expected, dt_result) + # GH 9925 + assert dt_result.index.name == 'key' + else: + pad = DataFrame([[fill_value] * 4], index=[3], columns=['A', 'B', 'C', 'D']) expected = normal_result.append(pad) expected = expected.sort_index() expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') - dt_result = getattr(dt_grouped, func)() assert_frame_equal(expected, dt_result) - for func in ['size']: - normal_result = getattr(normal_grouped, func)() - pad = Series([0], index=[3]) - expected = normal_result.append(pad) - expected = expected.sort_index() - expected.index = date_range(start='2013-01-01', freq='D', - periods=5, name='key') - dt_result = getattr(dt_grouped, func)() - assert_series_equal(expected, dt_result) - # GH 9925 assert dt_result.index.name == 'key' - # if NaT is included, 'var', 'std', 'mean', 'first','last' - # and 'nth' doesn't work yet + # if NaT is included, 'var', 'std', 'mean', 'first','last' + # and 'nth' doesn't work yet def test_repr(self): # GH18203 @@ -3482,9 +3479,9 @@ def test_upsample_sum(self, method, unit): '2017-01-01T00:30:00', '2017-01-01T01:00:00']) - # NaN by default + # 0 / 1 by default result = methodcaller(method)(resampled) - expected = pd.Series([1, np.nan, 1], index=index) + expected = pd.Series([1, unit, 1], index=index) tm.assert_series_equal(result, expected) # min_count=0 diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index bee925823eebe..1d6291748c62f 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -439,6 +439,26 @@ def tests_empty_df_rolling(self, roller): result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum() tm.assert_frame_equal(result, expected) + def test_missing_minp_zero(self): + # 
minp=0 + x = pd.Series([np.nan]) + result = x.rolling(1, min_periods=0).sum() + expected = pd.Series([0.0]) + tm.assert_series_equal(result, expected) + + # minp=1 + result = x.rolling(1, min_periods=1).sum() + expected = pd.Series([np.nan]) + tm.assert_series_equal(result, expected) + + def test_missing_minp_zero_variable(self): + x = pd.Series([np.nan] * 4, + index=pd.DatetimeIndex(['2017-01-01', '2017-01-04', + '2017-01-06', '2017-01-07'])) + result = x.rolling(pd.Timedelta("2d"), min_periods=0).sum() + expected = pd.Series(0.0, index=x.index) + tm.assert_series_equal(result, expected) + def test_multi_index_names(self): # GH 16789, 16825 @@ -512,6 +532,18 @@ def test_empty_df_expanding(self, expander): index=pd.DatetimeIndex([])).expanding(expander).sum() tm.assert_frame_equal(result, expected) + def test_missing_minp_zero(self): + # minp=0 + x = pd.Series([np.nan]) + result = x.expanding(min_periods=0).sum() + expected = pd.Series([0.0]) + tm.assert_series_equal(result, expected) + + # minp=1 + result = x.expanding(min_periods=1).sum() + expected = pd.Series([np.nan]) + tm.assert_series_equal(result, expected) + class TestEWM(Base): @@ -828,7 +860,8 @@ def test_centered_axis_validation(self): .rolling(window=3, center=True, axis=2).mean()) def test_rolling_sum(self): - self._check_moment_func(mom.rolling_sum, np.sum, name='sum') + self._check_moment_func(mom.rolling_sum, np.nansum, name='sum', + zero_min_periods_equal=False) def test_rolling_count(self): counter = lambda x: np.isfinite(x).astype(float).sum() @@ -1298,14 +1331,18 @@ def test_fperr_robustness(self): def _check_moment_func(self, f, static_comp, name=None, window=50, has_min_periods=True, has_center=True, has_time_rule=True, preserve_nan=True, - fill_value=None, test_stable=False, **kwargs): + fill_value=None, test_stable=False, + zero_min_periods_equal=True, + **kwargs): with warnings.catch_warnings(record=True): self._check_ndarray(f, static_comp, window=window, has_min_periods=has_min_periods, 
preserve_nan=preserve_nan, has_center=has_center, fill_value=fill_value, - test_stable=test_stable, **kwargs) + test_stable=test_stable, + zero_min_periods_equal=zero_min_periods_equal, + **kwargs) with warnings.catch_warnings(record=True): self._check_structures(f, static_comp, @@ -1324,7 +1361,8 @@ def _check_moment_func(self, f, static_comp, name=None, window=50, def _check_ndarray(self, f, static_comp, window=50, has_min_periods=True, preserve_nan=True, has_center=True, fill_value=None, - test_stable=False, test_window=True, **kwargs): + test_stable=False, test_window=True, + zero_min_periods_equal=True, **kwargs): def get_result(arr, window, min_periods=None, center=False): return f(arr, window, min_periods=min_periods, center=center, ** kwargs) @@ -1357,10 +1395,11 @@ def get_result(arr, window, min_periods=None, center=False): assert isna(result[3]) assert notna(result[4]) - # min_periods=0 - result0 = get_result(arr, 20, min_periods=0) - result1 = get_result(arr, 20, min_periods=1) - tm.assert_almost_equal(result0, result1) + if zero_min_periods_equal: + # min_periods=0 may be equivalent to min_periods=1 + result0 = get_result(arr, 20, min_periods=0) + result1 = get_result(arr, 20, min_periods=1) + tm.assert_almost_equal(result0, result1) else: result = get_result(arr, 50) tm.assert_almost_equal(result[-1], static_comp(arr[10:-10])) From 541a36204249955a60c251328f999052ab07a7ea Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Dec 2017 10:59:04 -0600 Subject: [PATCH 02/14] Max, not min --- pandas/_libs/window.pyx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index cbc9592e9f9b7..e46bf24c36f18 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -220,7 +220,8 @@ cdef class VariableWindowIndexer(WindowIndexer): right_closed: bint right endpoint closedness True if the right endpoint is closed, False if open - + floor: optional + unit for flooring the 
unit """ def __init__(self, ndarray input, int64_t win, int64_t minp, bint left_closed, bint right_closed, ndarray index, @@ -343,7 +344,7 @@ def get_window_indexer(input, win, minp, index, closed, if index is not None: indexer = VariableWindowIndexer(input, win, minp, left_closed, - right_closed, index, floor=floor) + right_closed, index, floor) elif use_mock: indexer = MockFixedWindowIndexer(input, win, minp, left_closed, right_closed, index, floor) @@ -442,7 +443,7 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, object index, object closed): cdef: double val, prev_x, sum_x = 0 - int64_t s, e + int64_t s, e, range_endpoint int64_t nobs = 0, i, j, N bint is_variable ndarray[int64_t] start, end @@ -491,13 +492,15 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, # fixed window + range_endpoint = int_max(minp, 1) - 1 + with nogil: - for i in range(0, min(1, minp) - 1): + for i in range(0, range_endpoint): add_sum(input[i], &nobs, &sum_x) output[i] = NaN - for i in range(min(1, minp) - 1, N): + for i in range(range_endpoint, N): val = input[i] add_sum(val, &nobs, &sum_x) From a267c2f66447b1ec32290768ae70bcb8331f6d11 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Dec 2017 13:55:22 -0600 Subject: [PATCH 03/14] Update whatsnew --- doc/source/whatsnew/v0.22.0.txt | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 6305829b99fdd..c3bde3073e035 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -15,9 +15,9 @@ Backwards incompatible API changes Pandas 0.22.0 changes the handling of empty and all-*NA* sums and products. 
The summary is that -* The sum of an all-*NA* or empty ``Series`` is now 0 -* The product of an all-*NA* or empty series is now 1 -* We've added a ``min_count`` parameter to ``.sum`` and ``.prod`` to control +* The sum of an all-*NA* or empty ``Series`` is now ``0`` +* The product of an all-*NA* or empty series is now ``1`` +* We've added a ``min_count`` parameter to ``.sum()`` and ``.prod()`` to control the minimum number of valid values for the result to be valid. If fewer than ``min_count`` valid values are present, the result is NA. The default is ``0``. To return ``NaN``, the 0.21 behavior, use ``min_count=1``. @@ -27,8 +27,12 @@ in the return value of all-*NA* series depending on whether or not bottleneck was installed. See :ref:`whatsnew_0210.api_breaking.bottleneck`_. At the same time, we changed the sum and prod of an empty Series to also be ``NaN``. -Based on feedback, we've partially reverted those changes. The default sum for -all-*NA* and empty series is now 0 (1 for ``prod``). +Based on feedback, we've partially reverted those changes. + +Arithmetic Operations +^^^^^^^^^^^^^^^^^^^^^ + +The default sum for all-*NA* and empty series is now ``0``. *pandas 0.21.x* @@ -48,8 +52,7 @@ all-*NA* and empty series is now 0 (1 for ``prod``). pd.Series([np.nan]).sum() The default behavior is the same as pandas 0.20.3 with bottleneck installed. It -also matches the behavior of ``np.nansum`` and ``np.nanprod`` on empty and -all-*NA* arrays. +also matches the behavior of ``np.nansum`` on empty and all-*NA* arrays. To have the sum of an empty series return ``NaN``, use the ``min_count`` keyword. Thanks to the ``skipna`` parameter, the ``.sum`` on an all-*NA* @@ -65,7 +68,17 @@ or product. Returning ``NaN`` was the default behavior for pandas 0.20.3 without bottleneck installed. -Note that this affects some other places in the library: +:meth:`Series.prod` has been updated to behave the same as :meth:`Series.sum`, +returning ``1`` instead. + +.. 
ipython:: python + + pd.Series([]).prod() + pd.Series([np.nan]).prod() + pd.Series([]).prod(min_count=1) + +These changes affect :meth:`DataFrame.sum` and :meth:`DataFrame.prod` as well. +Finally, a few less obvious places in pandas are affected by this change. Grouping by a Categorical ^^^^^^^^^^^^^^^^^^^^^^^^^ From 8c067394075dfde67bb9a98fa6b5660a3b2ea843 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Dec 2017 13:55:32 -0600 Subject: [PATCH 04/14] Parametrize test --- pandas/tests/groupby/test_timegrouper.py | 39 ++++++++++++------------ 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index c0f4470222c81..a014ecb7e91c1 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -244,6 +244,8 @@ def test_timegrouper_with_reg_groups(self): result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum() assert_frame_equal(result, expected) + @pytest.mark.parametrize('freq', ['D', 'M', 'A', 'Q-APR']) + def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort df = DataFrame({ 'date': pd.to_datetime([ @@ -257,25 +259,24 @@ def test_timegrouper_with_reg_groups(self): 'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12] }).set_index('date') - for freq in ['D', 'M', 'A', 'Q-APR']: - expected = ( - df.groupby('user_id')['whole_cost'] - .resample(freq) - .sum(min_count=1) # XXX - .dropna() - .reorder_levels(['date', 'user_id']) - .sort_index() - .astype('int64') - ) - expected.name = 'whole_cost' - - result1 = df.sort_index().groupby([pd.Grouper(freq=freq), - 'user_id'])['whole_cost'].sum() - assert_series_equal(result1, expected) - - result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[ - 'whole_cost'].sum() - assert_series_equal(result2, expected) + expected = ( + df.groupby('user_id')['whole_cost'] + .resample(freq) + .sum(min_count=1) # XXX + .dropna() + 
.reorder_levels(['date', 'user_id']) + .sort_index() + .astype('int64') + ) + expected.name = 'whole_cost' + + result1 = df.sort_index().groupby([pd.Grouper(freq=freq), + 'user_id'])['whole_cost'].sum() + assert_series_equal(result1, expected) + + result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[ + 'whole_cost'].sum() + assert_series_equal(result2, expected) def test_timegrouper_get_group(self): # GH 6914 From df7c69a874d9b01ba715d662df77d6c5e95ab74d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Dec 2017 13:56:16 -0600 Subject: [PATCH 05/14] Minor cleanups --- pandas/core/nanops.py | 7 ++++++- pandas/tests/test_nanops.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f51ceefd6bdc0..d34d5e60068e3 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -111,6 +111,9 @@ def f(values, axis=None, skipna=True, **kwds): # We are empty, returning NA for our type # Only applies for the default `min_count` of None # since that affects how empty arrays are handled. + # TODO(toma): update all the nanops methods to correctly + # handle empty inputs and remove this check. 
+ # It *may* just be `var` return _na_for_min_count(values, axis) if (_USE_BOTTLENECK and skipna and @@ -285,8 +288,10 @@ def _wrap_results(result, dtype): def _na_for_min_count(values, axis): """Return the missing value for `values` + Parameters + ---------- values : ndarray - axis : int, optional + axis : int or None axis for the reduction Returns diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 306d063b364fe..20b7d9acbc3f2 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -187,7 +187,7 @@ def check_fun_data(self, testfunc, targfunc, testarval, targarval, for axis in list(range(targarval.ndim)) + [None]: for skipna in [False, True]: targartempval = targarval if skipna else targarnanval - if skipna and empty_targfunc and pd.isna(targartempval).all(): + if skipna and empty_targfunc and isna(targartempval).all(): targ = empty_targfunc(targartempval, axis=axis, **kwargs) else: targ = targfunc(targartempval, axis=axis, **kwargs) From 66a3ab6fd21d88899a51eb6ce67a059532b0b127 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Dec 2017 13:56:36 -0600 Subject: [PATCH 06/14] Refactor skipna_alternative --- pandas/tests/frame/test_analytics.py | 48 ++++++++++++---------------- pandas/tests/test_panel.py | 12 ++----- pandas/tests/test_panel4d.py | 12 ++----- pandas/util/testing.py | 27 ++++++++++++++++ 4 files changed, 51 insertions(+), 48 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 5c1d1686b1a19..42b91872ea497 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -779,16 +779,8 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, def wrapper(x): return alternative(x.values) - if skipna_alternative: - def skipna_wrapper(x): - return skipna_alternative(x.values) - else: - def skipna_wrapper(x): - nona = x.dropna() - if len(nona) == 0: - return np.nan - return alternative(nona) - + 
skipna_wrapper = tm._make_skipna_wrapper(alternative, + skipna_alternative) result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) tm.assert_series_equal(result0, frame.apply(wrapper), @@ -1013,28 +1005,28 @@ def test_sum_prod_nanops(self, method, unit): expected = pd.Series(result, index=['A', 'B']) tm.assert_series_equal(result, expected) - if method == 'sum': - # prod isn't defined on timedeltas - df = pd.DataFrame({"a": [unit, unit], - "b": [unit, np.nan], - "c": [np.nan, np.nan]}) + def test_sum_nanops_timedelta(self): + # prod isn't defined on timedeltas + idx = ['a', 'b', 'c'] + df = pd.DataFrame({"a": [0, 0], + "b": [0, np.nan], + "c": [np.nan, np.nan]}) - df2 = df.apply(pd.to_timedelta) + df2 = df.apply(pd.to_timedelta) - # 0 by default - result = getattr(df2, method)() - expected = pd.Series([0, 0, 0], dtype='m8[ns]', index=idx) - tm.assert_series_equal(result, expected) + # 0 by default + result = df2.sum() + expected = pd.Series([0, 0, 0], dtype='m8[ns]', index=idx) + tm.assert_series_equal(result, expected) - # min_count=0 - result = getattr(df2, method)(min_count=0) - tm.assert_series_equal(result, expected) + # min_count=0 + result = df2.sum(min_count=0) + tm.assert_series_equal(result, expected) - # min_count=1 - result = getattr(df2, method)(min_count=1) - expected = pd.Series([0, 0, np.nan], dtype='m8[ns]', - index=idx) - tm.assert_series_equal(result, expected) + # min_count=1 + result = df2.sum(min_count=1) + expected = pd.Series([0, 0, np.nan], dtype='m8[ns]', index=idx) + tm.assert_series_equal(result, expected) def test_sum_object(self): values = self.frame.values.astype(int) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 6ea9fdff724aa..c8b45daac11b2 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -153,16 +153,8 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True, if has_skipna: - if skipna_alternative: - def skipna_wrapper(x): - return 
skipna_alternative(np.asarray(x)) - else: - def skipna_wrapper(x): - nona = remove_na_arraylike(x) - if len(nona) == 0: - return np.nan - return alternative(nona) - + skipna_wrapper = tm._make_skipna_wrapper(alternative, + skipna_alternative) def wrapper(x): return alternative(np.asarray(x)) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 89876c3215b0d..b70d1d1f44267 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -118,16 +118,8 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True, if has_skipna: - if skipna_alternative: - def skipna_wrapper(x): - return skipna_alternative(np.asarray(x)) - else: - def skipna_wrapper(x): - nona = remove_na_arraylike(x) - if len(nona) == 0: - return np.nan - return alternative(nona) - + skipna_wrapper = tm._make_skipna_wrapper(alternative, + skipna_alternative) def wrapper(x): return alternative(np.asarray(x)) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 4e9282c3bd031..a7df8e8a41034 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2665,3 +2665,30 @@ def setTZ(tz): yield finally: setTZ(orig_tz) + +def _make_skipna_wrapper(alternative, skipna_alternative=None): + """Create a function for calling on an array. + + Parameters + ---------- + alternative : function + The function to be called on the array with no NaNs. + Only used when 'skipna_alternative' is None. 
+ skipna_alternative : function + The function to be called on the original array + + Returns + ------- + skipna_wrapper : function + """ + if skipna_alternative: + def skipna_wrapper(x): + return skipna_alternative(x.values) + else: + def skipna_wrapper(x): + nona = x.dropna() + if len(nona) == 0: + return np.nan + return alternative(nona) + + return skipna_wrapper From fb5937c1aee55c50b5ed776b3cefd1e66c1afaca Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Dec 2017 14:03:11 -0600 Subject: [PATCH 07/14] Split test --- pandas/tests/test_resample.py | 54 +++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index d0cfd839d19c4..601412a87d80c 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3420,6 +3420,8 @@ def test_resample_entirly_nat_window(self, method, unit): ]) def test_aggregate_with_nat(self, func, fill_value): # check TimeGrouper's aggregation is identical as normal groupby + # if NaT is included, 'var', 'std', 'mean', 'first','last' + # and 'nth' doesn't work yet n = 20 data = np.random.randn(n, 4).astype('int64') @@ -3436,29 +3438,39 @@ def test_aggregate_with_nat(self, func, fill_value): normal_result = getattr(normal_grouped, func)() dt_result = getattr(dt_grouped, func)() - if func == 'size': - pad = Series([fill_value], index=[3]) - expected = normal_result.append(pad) - expected = expected.sort_index() - expected.index = date_range(start='2013-01-01', freq='D', - periods=5, name='key') - dt_result = getattr(dt_grouped, func)() - assert_series_equal(expected, dt_result) - # GH 9925 - assert dt_result.index.name == 'key' - else: - pad = DataFrame([[fill_value] * 4], index=[3], - columns=['A', 'B', 'C', 'D']) - expected = normal_result.append(pad) - expected = expected.sort_index() - expected.index = date_range(start='2013-01-01', freq='D', - periods=5, name='key') - assert_frame_equal(expected, 
dt_result) + pad = DataFrame([[fill_value] * 4], index=[3], + columns=['A', 'B', 'C', 'D']) + expected = normal_result.append(pad) + expected = expected.sort_index() + expected.index = date_range(start='2013-01-01', freq='D', + periods=5, name='key') + assert_frame_equal(expected, dt_result) + assert dt_result.index.name == 'key' + + def test_aggregate_with_nat_size(self): + # GH 9925 + n = 20 + data = np.random.randn(n, 4).astype('int64') + normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) + normal_df['key'] = [1, 2, np.nan, 4, 5] * 4 - assert dt_result.index.name == 'key' + dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D']) + dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT, + datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4 - # if NaT is included, 'var', 'std', 'mean', 'first','last' - # and 'nth' doesn't work yet + normal_grouped = normal_df.groupby('key') + dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D')) + + normal_result = normal_grouped.size() + dt_result = dt_grouped.size() + + pad = Series([0], index=[3]) + expected = normal_result.append(pad) + expected = expected.sort_index() + expected.index = date_range(start='2013-01-01', freq='D', + periods=5, name='key') + assert_series_equal(expected, dt_result) + assert dt_result.index.name == 'key' def test_repr(self): # GH18203 From 4c65c9c6d8c30120ec874f73fc82281fa6757ffe Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Dec 2017 14:03:28 -0600 Subject: [PATCH 08/14] Added issue --- pandas/core/nanops.py | 4 ++-- pandas/tests/test_window.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d34d5e60068e3..d1a355021f388 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -111,8 +111,8 @@ def f(values, axis=None, skipna=True, **kwds): # We are empty, returning NA for our type # Only applies for the default `min_count` of None # since that affects how empty arrays are handled. 
- # TODO(toma): update all the nanops methods to correctly - # handle empty inputs and remove this check. + # TODO(GH-18976) update all the nanops methods to + # correctly handle empty inputs and remove this check. # It *may* just be `var` return _na_for_min_count(values, axis) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 1d6291748c62f..ccffc554e00c7 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -440,6 +440,7 @@ def tests_empty_df_rolling(self, roller): tm.assert_frame_equal(result, expected) def test_missing_minp_zero(self): + # https://github.com/pandas-dev/pandas/pull/18921 # minp=0 x = pd.Series([np.nan]) result = x.rolling(1, min_periods=0).sum() @@ -452,6 +453,7 @@ def test_missing_minp_zero(self): tm.assert_series_equal(result, expected) def test_missing_minp_zero_variable(self): + # https://github.com/pandas-dev/pandas/pull/18921 x = pd.Series([np.nan] * 4, index=pd.DatetimeIndex(['2017-01-01', '2017-01-04', '2017-01-06', '2017-01-07'])) @@ -533,6 +535,7 @@ def test_empty_df_expanding(self, expander): tm.assert_frame_equal(result, expected) def test_missing_minp_zero(self): + # https://github.com/pandas-dev/pandas/pull/18921 # minp=0 x = pd.Series([np.nan]) result = x.expanding(min_periods=0).sum() From 52e4e6f8c916a66ffb257a0e22f8470eed6c1a51 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Dec 2017 15:21:52 -0600 Subject: [PATCH 09/14] More updates --- doc/source/whatsnew/v0.22.0.txt | 14 +++++++------- pandas/tests/frame/test_analytics.py | 6 ++++++ pandas/tests/groupby/test_categorical.py | 5 +++++ pandas/tests/test_resample.py | 5 +++++ 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index c3bde3073e035..2d9d9d8846795 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -1,4 +1,4 @@ -.. 
_whatsnew_0220: + v0.22.0 ------- @@ -25,7 +25,7 @@ summary is that Some background: In pandas 0.21, we fixed a long-standing inconsistency in the return value of all-*NA* series depending on whether or not bottleneck was installed. See :ref:`whatsnew_0210.api_breaking.bottleneck`_. At the same -time, we changed the sum and prod of an empty Series to also be ``NaN``. +time, we changed the sum and prod of an empty ``Series`` to also be ``NaN``. Based on feedback, we've partially reverted those changes. @@ -52,13 +52,13 @@ The default sum for all-*NA* and empty series is now ``0``. pd.Series([np.nan]).sum() The default behavior is the same as pandas 0.20.3 with bottleneck installed. It -also matches the behavior of ``np.nansum`` on empty and all-*NA* arrays. +also matches the behavior of NumPy's ``np.nansum`` on empty and all-*NA* arrays. To have the sum of an empty series return ``NaN``, use the ``min_count`` keyword. Thanks to the ``skipna`` parameter, the ``.sum`` on an all-*NA* -series is conceptually the same as on an empty. The ``min_count`` parameter -refers to the minimum number of *valid* values required for a non-NA sum -or product. +series is conceptually the same as the ``.sum`` of an empty one with +``skipna=True`` (the default). The ``min_count`` parameter refers to the +minimum number of *non-null* values required for a non-NA sum or product. .. ipython:: python @@ -186,7 +186,7 @@ Rolling and Expanding ^^^^^^^^^^^^^^^^^^^^^ Rolling and expanding already have a ``min_periods`` keyword that behaves -similarly to ``min_count``. The only case that changes is when doing a rolling +similarly to ``min_count``. The only case that changes is when doing a rolling or expanding sum on an all-*NA* series with ``min_periods=0``. Previously this returned ``NaN``, now it will return ``0``. 
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 42b91872ea497..bb416b6918fad 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -983,11 +983,16 @@ def test_sum_prod_nanops(self, method, unit): df = pd.DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]}) + # The default + result = getattr(df, method) + expected = pd.Series([unit, unit, unit], index=idx, dtype='float64') + # min_count=1 result = getattr(df, method)(min_count=1) expected = pd.Series([unit, unit, np.nan], index=idx) tm.assert_series_equal(result, expected) + # min_count=0 result = getattr(df, method)(min_count=0) expected = pd.Series([unit, unit, unit], index=idx, dtype='float64') tm.assert_series_equal(result, expected) @@ -996,6 +1001,7 @@ def test_sum_prod_nanops(self, method, unit): expected = pd.Series([unit, np.nan, np.nan], index=idx) tm.assert_series_equal(result, expected) + # min_count > 1 df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) result = getattr(df, method)(min_count=5) expected = pd.Series(result, index=['A', 'B']) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 1713b2d3015ad..d4f35aa8755d1 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -685,6 +685,11 @@ def test_empty_sum(self): expected = pd.Series([3, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) + # min_count>1 + result = df.groupby("A").B.sum(min_count=2) + expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) + def test_empty_prod(self): # https://github.com/pandas-dev/pandas/issues/18678 df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 601412a87d80c..e9a517605020a 100644 --- a/pandas/tests/test_resample.py +++ 
b/pandas/tests/test_resample.py @@ -3505,3 +3505,8 @@ def test_upsample_sum(self, method, unit): result = methodcaller(method, min_count=1)(resampled) expected = pd.Series([1, np.nan, 1], index=index) tm.assert_series_equal(result, expected) + + # min_count>1 + result = methodcaller(method, min_count=2)(resampled) + expected = pd.Series([np.nan, np.nan, np.nan], index=index) + tm.assert_series_equal(result, expected) From d6a6c228e8210cc9af369cf49a55d1777ec7279f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Dec 2017 15:34:06 -0600 Subject: [PATCH 10/14] linting --- pandas/tests/frame/test_analytics.py | 4 ++-- pandas/tests/groupby/test_timegrouper.py | 14 +++++++------- pandas/util/testing.py | 1 + 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index bb416b6918fad..69f1aeddc43e9 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1015,8 +1015,8 @@ def test_sum_nanops_timedelta(self): # prod isn't defined on timedeltas idx = ['a', 'b', 'c'] df = pd.DataFrame({"a": [0, 0], - "b": [0, np.nan], - "c": [np.nan, np.nan]}) + "b": [0, np.nan], + "c": [np.nan, np.nan]}) df2 = df.apply(pd.to_timedelta) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a014ecb7e91c1..d359bfa5351a9 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -261,17 +261,17 @@ def test_timegrouper_with_reg_groups_freq(self, freq): expected = ( df.groupby('user_id')['whole_cost'] - .resample(freq) - .sum(min_count=1) # XXX - .dropna() - .reorder_levels(['date', 'user_id']) - .sort_index() - .astype('int64') + .resample(freq) + .sum(min_count=1) # XXX + .dropna() + .reorder_levels(['date', 'user_id']) + .sort_index() + .astype('int64') ) expected.name = 'whole_cost' result1 = df.sort_index().groupby([pd.Grouper(freq=freq), - 
'user_id'])['whole_cost'].sum() + 'user_id'])['whole_cost'].sum() assert_series_equal(result1, expected) result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[ diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a7df8e8a41034..8acf16536f1de 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2666,6 +2666,7 @@ def setTZ(tz): finally: setTZ(orig_tz) + def _make_skipna_wrapper(alternative, skipna_alternative=None): """Create a function for calling on an array. From fcd57f40757012cfae5683ef4df3b1a276bbf653 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Dec 2017 21:48:27 -0600 Subject: [PATCH 11/14] linting --- pandas/tests/test_panel.py | 2 +- pandas/tests/test_panel4d.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index c8b45daac11b2..181826448ee78 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -9,7 +9,6 @@ import numpy as np from pandas.core.dtypes.common import is_float_dtype -from pandas.core.dtypes.missing import remove_na_arraylike from pandas import (Series, DataFrame, Index, date_range, isna, notna, pivot, MultiIndex) from pandas.core.nanops import nanall, nanany @@ -155,6 +154,7 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True, skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative) + def wrapper(x): return alternative(np.asarray(x)) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index b70d1d1f44267..1af6e5c5f69d2 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -8,7 +8,6 @@ from pandas import Series, Index, isna, notna from pandas.core.dtypes.common import is_float_dtype -from pandas.core.dtypes.missing import remove_na_arraylike from pandas.core.panel import Panel from pandas.core.panel4d import Panel4D from pandas.tseries.offsets import BDay @@ -120,6 +119,7 @@ def _check_stat_op(self, name, alternative, 
obj=None, has_skipna=True, skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative) + def wrapper(x): return alternative(np.asarray(x)) From 05b44f9b840d313bd154d882f2ae4571ea7d6fcd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 28 Dec 2017 21:54:03 -0600 Subject: [PATCH 12/14] Added skips --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/tests/test_nanops.py | 1 + pandas/tests/test_panel.py | 1 + pandas/tests/test_panel4d.py | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 2d9d9d8846795..e80a0460d2da7 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -1,4 +1,4 @@ - +.. _whatsnew_0220: v0.22.0 ------- diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 20b7d9acbc3f2..df3c49a73d227 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -469,6 +469,7 @@ def test_nankurt(self): allow_str=False, allow_date=False, allow_tdelta=False) + @td.skip_if_no("numpy", min_version="1.10.0") def test_nanprod(self): self.check_funs(nanops.nanprod, np.prod, allow_str=False, allow_date=False, allow_tdelta=False, diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 181826448ee78..d772dba25868e 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -87,6 +87,7 @@ def test_sum(self): def test_mean(self): self._check_stat_op('mean', np.mean) + @td.skip_if_no("numpy", min_version="1.10.0") def test_prod(self): self._check_stat_op('prod', np.prod, skipna_alternative=np.nanprod) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 1af6e5c5f69d2..e429403bbc919 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -42,6 +42,7 @@ def test_sum(self): def test_mean(self): self._check_stat_op('mean', np.mean) + @td.skip_if_no("numpy", min_version="1.10.0") def test_prod(self): self._check_stat_op('prod', 
np.prod, skipna_alternative=np.nanprod) From cdf56924a442a0045f9135304bb6f53f62a416dc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 29 Dec 2017 06:05:43 -0600 Subject: [PATCH 13/14] Doc fixup --- doc/source/whatsnew/v0.22.0.txt | 35 ++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index e80a0460d2da7..018500ef0e0ff 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -17,7 +17,7 @@ summary is that * The sum of an all-*NA* or empty ``Series`` is now ``0`` * The product of an all-*NA* or empty series is now ``1`` -* We've added a ``min_count`` parameter to ``.sum()`` and ``.prod()`` to control +* We've added a ``min_count`` parameter to ``.sum()`` and ``.prod()`` controlling the minimum number of valid values for the result to be valid. If fewer than ``min_count`` valid values are present, the result is NA. The default is ``0``. To return ``NaN``, the 0.21 behavior, use ``min_count=1``. @@ -54,19 +54,24 @@ The default sum for all-*NA* and empty series is now ``0``. The default behavior is the same as pandas 0.20.3 with bottleneck installed. It also matches the behavior of NumPy's ``np.nansum`` on empty and all-*NA* arrays. -To have the sum of an empty series return ``NaN``, use the ``min_count`` -keyword. Thanks to the ``skipna`` parameter, the ``.sum`` on an all-*NA* -series is conceptually the same as the ``.sum`` of an empty one with -``skipna=True`` (the default). The ``min_count`` parameter refers to the -minimum number of *non-null* values required for a non-NA sum or product. +To have the sum of an empty series return ``NaN`` (the default behavior of +pandas 0.20.3 without bottleneck, or pandas 0.21.x), use the ``min_count`` +keyword. .. 
ipython:: python pd.Series([]).sum(min_count=1) - pd.Series([np.nan]).sum(min_count=1) -Returning ``NaN`` was the default behavior for pandas 0.20.3 without bottleneck -installed. +Thanks to the ``skipna`` parameter, the ``.sum`` on an all-*NA* +series is conceptually the same as the ``.sum`` of an empty one with +``skipna=True`` (the default). + +.. ipython:: python + + pd.Series([np.nan]).sum(min_count=1) # skipna=True by default + +The ``min_count`` parameter refers to the minimum number of *non-null* values +required for a non-NA sum or product. :meth:`Series.prod` has been updated to behave the same as :meth:`Series.sum`, returning ``1`` instead. @@ -83,8 +88,9 @@ Finally, a few less obvious places in pandas are affected by this change. Grouping by a Categorical ^^^^^^^^^^^^^^^^^^^^^^^^^ -Grouping by a ``Categorical`` with some unobserved categories and computing the -``sum`` / ``prod`` will behave differently. +Grouping by a ``Categorical`` and summing now returns ``0`` instead of +``NaN`` for categories with no observations. The product now returns ``1`` +instead of ``NaN``. *pandas 0.21.x* @@ -105,7 +111,7 @@ Grouping by a ``Categorical`` with some unobserved categories and computing the grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) pd.Series([1, 2]).groupby(grouper).sum() -To restore the 0.21 behavior of returning ``NaN`` of unobserved groups, +To restore the 0.21 behavior of returning ``NaN`` for unobserved groups, use ``min_count>=1``. .. ipython:: python @@ -115,7 +121,8 @@ use ``min_count>=1``. Resample ^^^^^^^^ -The sum and product of all-*NA* bins will change: +The sum and product of all-*NA* bins have changed from ``NaN`` to ``0`` for +sum and ``1`` for product. *pandas 0.21.x* @@ -188,7 +195,7 @@ Rolling and Expanding Rolling and expanding already have a ``min_periods`` keyword that behaves similar to ``min_count``. The only case that changes is when doing a rolling or expanding sum on an all-*NA* series with ``min_periods=0``.
Previously this -returned ``NaN``, now it will return ``0``. +returned ``NaN``, now it returns ``0``. *pandas 0.21.1* From a97e133bbe3e8bbc74fa8ecbc6331c4c7bfc4b77 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 29 Dec 2017 06:24:22 -0600 Subject: [PATCH 14/14] DOC: More whatsnew --- doc/source/whatsnew/v0.22.0.txt | 60 ++++++++++++++++----------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 018500ef0e0ff..8617aa6c03e1f 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -15,16 +15,16 @@ Backwards incompatible API changes Pandas 0.22.0 changes the handling of empty and all-*NA* sums and products. The summary is that -* The sum of an all-*NA* or empty ``Series`` is now ``0`` -* The product of an all-*NA* or empty series is now ``1`` +* The sum of an empty or all-*NA* ``Series`` is now ``0`` +* The product of an empty or all-*NA* ``Series`` is now ``1`` * We've added a ``min_count`` parameter to ``.sum()`` and ``.prod()`` controlling the minimum number of valid values for the result to be valid. If fewer than - ``min_count`` valid values are present, the result is NA. The default is + ``min_count`` non-*NA* values are present, the result is *NA*. The default is ``0``. To return ``NaN``, the 0.21 behavior, use ``min_count=1``. Some background: In pandas 0.21, we fixed a long-standing inconsistency in the return value of all-*NA* series depending on whether or not bottleneck -was installed. See :ref:`whatsnew_0210.api_breaking.bottleneck`_. At the same +was installed. See :ref:`whatsnew_0210.api_breaking.bottleneck`. At the same time, we changed the sum and prod of an empty ``Series`` to also be ``NaN``. Based on feedback, we've partially reverted those changes. @@ -32,17 +32,17 @@ Based on feedback, we've partially reverted those changes. 
Arithmetic Operations ^^^^^^^^^^^^^^^^^^^^^ -The default sum for all-*NA* and empty series is now ``0``. +The default sum for empty or all-*NA* ``Series`` is now ``0``. *pandas 0.21.x* .. code-block:: ipython - In [3]: pd.Series([]).sum() - Out[3]: nan + In [1]: pd.Series([]).sum() + Out[1]: nan - In [4]: pd.Series([np.nan]).sum() - Out[4]: nan + In [2]: pd.Series([np.nan]).sum() + Out[2]: nan *pandas 0.22.0* @@ -96,10 +96,10 @@ instead of ``NaN``. .. code-block:: ipython - In [5]: grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) + In [8]: grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) - In [6]: pd.Series([1, 2]).groupby(grouper).sum() - Out[6]: + In [9]: pd.Series([1, 2]).groupby(grouper).sum() + Out[9]: a 3.0 b NaN dtype: float64 @@ -128,20 +128,18 @@ sum and ``1`` for product. .. code-block:: ipython - In [7]: s = pd.Series([1, 1, np.nan, np.nan], - ...: index=pd.date_range('2017', periods=4)) - ...: - - In [8]: s - Out[8]: + In [11]: s = pd.Series([1, 1, np.nan, np.nan], + ...: index=pd.date_range('2017', periods=4)) + ...: s + Out[11]: 2017-01-01 1.0 2017-01-02 1.0 2017-01-03 NaN 2017-01-04 NaN Freq: D, dtype: float64 - In [9]: s.resample('2d').sum() - Out[9]: + In [12]: s.resample('2d').sum() + Out[12]: 2017-01-01 2.0 2017-01-03 NaN Freq: 2D, dtype: float64 @@ -161,16 +159,17 @@ To restore the 0.21 behavior of returning ``NaN``, use ``min_count>=1``. s.resample('2d').sum(min_count=1) In particular, upsampling and taking the sum or product is affected, as -upsampling introduces all-*NA* original series was entirely valid. +upsampling introduces missing values even if the original series was +entirely valid. *pandas 0.21.x* .. 
code-block:: ipython - In [10]: idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) + In [14]: idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) - In [10]: pd.Series([1, 2], index=idx).resample('12H').sum() - Out[10]: + In [15]: pd.Series([1, 2], index=idx).resample('12H').sum() + Out[15]: 2017-01-01 00:00:00 1.0 2017-01-01 12:00:00 NaN 2017-01-02 00:00:00 2.0 @@ -194,17 +193,18 @@ Rolling and Expanding Rolling and expanding already have a ``min_periods`` keyword that behaves similar to ``min_count``. The only case that changes is when doing a rolling -or expanding sum on an all-*NA* series with ``min_periods=0``. Previously this -returned ``NaN``, now it returns ``0``. +or expanding sum with ``min_periods=0``. Previously this returned ``NaN``, +when fewer than ``min_periods`` non-*NA* values were in the window. Now it +returns ``0``. *pandas 0.21.1* -.. ipython:: python +.. code-block:: ipython - In [11]: s = pd.Series([np.nan, np.nan]) + In [17]: s = pd.Series([np.nan, np.nan]) - In [12]: s.rolling(2, min_periods=0).sum() - Out[12]: + In [18]: s.rolling(2, min_periods=0).sum() + Out[18]: 0 NaN 1 NaN dtype: float64