diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 0579a80aad28e..b8bcdd8ec71d2 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -8,6 +8,103 @@ deprecations, new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. +.. _whatsnew_0220.na_sum: + +Pandas 0.22.0 changes the handling of empty and all-NA sums and products. The +summary is that + +* The sum of an all-NA or empty series is now 0 +* The product of an all-NA or empty series is now 1 +* We've added an ``empty_is_na`` keyword to the ``sum`` and ``prod`` methods + to control whether the sum or product of an empty series should be NA. The + default is ``False``. To restore the 0.21 behavior, use + ``empty_is_na=True``. + +Some background: In pandas 0.21.0, we fixed a long-standing inconsistency +in the return value of all-NA series depending on whether or not bottleneck +was installed. See :ref:`whatsnew_0210.api_breaking.bottleneck`. At the same +time, we changed the sum and prod of an empty Series to also be ``NaN``. + +Based on feedback, we've partially reverted those changes. The default sum +for all-NA and empty series is now 0 (1 for ``prod``). You can achieve the +pandas 0.21.0 behavior, returning ``NaN``, with the ``empty_is_na`` keyword. + +*pandas 0.21* + +.. code-block:: ipython + + In [1]: import pandas as pd + + In [2]: import numpy as np + + In [3]: pd.Series([]).sum() + Out[3]: nan + + In [4]: pd.Series([np.nan]).sum() + Out[4]: nan + +*pandas 0.22.0* + +.. ipython:: python + + pd.Series([]).sum() + pd.Series([np.nan]).sum() + +To have the sum of an empty series return ``NaN``, use the ``empty_is_na`` +keyword. Thanks to the ``skipna`` parameter, the ``.sum`` on an all-NA +series is conceptually the same as on an empty. The ``empty_is_na`` parameter +controls the return value after removing NAs. + +.. 
ipython:: python + + pd.Series([]).sum(empty_is_na=True) + pd.Series([np.nan]).sum(empty_is_na=True) + +Note that this affects some other places in the library: + +1. Grouping by a Categorical with some unobserved categories + +*pandas 0.21* + +.. code-block:: ipython + + In [3]: grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) + + In [4]: pd.Series([1, 2]).groupby(grouper).sum() + Out[4]: + a 3.0 + b NaN + dtype: float64 + +*pandas 0.22* + +.. ipython:: python + + grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) + pd.Series([1, 2]).groupby(grouper).sum() + +2. Upsampling + +*pandas 0.21.0* + +.. code-block:: ipython + + In [5]: idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) + + In [6]: pd.Series([1, 2], index=idx).resample('12H').sum() + Out[6]: + 2017-01-01 00:00:00 1.0 + 2017-01-01 12:00:00 NaN + 2017-01-02 00:00:00 2.0 + Freq: 12H, dtype: float64 + +*pandas 0.22.0* + +.. ipython:: python + + idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) + pd.Series([1, 2], index=idx).resample("12H").sum() + .. 
_whatsnew_0220.enhancements: New features diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index d38b677df321c..19dec86fd23fa 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -89,7 +89,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = NAN + out[i, j] = 0 else: out[i, j] = sumx[i, j] @@ -148,7 +148,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = NAN + out[i, j] = 1 else: out[i, j] = prodx[i, j] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4eb7865523cc3..aaf35e502c315 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7310,7 +7310,8 @@ def _add_numeric_operations(cls): @Substitution(outname='mad', desc="Return the mean absolute deviation of the values " "for the requested axis", - name1=name, name2=name2, axis_descr=axis_descr) + name1=name, name2=name2, axis_descr=axis_descr, + empty_is_na='') @Appender(_num_doc) def mad(self, axis=None, skipna=None, level=None): if skipna is None: @@ -7351,7 +7352,7 @@ def mad(self, axis=None, skipna=None, level=None): @Substitution(outname='compounded', desc="Return the compound percentage of the values for " "the requested axis", name1=name, name2=name2, - axis_descr=axis_descr) + axis_descr=axis_descr, empty_is_na='') @Appender(_num_doc) def compound(self, axis=None, skipna=None, level=None): if skipna is None: @@ -7375,10 +7376,11 @@ def compound(self, axis=None, skipna=None, level=None): lambda y, axis: np.maximum.accumulate(y, axis), "max", -np.inf, np.nan) - cls.sum = _make_stat_function( + cls.sum = _make_empty_stat_function( cls, 'sum', name, name2, axis_descr, 'Return the sum of the values for the requested axis', - nanops.nansum) + nanops.nansum, + empty_is_na=False) cls.mean = _make_stat_function( cls, 'mean', name, 
name2, axis_descr, 'Return the mean of the values for the requested axis', @@ -7394,10 +7396,11 @@ def compound(self, axis=None, skipna=None, level=None): "by N-1\n", nanops.nankurt) cls.kurtosis = cls.kurt - cls.prod = _make_stat_function( + cls.prod = _make_empty_stat_function( cls, 'prod', name, name2, axis_descr, 'Return the product of the values for the requested axis', - nanops.nanprod) + nanops.nanprod, + empty_is_na=False) cls.product = cls.prod cls.median = _make_stat_function( cls, 'median', name, name2, axis_descr, @@ -7520,14 +7523,14 @@ def _doc_parms(cls): ---------- axis : %(axis_descr)s skipna : boolean, default True - Exclude NA/null values. If an entire row/column is NA or empty, the result - will be NA + Exclude NA/null values before computing the result. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a %(name1)s numeric_only : boolean, default None Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. Not implemented for Series. + everything, then use only numeric data. Not implemented for + Series.%(empty_is_na)s Returns ------- @@ -7584,7 +7587,7 @@ def _doc_parms(cls): axis : %(axis_descr)s skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result - will be NA + will be NA. Returns ------- @@ -7598,16 +7601,45 @@ def _doc_parms(cls): """ +_empty_is_na_doc = """ +empty_is_na : bool, default False + The result of operating on an empty array should be NA. The default + behavior is for the sum of an empty array to be 0, and the product + of an empty array to be 1. + + When ``skipna=True``, "empty" refers to whether or not the array + is empty after removing NAs. So operating on an all-NA array with + ``skipna=True`` will be NA when ``empty_is_na`` is True. 
+ """ + + +def _make_empty_stat_function(cls, name, name1, name2, axis_descr, desc, f, + empty_is_na=False): + @Substitution(outname=name, desc=desc, name1=name1, name2=name2, + axis_descr=axis_descr, empty_is_na=_empty_is_na_doc) + @Appender(_num_doc) + def stat_func(self, axis=None, skipna=True, level=None, numeric_only=None, + empty_is_na=empty_is_na, **kwargs): + nv.validate_stat_func(tuple(), kwargs, fname=name) + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level(name, axis=axis, level=level, + skipna=skipna, empty_is_na=empty_is_na) + return self._reduce(f, name, axis=axis, skipna=skipna, + numeric_only=numeric_only, + empty_is_na=empty_is_na) + + return set_function_name(stat_func, name, cls) + def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f): @Substitution(outname=name, desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr) + axis_descr=axis_descr, empty_is_na='') @Appender(_num_doc) - def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, + def stat_func(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs): nv.validate_stat_func(tuple(), kwargs, fname=name) - if skipna is None: - skipna = True if axis is None: axis = self._stat_axis_number if level is not None: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e1c09947ac0b4..2302561d303e8 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -107,7 +107,8 @@ def f(values, axis=None, skipna=True, **kwds): if k not in kwds: kwds[k] = v try: - if values.size == 0: + # TODO: NaT + if values.size == 0 and kwds.get('empty_is_na'): # we either return np.nan or pd.NaT if is_numeric_dtype(values): @@ -155,6 +156,7 @@ def _bn_ok_dtype(dt, name): # Bottleneck chokes on datetime64 if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)): + # TODO: handle this overflow # GH 15507 # bottleneck does not properly upcast during the sum # so can overflow @@ -163,6 
+165,9 @@ def _bn_ok_dtype(dt, name): # further we also want to preserve NaN when all elements # are NaN, unlinke bottleneck/numpy which consider this # to be 0 + + # https://github.com/kwgoodman/bottleneck/issues/180 + # No upcast for boolean -> int if name in ['nansum', 'nanprod']: return False @@ -303,8 +308,8 @@ def nanall(values, axis=None, skipna=True): @disallow('M8') -@bottleneck_switch() -def nansum(values, axis=None, skipna=True): +@bottleneck_switch(empty_is_na=False) +def nansum(values, axis=None, skipna=True, empty_is_na=False): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) dtype_sum = dtype_max if is_float_dtype(dtype): @@ -312,13 +317,12 @@ def nansum(values, axis=None, skipna=True): elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 the_sum = values.sum(axis, dtype=dtype_sum) - the_sum = _maybe_null_out(the_sum, axis, mask) + the_sum = _maybe_null_out(the_sum, axis, mask, empty_is_na) return _wrap_results(the_sum, dtype) @disallow('M8') -@bottleneck_switch() def nanmean(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) @@ -641,13 +645,15 @@ def nankurt(values, axis=None, skipna=True): @disallow('M8', 'm8') -def nanprod(values, axis=None, skipna=True): +@bottleneck_switch(empty_is_na=False) +def nanprod(values, axis=None, skipna=True, empty_is_na=False): mask = isna(values) if skipna and not is_any_int_dtype(values): values = values.copy() values[mask] = 1 result = values.prod(axis) - return _maybe_null_out(result, axis, mask) + + return _maybe_null_out(result, axis, mask, empty_is_na, unit=1.0) def _maybe_arg_null_out(result, axis, mask, skipna): @@ -683,9 +689,13 @@ def _get_counts(mask, axis, dtype=float): return np.array(count, dtype=dtype) -def _maybe_null_out(result, axis, mask): +def _maybe_null_out(result, axis, mask, empty_is_na=True, unit=0.0): if axis is not None and getattr(result, 'ndim', False): null_mask = (mask.shape[axis] - mask.sum(axis)) == 0 + + if not 
empty_is_na: + null_mask[result == unit] = False + if np.any(null_mask): if is_numeric_dtype(result): if np.iscomplexobj(result): @@ -698,7 +708,7 @@ def _maybe_null_out(result, axis, mask): result[null_mask] = None elif result is not tslib.NaT: null_mask = mask.size - mask.sum() - if null_mask == 0: + if null_mask == 0.0 and empty_is_na: result = np.nan return result diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 17d711f937bf7..a6b5137d49eea 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -478,10 +478,11 @@ def test_nunique(self): Series({0: 1, 1: 3, 2: 2})) def test_sum(self): - self._check_stat_op('sum', np.sum, has_numeric_only=True) + self._check_stat_op('sum', np.nansum, has_numeric_only=True, + no_skipna_alternative=np.sum) # mixed types (with upcasting happening) - self._check_stat_op('sum', np.sum, + self._check_stat_op('sum', np.nansum, frame=self.mixed_float.astype('float32'), has_numeric_only=True, check_dtype=False, check_less_precise=True) @@ -753,7 +754,8 @@ def alt(x): def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, has_numeric_only=False, check_dtype=True, - check_dates=False, check_less_precise=False): + check_dates=False, check_less_precise=False, + no_skipna_alternative=None): if frame is None: frame = self.frame # set some NAs @@ -774,14 +776,20 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, assert len(result) if has_skipna: - def skipna_wrapper(x): - nona = x.dropna() - if len(nona) == 0: - return np.nan - return alternative(nona) + alt = no_skipna_alternative or alternative # e.g. 
sum / nansum + + if no_skipna_alternative: + def skipna_wrapper(x): + return alternative(x.values) + else: + def skipna_wrapper(x): + nona = x.dropna() + if len(nona) == 0: + return np.nan + return alt(nona) def wrapper(x): - return alternative(x.values) + return alt(x.values) result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) @@ -793,7 +801,7 @@ def wrapper(x): check_dtype=False, check_less_precise=check_less_precise) else: - skipna_wrapper = alternative + skipna_wrapper =alternative wrapper = alternative result0 = f(axis=0) @@ -834,6 +842,12 @@ def wrapper(x): r0 = getattr(all_na, name)(axis=0) r1 = getattr(all_na, name)(axis=1) if name in ['sum', 'prod']: + tm.assert_numpy_array_equal(r0.values, np.zeros_like(r0)) + tm.assert_numpy_array_equal(r1.values, np.zeros_like(r1)) + + if name in ['sum', 'prod']: + r0 = getattr(all_na, name)(axis=0, skipna=False) + r1 = getattr(all_na, name)(axis=1, skipna=False) assert np.isnan(r0).all() assert np.isnan(r1).all() diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 3d27df31cee6e..7d354702b6eca 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -813,8 +813,6 @@ def test_cython_agg_empty_buckets(self): ops = [('mean', np.mean), ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), ('var', lambda x: np.var(x, ddof=1)), - ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan), - ('prod', np.prod), ('min', np.min), ('max', np.max), ] @@ -830,6 +828,23 @@ def test_cython_agg_empty_buckets(self): exc.args += ('operation: %s' % op,) raise + def test_cython_agg_empty_buckets_nanops(self): + # Bug in python agg func not being evaluated on empty buckets + df = pd.DataFrame([11, 12, 13], columns=['a']) + grps = range(0, 25, 5) + result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add') + intervals = pd.interval_range(0, 20, freq=5) + expected = pd.DataFrame( + {"a": [0, 0, 36, 0]}, + 
index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + tm.assert_frame_equal(result, expected) + + result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod') + expected = pd.DataFrame( + {"a": [1, 1, 1716, 1]}, + index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + tm.assert_frame_equal(result, expected) + def test_agg_over_numpy_arrays(self): # GH 3788 df = pd.DataFrame([[1, np.array([10, 20, 30])], @@ -925,3 +940,17 @@ def test_agg_structs_series(self, structure, expected): result = df.groupby('A')['C'].aggregate(structure) expected.index.name = 'A' assert_series_equal(result, expected) + + @pytest.mark.xfail(reason="agg functions not called on empty groups") + def test_agg_category_nansum(self): + categories = ['a', 'b', 'c'] + df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], + categories=categories), + 'B': [1, 2, 3]}) + result = df.groupby("A").B.agg(np.nansum) + expected = pd.Series([3, 3, 0], + index=pd.CategoricalIndex(['a', 'b', 'c'], + categories=categories, + name='A'), + name='B') + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c73423921898d..1c05860b7f5fd 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -37,7 +37,7 @@ def test_groupby(self): # single grouper gb = df.groupby("A") exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) - expected = DataFrame({'values': Series([3, 7, np.nan], index=exp_idx)}) + expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) @@ -662,3 +662,25 @@ def test_groupby_categorical_two_columns(self): "C3": [nan, nan, nan, nan, 10, 100, nan, nan, nan, nan, 200, 34]}, index=idx) tm.assert_frame_equal(res, exp) + + def test_sum_zero(self): + df = pd.DataFrame({"A": pd.Categorical(['a', 'b', 'a'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + result = 
df.groupby("A").B.sum() + expected = pd.Series([2, 2, 0], + index=pd.CategoricalIndex(['a', 'b', 'c'], + name='A'), + name='B') + tm.assert_series_equal(result, expected) + + def test_prod_one(self): + df = pd.DataFrame({"A": pd.Categorical(['a', 'b', 'a'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + result = df.groupby("A").B.prod() + expected = pd.Series([1, 2, 1], + index=pd.CategoricalIndex(['a', 'b', 'c'], + name='A'), + name='B') + tm.assert_series_equal(result, expected) \ No newline at end of file diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index cf4a6ec1c932a..a13d985ab6974 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2704,7 +2704,7 @@ def h(df, arg3): # Assert the results here index = pd.Index(['A', 'B', 'C'], name='group') - expected = pd.Series([-79.5160891089, -78.4839108911, None], + expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index c8503b16a0e16..dedde090f347c 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -41,12 +41,12 @@ def test_groupby_with_timegrouper(self): df = df.set_index(['Date']) expected = DataFrame( - {'Quantity': np.nan}, + {'Quantity': 0}, index=date_range('20130901 13:00:00', '20131205 13:00:00', freq='5D', name='Date', closed='left')) expected.iloc[[0, 6, 18], 0] = np.array( - [24., 6., 9.], dtype='float64') + [24, 6, 9], dtype='int64') result1 = df.resample('5D') .sum() assert_frame_equal(result1, expected) @@ -261,9 +261,10 @@ def test_timegrouper_with_reg_groups(self): for freq in ['D', 'M', 'A', 'Q-APR']: expected = df.groupby('user_id')[ 'whole_cost'].resample( - freq).sum().dropna().reorder_levels( + freq).sum().reorder_levels( ['date', 'user_id']).sort_index().astype('int64') expected.name = 'whole_cost' + 
expected = expected[expected > 0] result1 = df.sort_index().groupby([pd.Grouper(freq=freq), 'user_id'])['whole_cost'].sum() diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 203a0b4a54858..6777bebcc35a1 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -33,26 +33,30 @@ class TestSeriesAnalytics(TestData): @pytest.mark.parametrize("method", ["sum", "prod"]) def test_empty(self, method, use_bottleneck): + if method == "sum": + unit = 0 + else: + unit = 1 with pd.option_context("use_bottleneck", use_bottleneck): - # GH 9422 - # treat all missing as NaN + # GH 9422 / 18678 + # treat all missing as 0 s = Series([]) result = getattr(s, method)() - assert isna(result) + assert result == unit result = getattr(s, method)(skipna=True) - assert isna(result) + assert result == unit s = Series([np.nan]) result = getattr(s, method)() - assert isna(result) + assert result == unit result = getattr(s, method)(skipna=True) - assert isna(result) + assert result == unit s = Series([np.nan, 1]) result = getattr(s, method)() - assert result == 1.0 + assert result == 1 s = Series([np.nan, 1]) result = getattr(s, method)(skipna=True) @@ -60,13 +64,15 @@ def test_empty(self, method, use_bottleneck): # GH #844 (changed in 9422) df = DataFrame(np.empty((10, 0))) - assert (df.sum(1).isnull()).all() + result = df.sum(1) + expected = pd.Series(0, index=df.index, dtype='float64') + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "method", ['sum', 'mean', 'median', 'std', 'var']) + "method", ['mean', 'median', 'std', 'var']) def test_ops_consistency_on_empty(self, method): - # GH 7869 + # GH 7869 / 18678 # consistency on empty # float @@ -77,6 +83,19 @@ def test_ops_consistency_on_empty(self, method): result = getattr(Series(dtype='m8[ns]'), method)() assert result is pd.NaT + @pytest.mark.parametrize('method, unit', [ + ('sum', 0), + ('prod', 1), + ]) + def 
test_ops_consistency_on_empty_sum_prod(self, method, unit): + # GH 18678 + result = getattr(Series(dtype=float), method)() + assert result == unit + + if method == 'sum': + result = getattr(Series(dtype='m8[ns]'), method)() + assert result == pd.Timedelta(0) + def test_nansum_buglet(self): s = Series([1.0, np.nan], index=[0, 1]) result = np.nansum(s) @@ -111,7 +130,7 @@ def test_sum_overflow(self, use_bottleneck): assert np.allclose(float(result), v[-1]) def test_sum(self): - self._check_stat_op('sum', np.sum, check_allna=True) + self._check_stat_op('sum', np.nansum, check_allna=False) def test_sum_inf(self): s = Series(np.random.randn(10)) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index 14a44c36c6a0c..3c93ff1d3f31e 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -38,7 +38,7 @@ def test_quantile(self): # GH7661 result = Series([np.timedelta64('NaT')]).sum() - assert result is pd.NaT + assert result == pd.Timedelta(0) msg = 'percentiles should all be in the interval \\[0, 1\\]' for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 9305504f8d5e3..2f084cabd261d 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -181,12 +181,16 @@ def _coerce_tds(targ, res): check_dtype=check_dtype) def check_fun_data(self, testfunc, targfunc, testarval, targarval, - targarnanval, check_dtype=True, **kwargs): + targarnanval, check_dtype=True, empty_targfunc=None, + **kwargs): for axis in list(range(targarval.ndim)) + [None]: for skipna in [False, True]: targartempval = targarval if skipna else targarnanval - try: + if skipna and empty_targfunc and pd.isna(targartempval).all(): + targ = empty_targfunc(targartempval, axis=axis, **kwargs) + else: targ = targfunc(targartempval, axis=axis, **kwargs) + try: res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) self.check_results(targ, res, axis, 
@@ -218,7 +222,9 @@ def check_fun_data(self, testfunc, targfunc, testarval, targarval, except ValueError: return self.check_fun_data(testfunc, targfunc, testarval2, targarval2, - targarnanval2, check_dtype=check_dtype, **kwargs) + targarnanval2, check_dtype=check_dtype, + empty_targfunc=empty_targfunc, + **kwargs) def check_fun(self, testfunc, targfunc, testar, targar=None, targarnan=None, **kwargs): @@ -328,7 +334,8 @@ def test_nanall(self): def test_nansum(self): self.check_funs(nanops.nansum, np.sum, allow_str=False, - allow_date=False, allow_tdelta=True, check_dtype=False) + allow_date=False, allow_tdelta=True, check_dtype=False, + empty_targfunc=np.nansum) def test_nanmean(self): self.check_funs(nanops.nanmean, np.mean, allow_complex=False, @@ -462,7 +469,8 @@ def test_nankurt(self): def test_nanprod(self): self.check_funs(nanops.nanprod, np.prod, allow_str=False, - allow_date=False, allow_tdelta=False) + allow_date=False, allow_tdelta=False, + empty_targfunc=np.nanprod) def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs): res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs) @@ -992,6 +1000,50 @@ def test_nans_skipna(self): def prng(self): return np.random.RandomState(1234) + def test_empty_sum(self): + ser = Series(dtype=np.float64) + result = ser.sum() + assert result == 0.0 + + result = ser.sum(empty_is_na=True) + assert pd.isna(result) + + def test_empty_prod(self): + ser = Series(dtype=np.float64) + result = ser.prod() + assert result == 1.0 + + result = ser.prod(empty_is_na=True) + assert pd.isna(result) + + def test_bool_sum(self): + ser = Series([True, True, False]) + result = ser.sum() + assert result == 2 + + @pytest.mark.parametrize('skipna, series, empty_is_na, expected', [ + (True, pd.Series([]), False, 0), + (True, pd.Series([]), True, np.nan), + (True, pd.Series([np.nan]), False, 0), + (True, pd.Series([np.nan]), True, np.nan), + (False, pd.Series([]), False, 0), + (False, pd.Series([]), True, np.nan), + (False, 
pd.Series([np.nan]), False, np.nan), + (False, pd.Series([np.nan]), True, np.nan), + + ]) + def test_sum_table(self, skipna, series, empty_is_na, expected): + # https://github.com/pandas-dev/pandas/issues/18678 + # #issuecomment-351437890 + the_sum = series.sum(skipna=skipna, empty_is_na=empty_is_na) + the_prod = series.prod(skipna=skipna, empty_is_na=empty_is_na) + if np.isnan(expected): + assert np.isnan(the_sum) + assert np.isnan(the_prod) + else: + assert the_sum == 0.0 + assert the_prod == 1.0 + def test_use_bottleneck(): @@ -1003,4 +1055,4 @@ def test_use_bottleneck(): pd.set_option('use_bottleneck', False) assert not pd.get_option('use_bottleneck') - pd.set_option('use_bottleneck', use_bn) + pd.set_option('use_bottleneck', use_bn) \ No newline at end of file diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index f00fa07d868a1..e9b030fe7eb31 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3385,7 +3385,13 @@ def test_aggregate_with_nat(self): for func in ['min', 'max', 'sum', 'prod']: normal_result = getattr(normal_grouped, func)() dt_result = getattr(dt_grouped, func)() - pad = DataFrame([[np.nan, np.nan, np.nan, np.nan]], index=[3], + if func == 'sum': + fill_value = 0 + elif func == 'prod': + fill_value = 1 + else: + fill_value = np.nan + pad = DataFrame([[fill_value] * 4], index=[3], columns=['A', 'B', 'C', 'D']) expected = normal_result.append(pad) expected = expected.sort_index()