Skip to content

Commit e390d1c

Browse files
committed
API: Change the sum of all-NA / all-Empty sum / prod
1 parent 7371c80 commit e390d1c

16 files changed

+306
-99
lines changed

doc/source/whatsnew/v0.22.0.txt

+102-4
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,110 @@
33
v0.22.0
44
-------
55

6-
This is a major release from 0.21.1 and includes a number of API changes,
7-
deprecations, new features, enhancements, and performance improvements along
8-
with a large number of bug fixes. We recommend that all users upgrade to this
9-
version.
6+
This is a major release from 0.21.1 and includes a single, API-breaking change.
7+
We recommend that all users upgrade to this version after carefully reading the
8+
release note (singular!).
109

1110
.. _whatsnew_0220.api_breaking:
1211

1312
Backwards incompatible API changes
1413
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
14+
15+
Pandas 0.22.0 changes the handling of empty and all-NA sums and products. The
16+
summary is that
17+
18+
* The sum of an all-NA or empty series is now 0
19+
* The product of an all-NA or empty series is now 1
20+
* We've added a ``min_count`` parameter to ``.sum`` and ``.prod`` to control
21+
the minimum number of valid values for the result to be valid. If fewer than
22+
``min_count`` valid values are present, the result is NA. The default is
23+
``0``. To restore the 0.21 behavior, use ``min_count=1``.
24+
25+
Some background: In pandas 0.21.1, we fixed a long-standing inconsistency
26+
in the return value of all-NA series depending on whether or not bottleneck
27+
was installed. See :ref:`whatsnew_0210.api_breaking.bottleneck`. At the same
28+
time, we changed the sum and prod of an empty Series to also be ``NaN``.
29+
30+
Based on feedback, we've partially reverted those changes. The default sum
31+
for all-NA and empty series is now 0 (1 for ``prod``).
32+
33+
*pandas 0.21*
34+
35+
.. code-block:: ipython
36+
37+
In [1]: import pandas as pd
38+
39+
In [2]: import numpy as np
40+
41+
In [3]: pd.Series([]).sum()
42+
Out[3]: nan
43+
44+
In [4]: pd.Series([np.nan]).sum()
45+
Out[4]: nan
46+
47+
*pandas 0.22.0*
48+
49+
.. ipython:: python
50+
51+
pd.Series([]).sum()
52+
pd.Series([np.nan]).sum()
53+
54+
To have the sum of an empty series return ``NaN``, use the ``min_count``
55+
keyword. Thanks to the ``skipna`` parameter, the ``.sum`` on an all-NA
56+
series is conceptually the same as on an empty one. The ``min_count`` parameter
57+
refers to the minimum number of *valid* values required for a non-NA sum
58+
or product.
59+
60+
.. ipython:: python
61+
62+
pd.Series([]).sum(min_count=1)
63+
pd.Series([np.nan]).sum(min_count=1)
64+
65+
Note that this affects some other places in the library:
66+
67+
1. Grouping by a Categorical with some unobserved categories
68+
69+
*pandas 0.21*
70+
71+
.. code-block:: ipython
72+
73+
In [3]: grouper = pd.Categorical(['a', 'a'], categories=['a', 'b'])
74+
75+
In [4]: pd.Series([1, 2]).groupby(grouper).sum()
76+
Out[4]:
77+
a 3.0
78+
b NaN
79+
dtype: float64
80+
81+
*pandas 0.22.0*
82+
83+
.. ipython:: python
84+
85+
grouper = pd.Categorical(['a', 'a'], categories=['a', 'b'])
86+
pd.Series([1, 2]).groupby(grouper).sum()
87+
88+
pd.Series([1, 2]).groupby(grouper).sum(min_count=1)
89+
90+
2. Upsampling
91+
92+
*pandas 0.21.0*
93+
94+
.. code-block:: ipython
95+
96+
In [5]: idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02'])
97+
98+
In [6]: pd.Series([1, 2], index=idx).resample('12H').sum()
99+
Out[6]:
100+
2017-01-01 00:00:00 1.0
101+
2017-01-01 12:00:00 NaN
102+
2017-01-02 00:00:00 2.0
103+
Freq: 12H, dtype: float64
104+
105+
*pandas 0.22.0*
106+
107+
.. ipython:: python
108+
109+
idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02'])
110+
pd.Series([1, 2], index=idx).resample("12H").sum()
111+
112+
pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1)

pandas/_libs/groupby_helper.pxi.in

+2-2
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
3737
ndarray[int64_t] counts,
3838
ndarray[{{c_type}}, ndim=2] values,
3939
ndarray[int64_t] labels,
40-
Py_ssize_t min_count=1):
40+
Py_ssize_t min_count=0):
4141
"""
4242
Only aggregates on axis=0
4343
"""
@@ -101,7 +101,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
101101
ndarray[int64_t] counts,
102102
ndarray[{{c_type}}, ndim=2] values,
103103
ndarray[int64_t] labels,
104-
Py_ssize_t min_count=1):
104+
Py_ssize_t min_count=0):
105105
"""
106106
Only aggregates on axis=0
107107
"""

pandas/_libs/window.pyx

+11
Original file line numberDiff line numberDiff line change
@@ -443,10 +443,17 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp,
443443
double val, prev_x, sum_x = 0
444444
int64_t s, e
445445
int64_t nobs = 0, i, j, N
446+
int64_t minp2 = -1
446447
bint is_variable
447448
ndarray[int64_t] start, end
448449
ndarray[double_t] output
449450

451+
if minp == 0:
452+
# in get_window_indexer, we ensure that minp >= 1. That's fine for
453+
# all cases except nobs = 0 (all missing values) and minp=0. For
454+
# any other minp, the sum will be NA. For minp=0, the sum will be 0.
455+
# So we track that here and pass it later if needed.
456+
minp2 = 0
450457
start, end, N, win, minp, is_variable = get_window_indexer(input, win,
451458
minp, index,
452459
closed)
@@ -483,6 +490,8 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp,
483490
for j in range(end[i - 1], e):
484491
add_sum(input[j], &nobs, &sum_x)
485492

493+
if minp2 == 0:
494+
minp = 0
486495
output[i] = calc_sum(minp, nobs, sum_x)
487496

488497
else:
@@ -503,6 +512,8 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp,
503512
prev_x = input[i - win]
504513
remove_sum(prev_x, &nobs, &sum_x)
505514

515+
if minp2 == 0:
516+
minp = 0
506517
output[i] = calc_sum(minp, nobs, sum_x)
507518

508519
return output

pandas/core/generic.py

+17-17
Original file line numberDiff line numberDiff line change
@@ -7619,48 +7619,48 @@ def _doc_parms(cls):
76197619
_sum_examples = """\
76207620
Examples
76217621
--------
7622-
By default, the sum of an empty series is ``NaN``.
7622+
By default, the sum of an empty series is ``0``.
76237623
7624-
>>> pd.Series([]).sum() # min_count=1 is the default
7625-
nan
7624+
>>> pd.Series([]).sum() # min_count=0 is the default
7625+
0.0
76267626
76277627
This can be controlled with the ``min_count`` parameter. For example, if
7628-
you'd like the sum of an empty series to be 0, pass ``min_count=0``.
7628+
you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
76297629
7630-
>>> pd.Series([]).sum(min_count=0)
7631-
0.0
7630+
>>> pd.Series([]).sum(min_count=1)
7631+
nan
76327632
76337633
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
76347634
empty series identically.
76357635
76367636
>>> pd.Series([np.nan]).sum()
7637-
nan
7638-
7639-
>>> pd.Series([np.nan]).sum(min_count=0)
76407637
0.0
7638+
7639+
>>> pd.Series([np.nan]).sum(min_count=1)
7640+
nan
76417641
"""
76427642

76437643
_prod_examples = """\
76447644
Examples
76457645
--------
7646-
By default, the product of an empty series is ``NaN``
7646+
By default, the product of an empty series is ``1``
76477647
76487648
>>> pd.Series([]).prod()
7649-
nan
7649+
1.0
76507650
76517651
This can be controlled with the ``min_count`` parameter
76527652
7653-
>>> pd.Series([]).prod(min_count=0)
7654-
1.0
7653+
>>> pd.Series([]).prod(min_count=1)
7654+
nan
76557655
76567656
Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
76577657
empty series identically.
76587658
76597659
>>> pd.Series([np.nan]).prod()
7660-
nan
7661-
7662-
>>> pd.Series([np.nan]).sum(min_count=0)
76637660
1.0
7661+
7662+
>>> pd.Series([np.nan]).prod(min_count=1)
7663+
nan
76647664
"""
76657665

76667666

@@ -7683,7 +7683,7 @@ def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc,
76837683
examples=examples)
76847684
@Appender(_num_doc)
76857685
def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
7686-
min_count=1,
7686+
min_count=0,
76877687
**kwargs):
76887688
nv.validate_stat_func(tuple(), kwargs, fname=name)
76897689
if skipna is None:

pandas/core/nanops.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,7 @@ def nanall(values, axis=None, skipna=True):
308308

309309
@disallow('M8')
310310
@bottleneck_switch()
311-
def nansum(values, axis=None, skipna=True, min_count=1):
311+
def nansum(values, axis=None, skipna=True, min_count=0):
312312
values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
313313
dtype_sum = dtype_max
314314
if is_float_dtype(dtype):
@@ -645,7 +645,7 @@ def nankurt(values, axis=None, skipna=True):
645645

646646

647647
@disallow('M8', 'm8')
648-
def nanprod(values, axis=None, skipna=True, min_count=1):
648+
def nanprod(values, axis=None, skipna=True, min_count=0):
649649
mask = isna(values)
650650
if skipna and not is_any_int_dtype(values):
651651
values = values.copy()

pandas/tests/frame/test_analytics.py

+19-10
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,8 @@ def test_nunique(self):
478478
Series({0: 1, 1: 3, 2: 2}))
479479

480480
def test_sum(self):
481-
self._check_stat_op('sum', np.sum, has_numeric_only=True)
481+
self._check_stat_op('sum', np.sum, has_numeric_only=True,
482+
skipna_alternative=np.nansum)
482483

483484
# mixed types (with upcasting happening)
484485
self._check_stat_op('sum', np.sum,
@@ -753,7 +754,8 @@ def alt(x):
753754

754755
def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
755756
has_numeric_only=False, check_dtype=True,
756-
check_dates=False, check_less_precise=False):
757+
check_dates=False, check_less_precise=False,
758+
skipna_alternative=None):
757759
if frame is None:
758760
frame = self.frame
759761
# set some NAs
@@ -774,15 +776,19 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
774776
assert len(result)
775777

776778
if has_skipna:
777-
def skipna_wrapper(x):
778-
nona = x.dropna()
779-
if len(nona) == 0:
780-
return np.nan
781-
return alternative(nona)
782-
783779
def wrapper(x):
784780
return alternative(x.values)
785781

782+
if skipna_alternative:
783+
def skipna_wrapper(x):
784+
return skipna_alternative(x.values)
785+
else:
786+
def skipna_wrapper(x):
787+
nona = x.dropna()
788+
if len(nona) == 0:
789+
return np.nan
790+
return alternative(nona)
791+
786792
result0 = f(axis=0, skipna=False)
787793
result1 = f(axis=1, skipna=False)
788794
tm.assert_series_equal(result0, frame.apply(wrapper),
@@ -834,8 +840,11 @@ def wrapper(x):
834840
r0 = getattr(all_na, name)(axis=0)
835841
r1 = getattr(all_na, name)(axis=1)
836842
if name in ['sum', 'prod']:
837-
assert np.isnan(r0).all()
838-
assert np.isnan(r1).all()
843+
unit = int(name == 'prod')
844+
expected = pd.Series(unit, index=r0.index, dtype=r0.dtype)
845+
tm.assert_series_equal(r0, expected)
846+
expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
847+
tm.assert_series_equal(r1, expected)
839848

840849
def test_mode(self):
841850
df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],

pandas/tests/groupby/test_aggregate.py

+35-8
Original file line numberDiff line numberDiff line change
@@ -813,8 +813,6 @@ def test__cython_agg_general(self):
813813
('mean', np.mean),
814814
('median', lambda x: np.median(x) if len(x) > 0 else np.nan),
815815
('var', lambda x: np.var(x, ddof=1)),
816-
('add', lambda x: np.sum(x) if len(x) > 0 else np.nan),
817-
('prod', np.prod),
818816
('min', np.min),
819817
('max', np.max), ]
820818
)
@@ -824,19 +822,48 @@ def test_cython_agg_empty_buckets(self, op, targop):
824822

825823
# calling _cython_agg_general directly, instead of via the user API
826824
# which sets different values for min_count, so do that here.
827-
if op in ('add', 'prod'):
828-
min_count = 1
829-
else:
830-
min_count = -1
831-
result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(
832-
op, min_count=min_count)
825+
result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op)
833826
expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x))
834827
try:
835828
tm.assert_frame_equal(result, expected)
836829
except BaseException as exc:
837830
exc.args += ('operation: %s' % op,)
838831
raise
839832

833+
def test_cython_agg_empty_buckets_nanops(self):
834+
# GH-18869 can't call nanops on empty groups, so hardcode expected
835+
# for these
836+
df = pd.DataFrame([11, 12, 13], columns=['a'])
837+
grps = range(0, 25, 5)
838+
# add / sum
839+
result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add')
840+
intervals = pd.interval_range(0, 20, freq=5)
841+
expected = pd.DataFrame(
842+
{"a": [0, 0, 36, 0]},
843+
index=pd.CategoricalIndex(intervals, name='a', ordered=True))
844+
tm.assert_frame_equal(result, expected)
845+
846+
# prod
847+
result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod')
848+
expected = pd.DataFrame(
849+
{"a": [1, 1, 1716, 1]},
850+
index=pd.CategoricalIndex(intervals, name='a', ordered=True))
851+
tm.assert_frame_equal(result, expected)
852+
853+
@pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.")
854+
def test_agg_category_nansum(self):
855+
categories = ['a', 'b', 'c']
856+
df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
857+
categories=categories),
858+
'B': [1, 2, 3]})
859+
result = df.groupby("A").B.agg(np.nansum)
860+
expected = pd.Series([3, 3, 0],
861+
index=pd.CategoricalIndex(['a', 'b', 'c'],
862+
categories=categories,
863+
name='A'),
864+
name='B')
865+
tm.assert_series_equal(result, expected)
866+
840867
def test_agg_over_numpy_arrays(self):
841868
# GH 3788
842869
df = pd.DataFrame([[1, np.array([10, 20, 30])],

0 commit comments

Comments
 (0)