Skip to content

Commit d12a7a0

Browse files
authored
COMPAT: sum/prod on all nan will remain nan regardless of bottleneck install (#17630)
xref #15507 closes #9422
1 parent 6ff8434 commit d12a7a0

File tree

11 files changed

+223
-200
lines changed

11 files changed

+223
-200
lines changed

doc/source/missing_data.rst

+36
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,42 @@ account for missing data. For example:
181181
df.mean(1)
182182
df.cumsum()
183183
184+
185+
.. _missing_data.numeric_sum:
186+
187+
Sum/Prod of Empties/NaNs
188+
~~~~~~~~~~~~~~~~~~~~~~~~
189+
190+
.. warning::
191+
192+
This behavior is now standard as of v0.21.0; previously sum/prod would give different
193+
results if the ``bottleneck`` package was installed. See :ref:`here <whatsnew_0210.api_breaking.bottleneck>`.
194+
195+
With ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, the result will be all-``NaN``.
196+
197+
.. ipython:: python
198+
199+
s = Series([np.nan])
200+
201+
s.sum()
202+
203+
Summing of an empty ``Series``
204+
205+
.. ipython:: python
206+
207+
pd.Series([]).sum()
208+
209+
.. warning::
210+
211+
These behaviors differ from the default in ``numpy`` where an empty sum returns zero.
212+
213+
.. ipython:: python
214+
215+
np.nansum(np.array([np.nan]))
216+
np.nansum(np.array([]))
217+
218+
219+
184220
NA values in GroupBy
185221
~~~~~~~~~~~~~~~~~~~~
186222

doc/source/whatsnew/v0.21.0.txt

+47
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Highlights include:
1212
- Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
1313
- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying
1414
categoricals independent of the data, see :ref:`here <whatsnew_0210.enhancements.categorical_dtype>`.
15+
- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck <http://berkeleyanalytics.com/bottleneck>`__ is installed, see :ref:`here <whatsnew_0210.api_breaking.bottleneck>`.
1516

1617
Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations <whatsnew_0210.deprecations>` before updating.
1718

@@ -412,6 +413,52 @@ Current Behavior
412413

413414
s.loc[pd.Index([True, False, True])]
414415

416+
.. _whatsnew_0210.api_breaking.bottleneck:
417+
418+
Sum/Prod of all-NaN Series/DataFrames is now consistently NaN
419+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
420+
421+
The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on
422+
whether `bottleneck <http://berkeleyanalytics.com/bottleneck>`__ is installed. (:issue:`9422`, :issue:`15507`).
423+
424+
With ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, the result will be all-``NaN``. See the :ref:`docs <missing_data.numeric_sum>`.
425+
426+
.. ipython:: python
427+
428+
s = Series([np.nan])
429+
430+
Previously NO ``bottleneck``
431+
432+
.. code-block:: ipython
433+
434+
In [2]: s.sum()
435+
Out[2]: nan
436+
437+
Previously WITH ``bottleneck``
438+
439+
.. code-block:: ipython
440+
441+
In [2]: s.sum()
442+
Out[2]: 0.0
443+
444+
New Behavior, without regard to the ``bottleneck`` installation.
445+
446+
.. ipython:: python
447+
448+
s.sum()
449+
450+
Note that this also changes the sum of an empty ``Series``
451+
452+
Previously regardless of ``bottleneck``
453+
454+
.. code-block:: ipython
455+
456+
In [1]: pd.Series([]).sum()
457+
Out[1]: 0
458+
459+
.. ipython:: python
460+
461+
pd.Series([]).sum()
415462

416463
.. _whatsnew_0210.api_breaking.pandas_eval:
417464

pandas/core/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6990,7 +6990,7 @@ def _doc_parms(cls):
69906990
----------
69916991
axis : %(axis_descr)s
69926992
skipna : boolean, default True
6993-
Exclude NA/null values. If an entire row/column is NA, the result
6993+
Exclude NA/null values. If an entire row/column is NA or empty, the result
69946994
will be NA
69956995
level : int or level name, default None
69966996
If the axis is a MultiIndex (hierarchical), count along a

pandas/core/nanops.py

+21-15
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
is_datetime_or_timedelta_dtype,
1919
is_int_or_datetime_dtype, is_any_int_dtype)
2020
from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
21-
from pandas.core.dtypes.missing import isna, notna
21+
from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype
2222
from pandas.core.config import get_option
2323
from pandas.core.common import _values_from_object
2424

@@ -89,8 +89,7 @@ def _f(*args, **kwargs):
8989

9090
class bottleneck_switch(object):
9191

92-
def __init__(self, zero_value=None, **kwargs):
93-
self.zero_value = zero_value
92+
def __init__(self, **kwargs):
9493
self.kwargs = kwargs
9594

9695
def __call__(self, alt):
@@ -108,18 +107,20 @@ def f(values, axis=None, skipna=True, **kwds):
108107
if k not in kwds:
109108
kwds[k] = v
110109
try:
111-
if self.zero_value is not None and values.size == 0:
112-
if values.ndim == 1:
110+
if values.size == 0:
111+
112+
# we either return np.nan or pd.NaT
113+
if is_numeric_dtype(values):
114+
values = values.astype('float64')
115+
fill_value = na_value_for_dtype(values.dtype)
113116

114-
# wrap the 0's if needed
115-
if is_timedelta64_dtype(values):
116-
return lib.Timedelta(0)
117-
return 0
117+
if values.ndim == 1:
118+
return fill_value
118119
else:
119120
result_shape = (values.shape[:axis] +
120121
values.shape[axis + 1:])
121-
result = np.empty(result_shape)
122-
result.fill(0)
122+
result = np.empty(result_shape, dtype=values.dtype)
123+
result.fill(fill_value)
123124
return result
124125

125126
if (_USE_BOTTLENECK and skipna and
@@ -154,11 +155,16 @@ def _bn_ok_dtype(dt, name):
154155
# Bottleneck chokes on datetime64
155156
if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)):
156157

158+
# GH 15507
157159
# bottleneck does not properly upcast during the sum
158160
# so can overflow
159-
if name == 'nansum':
160-
if dt.itemsize < 8:
161-
return False
161+
162+
# GH 9422
163+
# further we also want to preserve NaN when all elements
164+
# are NaN, unlike bottleneck/numpy which consider this
165+
# to be 0
166+
if name in ['nansum', 'nanprod']:
167+
return False
162168

163169
return True
164170
return False
@@ -297,7 +303,7 @@ def nanall(values, axis=None, skipna=True):
297303

298304

299305
@disallow('M8')
300-
@bottleneck_switch(zero_value=0)
306+
@bottleneck_switch()
301307
def nansum(values, axis=None, skipna=True):
302308
values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
303309
dtype_sum = dtype_max

pandas/tests/frame/test_analytics.py

+35-38
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,11 @@ def test_sum(self):
448448
has_numeric_only=True, check_dtype=False,
449449
check_less_precise=True)
450450

451-
def test_stat_operators_attempt_obj_array(self):
451+
@pytest.mark.parametrize(
452+
"method", ['sum', 'mean', 'prod', 'var',
453+
'std', 'skew', 'min', 'max'])
454+
def test_stat_operators_attempt_obj_array(self, method):
455+
# GH #676
452456
data = {
453457
'a': [-0.00049987540199591344, -0.0016467257772919831,
454458
0.00067695870775883013],
@@ -458,20 +462,17 @@ def test_stat_operators_attempt_obj_array(self):
458462
}
459463
df1 = DataFrame(data, index=['foo', 'bar', 'baz'],
460464
dtype='O')
461-
methods = ['sum', 'mean', 'prod', 'var', 'std', 'skew', 'min', 'max']
462465

463-
# GH #676
464466
df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3],
465467
2: [np.nan, 4]}, dtype=object)
466468

467469
for df in [df1, df2]:
468-
for meth in methods:
469-
assert df.values.dtype == np.object_
470-
result = getattr(df, meth)(1)
471-
expected = getattr(df.astype('f8'), meth)(1)
470+
assert df.values.dtype == np.object_
471+
result = getattr(df, method)(1)
472+
expected = getattr(df.astype('f8'), method)(1)
472473

473-
if not tm._incompat_bottleneck_version(meth):
474-
tm.assert_series_equal(result, expected)
474+
if method in ['sum', 'prod']:
475+
tm.assert_series_equal(result, expected)
475476

476477
def test_mean(self):
477478
self._check_stat_op('mean', np.mean, check_dates=True)
@@ -563,15 +564,15 @@ def test_var_std(self):
563564
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
564565
result = nanops.nanvar(arr, axis=0)
565566
assert not (result < 0).any()
566-
if nanops._USE_BOTTLENECK:
567-
nanops._USE_BOTTLENECK = False
567+
568+
with pd.option_context('use_bottleneck', False):
568569
result = nanops.nanvar(arr, axis=0)
569570
assert not (result < 0).any()
570-
nanops._USE_BOTTLENECK = True
571571

572-
def test_numeric_only_flag(self):
572+
@pytest.mark.parametrize(
573+
"meth", ['sem', 'var', 'std'])
574+
def test_numeric_only_flag(self, meth):
573575
# GH #9201
574-
methods = ['sem', 'var', 'std']
575576
df1 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
576577
# set one entry to a number in str format
577578
df1.loc[0, 'foo'] = '100'
@@ -580,20 +581,19 @@ def test_numeric_only_flag(self):
580581
# set one entry to a non-number str
581582
df2.loc[0, 'foo'] = 'a'
582583

583-
for meth in methods:
584-
result = getattr(df1, meth)(axis=1, numeric_only=True)
585-
expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
586-
tm.assert_series_equal(expected, result)
584+
result = getattr(df1, meth)(axis=1, numeric_only=True)
585+
expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
586+
tm.assert_series_equal(expected, result)
587587

588-
result = getattr(df2, meth)(axis=1, numeric_only=True)
589-
expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
590-
tm.assert_series_equal(expected, result)
588+
result = getattr(df2, meth)(axis=1, numeric_only=True)
589+
expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
590+
tm.assert_series_equal(expected, result)
591591

592-
# df1 has all numbers, df2 has a letter inside
593-
pytest.raises(TypeError, lambda: getattr(df1, meth)(
594-
axis=1, numeric_only=False))
595-
pytest.raises(TypeError, lambda: getattr(df2, meth)(
596-
axis=1, numeric_only=False))
592+
# df1 has all numbers, df2 has a letter inside
593+
pytest.raises(TypeError, lambda: getattr(df1, meth)(
594+
axis=1, numeric_only=False))
595+
pytest.raises(TypeError, lambda: getattr(df2, meth)(
596+
axis=1, numeric_only=False))
597597

598598
def test_mixed_ops(self):
599599
# GH 16116
@@ -606,11 +606,9 @@ def test_mixed_ops(self):
606606
result = getattr(df, op)()
607607
assert len(result) == 2
608608

609-
if nanops._USE_BOTTLENECK:
610-
nanops._USE_BOTTLENECK = False
609+
with pd.option_context('use_bottleneck', False):
611610
result = getattr(df, op)()
612611
assert len(result) == 2
613-
nanops._USE_BOTTLENECK = True
614612

615613
def test_cumsum(self):
616614
self.tsframe.loc[5:10, 0] = nan
@@ -676,11 +674,10 @@ def test_sem(self):
676674
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
677675
result = nanops.nansem(arr, axis=0)
678676
assert not (result < 0).any()
679-
if nanops._USE_BOTTLENECK:
680-
nanops._USE_BOTTLENECK = False
677+
678+
with pd.option_context('use_bottleneck', False):
681679
result = nanops.nansem(arr, axis=0)
682680
assert not (result < 0).any()
683-
nanops._USE_BOTTLENECK = True
684681

685682
def test_skew(self):
686683
tm._skip_if_no_scipy()
@@ -767,7 +764,7 @@ def wrapper(x):
767764
tm.assert_series_equal(result0, frame.apply(skipna_wrapper),
768765
check_dtype=check_dtype,
769766
check_less_precise=check_less_precise)
770-
if not tm._incompat_bottleneck_version(name):
767+
if name in ['sum', 'prod']:
771768
exp = frame.apply(skipna_wrapper, axis=1)
772769
tm.assert_series_equal(result1, exp, check_dtype=False,
773770
check_less_precise=check_less_precise)
@@ -799,7 +796,7 @@ def wrapper(x):
799796
all_na = self.frame * np.NaN
800797
r0 = getattr(all_na, name)(axis=0)
801798
r1 = getattr(all_na, name)(axis=1)
802-
if not tm._incompat_bottleneck_version(name):
799+
if name in ['sum', 'prod']:
803800
assert np.isnan(r0).all()
804801
assert np.isnan(r1).all()
805802

@@ -1859,14 +1856,14 @@ def test_dataframe_clip(self):
18591856
assert (clipped_df.values[ub_mask] == ub).all()
18601857
assert (clipped_df.values[mask] == df.values[mask]).all()
18611858

1862-
@pytest.mark.xfail(reason=("clip on mixed integer or floats "
1863-
"with integer clippers coerces to float"))
18641859
def test_clip_mixed_numeric(self):
1865-
1860+
# TODO(jreback)
1861+
# clip on mixed integer or floats
1862+
# with integer clippers coerces to float
18661863
df = DataFrame({'A': [1, 2, 3],
18671864
'B': [1., np.nan, 3.]})
18681865
result = df.clip(1, 2)
1869-
expected = DataFrame({'A': [1, 2, 2],
1866+
expected = DataFrame({'A': [1, 2, 2.],
18701867
'B': [1., np.nan, 2.]})
18711868
tm.assert_frame_equal(result, expected, check_like=True)
18721869

pandas/tests/groupby/test_aggregate.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -562,7 +562,7 @@ def _testit(name):
562562
exp.name = 'C'
563563

564564
result = op(grouped)['C']
565-
if not tm._incompat_bottleneck_version(name):
565+
if name in ['sum', 'prod']:
566566
assert_series_equal(result, exp)
567567

568568
_testit('count')

0 commit comments

Comments
 (0)