Skip to content

Commit d5c4b97

Browse files
committed
COMPAT: sum/prod on all nan will remain nan regardless of bottleneck install
xref #15507 closes #9422
1 parent d1fe892 commit d5c4b97

File tree

9 files changed

+172
-199
lines changed

9 files changed

+172
-199
lines changed

doc/source/whatsnew/v0.21.0.txt

+36
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ users upgrade to this version.
1010
Highlights include:
1111

1212
- Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
13+
- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent regardless of whether `bottleneck <http://berkeleyanalytics.com/bottleneck>`__ is installed, see :ref:`here <whatsnew_0210.api_breaking.bottleneck>`.
1314

1415
Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations <whatsnew_0210.deprecations>` before updating.
1516

@@ -145,6 +146,41 @@ We have updated our minimum supported versions of dependencies (:issue:`15206`,
145146
| Bottleneck | 1.0.0 | |
146147
+--------------+-----------------+----------+
147148

149+
.. _whatsnew_0210.api_breaking.bottleneck:
150+
151+
Sum/Prod of all-NaN Series/DataFrames is now consistently NaN
152+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
153+
154+
The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent regardless of
155+
whether `bottleneck <http://berkeleyanalytics.com/bottleneck>`__ is installed (:issue:`9422`, :issue:`15507`).
156+
157+
This will now *always* preserve information. You will get back a ``NaN``, indicating missing values in that Series,
158+
or if summing a ``DataFrame``, a ``Series`` of all-``NaN``.
159+
160+
.. ipython:: python
161+
162+
s = Series([np.nan])
163+
164+
Previously WITHOUT ``bottleneck``
165+
166+
.. code-block:: ipython
167+
168+
In [2]: s.sum()
169+
Out[2]: nan
170+
171+
Previously WITH ``bottleneck``
172+
173+
.. code-block:: ipython
174+
175+
In [2]: s.sum()
176+
Out[2]: 0.0
177+
178+
New behavior, regardless of whether ``bottleneck`` is installed:
179+
180+
.. ipython:: python
181+
182+
s.sum()
183+
148184
.. _whatsnew_0210.api_breaking.pandas_eval:
149185

150186
Improved error handling during item assignment in pd.eval

pandas/core/nanops.py

+18-15
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
is_datetime_or_timedelta_dtype,
1919
is_int_or_datetime_dtype, is_any_int_dtype)
2020
from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
21-
from pandas.core.dtypes.missing import isna, notna
21+
from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype
2222
from pandas.core.config import get_option
2323
from pandas.core.common import _values_from_object
2424

@@ -89,8 +89,7 @@ def _f(*args, **kwargs):
8989

9090
class bottleneck_switch(object):
9191

92-
def __init__(self, zero_value=None, **kwargs):
93-
self.zero_value = zero_value
92+
def __init__(self, **kwargs):
9493
self.kwargs = kwargs
9594

9695
def __call__(self, alt):
@@ -108,18 +107,17 @@ def f(values, axis=None, skipna=True, **kwds):
108107
if k not in kwds:
109108
kwds[k] = v
110109
try:
111-
if self.zero_value is not None and values.size == 0:
112-
if values.ndim == 1:
110+
if values.size == 0:
111+
112+
fill_value = na_value_for_dtype(values.dtype)
113113

114-
# wrap the 0's if needed
115-
if is_timedelta64_dtype(values):
116-
return lib.Timedelta(0)
117-
return 0
114+
if values.ndim == 1:
115+
return fill_value
118116
else:
119117
result_shape = (values.shape[:axis] +
120118
values.shape[axis + 1:])
121-
result = np.empty(result_shape)
122-
result.fill(0)
119+
result = np.empty(result_shape, dtype=values.dtype)
120+
result.fill(fill_value)
123121
return result
124122

125123
if (_USE_BOTTLENECK and skipna and
@@ -154,11 +152,16 @@ def _bn_ok_dtype(dt, name):
154152
# Bottleneck chokes on datetime64
155153
if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)):
156154

155+
# GH 15507
157156
# bottleneck does not properly upcast during the sum
158157
# so can overflow
159-
if name == 'nansum':
160-
if dt.itemsize < 8:
161-
return False
158+
159+
# GH 9422
160+
# further we also want to preserve NaN when all elements
161+
# are NaN, unlike bottleneck/numpy which consider this
162+
# to be 0
163+
if name in ['nansum', 'nanprod']:
164+
return False
162165

163166
return True
164167
return False
@@ -297,7 +300,7 @@ def nanall(values, axis=None, skipna=True):
297300

298301

299302
@disallow('M8')
300-
@bottleneck_switch(zero_value=0)
303+
@bottleneck_switch()
301304
def nansum(values, axis=None, skipna=True):
302305
values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
303306
dtype_sum = dtype_max

pandas/tests/frame/test_analytics.py

+35-38
Original file line numberDiff line numberDiff line change
@@ -444,7 +444,11 @@ def test_sum(self):
444444
has_numeric_only=True, check_dtype=False,
445445
check_less_precise=True)
446446

447-
def test_stat_operators_attempt_obj_array(self):
447+
@pytest.mark.parametrize(
448+
"method", ['sum', 'mean', 'prod', 'var',
449+
'std', 'skew', 'min', 'max'])
450+
def test_stat_operators_attempt_obj_array(self, method):
451+
# GH #676
448452
data = {
449453
'a': [-0.00049987540199591344, -0.0016467257772919831,
450454
0.00067695870775883013],
@@ -454,20 +458,17 @@ def test_stat_operators_attempt_obj_array(self):
454458
}
455459
df1 = DataFrame(data, index=['foo', 'bar', 'baz'],
456460
dtype='O')
457-
methods = ['sum', 'mean', 'prod', 'var', 'std', 'skew', 'min', 'max']
458461

459-
# GH #676
460462
df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3],
461463
2: [np.nan, 4]}, dtype=object)
462464

463465
for df in [df1, df2]:
464-
for meth in methods:
465-
assert df.values.dtype == np.object_
466-
result = getattr(df, meth)(1)
467-
expected = getattr(df.astype('f8'), meth)(1)
466+
assert df.values.dtype == np.object_
467+
result = getattr(df, method)(1)
468+
expected = getattr(df.astype('f8'), method)(1)
468469

469-
if not tm._incompat_bottleneck_version(meth):
470-
tm.assert_series_equal(result, expected)
470+
if method in ['sum', 'prod']:
471+
tm.assert_series_equal(result, expected)
471472

472473
def test_mean(self):
473474
self._check_stat_op('mean', np.mean, check_dates=True)
@@ -559,15 +560,15 @@ def test_var_std(self):
559560
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
560561
result = nanops.nanvar(arr, axis=0)
561562
assert not (result < 0).any()
562-
if nanops._USE_BOTTLENECK:
563-
nanops._USE_BOTTLENECK = False
563+
564+
with pd.option_context('use_bottleneck', False):
564565
result = nanops.nanvar(arr, axis=0)
565566
assert not (result < 0).any()
566-
nanops._USE_BOTTLENECK = True
567567

568-
def test_numeric_only_flag(self):
568+
@pytest.mark.parametrize(
569+
"meth", ['sem', 'var', 'std'])
570+
def test_numeric_only_flag(self, meth):
569571
# GH #9201
570-
methods = ['sem', 'var', 'std']
571572
df1 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
572573
# set one entry to a number in str format
573574
df1.loc[0, 'foo'] = '100'
@@ -576,20 +577,19 @@ def test_numeric_only_flag(self):
576577
# set one entry to a non-number str
577578
df2.loc[0, 'foo'] = 'a'
578579

579-
for meth in methods:
580-
result = getattr(df1, meth)(axis=1, numeric_only=True)
581-
expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
582-
tm.assert_series_equal(expected, result)
580+
result = getattr(df1, meth)(axis=1, numeric_only=True)
581+
expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
582+
tm.assert_series_equal(expected, result)
583583

584-
result = getattr(df2, meth)(axis=1, numeric_only=True)
585-
expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
586-
tm.assert_series_equal(expected, result)
584+
result = getattr(df2, meth)(axis=1, numeric_only=True)
585+
expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
586+
tm.assert_series_equal(expected, result)
587587

588-
# df1 has all numbers, df2 has a letter inside
589-
pytest.raises(TypeError, lambda: getattr(df1, meth)(
590-
axis=1, numeric_only=False))
591-
pytest.raises(TypeError, lambda: getattr(df2, meth)(
592-
axis=1, numeric_only=False))
588+
# df1 has all numbers, df2 has a letter inside
589+
pytest.raises(TypeError, lambda: getattr(df1, meth)(
590+
axis=1, numeric_only=False))
591+
pytest.raises(TypeError, lambda: getattr(df2, meth)(
592+
axis=1, numeric_only=False))
593593

594594
def test_mixed_ops(self):
595595
# GH 16116
@@ -602,11 +602,9 @@ def test_mixed_ops(self):
602602
result = getattr(df, op)()
603603
assert len(result) == 2
604604

605-
if nanops._USE_BOTTLENECK:
606-
nanops._USE_BOTTLENECK = False
605+
with pd.option_context('use_bottleneck', False):
607606
result = getattr(df, op)()
608607
assert len(result) == 2
609-
nanops._USE_BOTTLENECK = True
610608

611609
def test_cumsum(self):
612610
self.tsframe.loc[5:10, 0] = nan
@@ -672,11 +670,10 @@ def test_sem(self):
672670
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
673671
result = nanops.nansem(arr, axis=0)
674672
assert not (result < 0).any()
675-
if nanops._USE_BOTTLENECK:
676-
nanops._USE_BOTTLENECK = False
673+
674+
with pd.option_context('use_bottleneck', False):
677675
result = nanops.nansem(arr, axis=0)
678676
assert not (result < 0).any()
679-
nanops._USE_BOTTLENECK = True
680677

681678
def test_skew(self):
682679
tm._skip_if_no_scipy()
@@ -763,7 +760,7 @@ def wrapper(x):
763760
tm.assert_series_equal(result0, frame.apply(skipna_wrapper),
764761
check_dtype=check_dtype,
765762
check_less_precise=check_less_precise)
766-
if not tm._incompat_bottleneck_version(name):
763+
if name in ['sum', 'prod']:
767764
exp = frame.apply(skipna_wrapper, axis=1)
768765
tm.assert_series_equal(result1, exp, check_dtype=False,
769766
check_less_precise=check_less_precise)
@@ -795,7 +792,7 @@ def wrapper(x):
795792
all_na = self.frame * np.NaN
796793
r0 = getattr(all_na, name)(axis=0)
797794
r1 = getattr(all_na, name)(axis=1)
798-
if not tm._incompat_bottleneck_version(name):
795+
if name in ['sum', 'prod']:
799796
assert np.isnan(r0).all()
800797
assert np.isnan(r1).all()
801798

@@ -1855,14 +1852,14 @@ def test_dataframe_clip(self):
18551852
assert (clipped_df.values[ub_mask] == ub).all()
18561853
assert (clipped_df.values[mask] == df.values[mask]).all()
18571854

1858-
@pytest.mark.xfail(reason=("clip on mixed integer or floats "
1859-
"with integer clippers coerces to float"))
18601855
def test_clip_mixed_numeric(self):
1861-
1856+
# TODO(jreback)
1857+
# clip on mixed integer or floats
1858+
# with integer clippers coerces to float
18621859
df = DataFrame({'A': [1, 2, 3],
18631860
'B': [1., np.nan, 3.]})
18641861
result = df.clip(1, 2)
1865-
expected = DataFrame({'A': [1, 2, 2],
1862+
expected = DataFrame({'A': [1, 2, 2.],
18661863
'B': [1., np.nan, 2.]})
18671864
tm.assert_frame_equal(result, expected, check_like=True)
18681865

pandas/tests/groupby/test_aggregate.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -562,7 +562,7 @@ def _testit(name):
562562
exp.name = 'C'
563563

564564
result = op(grouped)['C']
565-
if not tm._incompat_bottleneck_version(name):
565+
if name in ['sum', 'prod']:
566566
assert_series_equal(result, exp)
567567

568568
_testit('count')

0 commit comments

Comments
 (0)