From 1bf2dd5f5bc35678c6f156de849883a4e7f43647 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 18 Jan 2020 16:35:39 +0100 Subject: [PATCH] Backport PR #30971: BUG: reductions for nullable dtypes should return pd.NA for skipna=False --- doc/source/whatsnew/v1.0.0.rst | 19 +++++++++++++++++++ pandas/core/arrays/boolean.py | 8 +++++--- pandas/core/arrays/integer.py | 14 ++++++++------ pandas/tests/extension/test_boolean.py | 4 +++- pandas/tests/extension/test_integer.py | 10 +++++++++- 5 files changed, 44 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c423933d4c438..4d55ee1c1cfc2 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -483,6 +483,25 @@ Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead. a.to_numpy(dtype="float", na_value=np.nan) +**Reductions can return ``pd.NA``** + +When performing a reduction such as a sum with ``skipna=False``, the result +will now be ``pd.NA`` instead of ``np.nan`` in presence of missing values +(:issue:`30958`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> pd.Series(a).sum(skipna=False) + nan + +*pandas 1.0.0* + +.. ipython:: python + + pd.Series(a).sum(skipna=False) + **value_counts returns a nullable integer dtype** :meth:`Series.value_counts` with a nullable integer dtype now returns a nullable diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index fa1cbc87cc5c1..eaa17df1235d3 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -670,13 +670,15 @@ def _reduce(self, name, skipna=True, **kwargs): mask = self._mask # coerce to a nan-aware float if needed - if mask.any(): - data = self._data.astype("float64") - data[mask] = np.nan + if self._hasna: + data = self.to_numpy("float64", na_value=np.nan) op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + if np.isnan(result): + return libmissing.NA + # if we have numeric op that would result in an int, coerce to int if possible if name in ["sum", "prod"] and notna(result): int_result = np.int64(result) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index cb1e7115cd3c2..67036761bc62a 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -21,7 +21,7 @@ is_scalar, ) from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops from pandas.core.ops import invalid_comparison @@ -549,21 +549,23 @@ def _reduce(self, name, skipna=True, **kwargs): mask = self._mask # coerce to a nan-aware float if needed - if mask.any(): - data = self._data.astype("float64") - # We explicitly use NaN within reductions. - data[mask] = np.nan + # (we explicitly use NaN within reductions) + if self._hasna: + data = self.to_numpy("float64", na_value=np.nan) op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + if np.isnan(result): + return libmissing.NA + # if we have a boolean op, don't coerce if name in ["any", "all"]: pass # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ["sum", "min", "max", "prod"] and notna(result): + elif name in ["sum", "min", "max", "prod"]: int_result = int(result) if int_result == result: result = int_result diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index a7ce0fb097599..c489445d8512a 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -327,7 +327,9 @@ def check_reduce(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) expected = getattr(s.astype("float64"), op_name)(skipna=skipna) # override parent function to cast to bool for min/max - if op_name in ("min", "max") and not pd.isna(expected): + if np.isnan(expected): + expected = pd.NA + elif op_name in ("min", "max"): expected = bool(expected) tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index afb8412f12ea9..f55ec75b47dfa 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import is_extension_array_dtype import pandas as pd +import pandas._testing as tm from pandas.core.arrays import integer_array from pandas.core.arrays.integer import ( Int8Dtype, @@ -233,7 +234,14 @@ class TestGroupby(base.BaseGroupbyTests): class TestNumericReduce(base.BaseNumericReduceTests): - pass + def check_reduce(self, s, op_name, skipna): + # overwrite to ensure pd.NA is tested instead of np.nan + # https://github.com/pandas-dev/pandas/issues/30958 + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + if np.isnan(expected): + expected = pd.NA + tm.assert_almost_equal(result, expected) class TestBooleanReduce(base.BaseBooleanReduceTests):