From f6610f44b2979a590a9b1f72e8dd5dc4ac0fddeb Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 17 May 2020 09:56:22 +0100 Subject: [PATCH 1/3] BUG: Summing a sparse boolean series raises TypeError --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/arrays/sparse/array.py | 8 +++++++- pandas/core/dtypes/missing.py | 4 +++- pandas/tests/arrays/sparse/test_array.py | 19 +++++++++++++++++++ 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 73892da2cbf71..227f2c16b37e2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -852,7 +852,7 @@ Sparse ^^^^^^ - Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`) - Bug in :meth:`arrays.SparseArray.from_spmatrix` wrongly read scipy sparse matrix (:issue:`31991`) -- +- Bug in :meth:`Series.sum` with ``SparseArray`` raises ``TypeError`` (:issue:`25777`) ExtensionArray ^^^^^^^^^^^^^^ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 2720c831bcff6..9e4e43d152079 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1220,7 +1220,7 @@ def any(self, axis=0, *args, **kwargs): return values.any().item() - def sum(self, axis=0, *args, **kwargs): + def sum(self, axis=0, min_count=0, *args, **kwargs): """ Sum of non-NA/null values @@ -1230,11 +1230,17 @@ def sum(self, axis=0, *args, **kwargs): """ nv.validate_sum(args, kwargs) valid_vals = self._valid_sp_values + count = len(valid_vals) sp_sum = valid_vals.sum() if self._null_fill_value: + if count < min_count: + return na_value_for_dtype(self.dtype.subtype, compat=False) return sp_sum else: nsparse = self.sp_index.ngaps + count += nsparse + if count < min_count: + return na_value_for_dtype(self.dtype.subtype, compat=False) return sp_sum + self.fill_value * nsparse def cumsum(self, axis=0, *args, **kwargs): diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 443206754ba69..75188ad5b00eb 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -520,7 +520,9 @@ def na_value_for_dtype(dtype, compat: bool = True): return 0 return np.nan elif is_bool_dtype(dtype): - return False + if compat: + return False + return np.nan return np.nan diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index f1e5050fa8a2e..8450253f853c3 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -983,6 +983,25 @@ def test_sum(self): out = SparseArray(data, fill_value=np.nan).sum() assert out == 40.0 + @pytest.mark.parametrize( + "arr", + [ + np.array([0, 1, np.nan, 1]), + np.array([0, 1, 1]), + np.array([True, True, False]), + ], + ) + @pytest.mark.parametrize("fill_value", [0, 1, np.nan, True, False]) + @pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)]) + def test_sum_min_count(self, arr, fill_value, min_count, expected): + # https://github.com/pandas-dev/pandas/issues/25777 + sparray = SparseArray(arr, fill_value=fill_value) + result = sparray.sum(min_count=min_count) + if np.isnan(expected): + assert np.isnan(result) + else: + assert result == expected + def test_numpy_sum(self): data = np.arange(10).astype(float) out = np.sum(SparseArray(data)) From ab3d7c94a05c94f13bfba3044f21fffb55aa7d20 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 18 May 2020 12:35:20 +0100 Subject: [PATCH 2/3] docstring and types --- pandas/core/arrays/sparse/array.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 9e4e43d152079..363e4385f7821 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -13,6 +13,7 @@ import pandas._libs.sparse as splib from pandas._libs.sparse import BlockIndex, IntIndex, SparseIndex from pandas._libs.tslibs import NaT +from pandas._typing import Scalar import pandas.compat as compat from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning @@ -1220,13 +1221,24 @@ def any(self, axis=0, *args, **kwargs): return values.any().item() - def sum(self, axis=0, min_count=0, *args, **kwargs): + def sum(self, axis: int = 0, min_count: int = 0, *args, **kwargs) -> Scalar: """ Sum of non-NA/null values + Parameters + ---------- + axis : int, default 0 + Not Used. NumPy compatibility. + min_count : int, default 0 + The required number of valid values to perform the summation. If fewer + than ``min_count`` valid values are present, the result will be the missing + value indicator for subarray type. + *args, **kwargs + Not Used. NumPy compatibility. + Returns ------- - sum : float + scalar """ nv.validate_sum(args, kwargs) valid_vals = self._valid_sp_values From 2dd71c8fc6c442beb464afcab95c6b762f2d8793 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 18 May 2020 13:00:55 +0100 Subject: [PATCH 3/3] use check_below_min_count --- pandas/core/arrays/sparse/array.py | 7 +++---- pandas/core/nanops.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 363e4385f7821..3cfd92d778823 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -47,6 +47,7 @@ from pandas.core.construction import extract_array, sanitize_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import interpolate_2d +from pandas.core.nanops import check_below_min_count import pandas.core.ops as ops from pandas.core.ops.common import unpack_zerodim_and_defer @@ -1242,16 +1243,14 @@ def sum(self, axis: int = 0, min_count: int = 0, *args, **kwargs) -> Scalar: """ nv.validate_sum(args, kwargs) valid_vals = self._valid_sp_values - count = len(valid_vals) sp_sum = valid_vals.sum() if self._null_fill_value: - if count < min_count: + if check_below_min_count(valid_vals.shape, None, min_count): return na_value_for_dtype(self.dtype.subtype, compat=False) return sp_sum else: nsparse = self.sp_index.ngaps - count += nsparse - if count < min_count: + if check_below_min_count(valid_vals.shape, None, min_count - nsparse): return na_value_for_dtype(self.dtype.subtype, compat=False) return sp_sum + self.fill_value * nsparse diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 0a9d6f2172ff8..6b8518d8a47a0 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1283,7 +1283,7 @@ def _maybe_null_out( def check_below_min_count( shape: Tuple[int, ...], mask: Optional[np.ndarray], min_count: int -): +) -> bool: """ Check for the `min_count` keyword. Returns True if below `min_count` (when missing value should be returned from the reduction).