diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4d9a45abe17cd..f97d0048287b5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -750,6 +750,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) +- Bug in :meth:`DataFrame.sum`, :meth:`Series.sum`, :meth:`.DataFrameGroupBy.sum` and :meth:`.SeriesGroupBy.sum` where in case of all-nan values for object dtype the result is incorrectly set to 0 instead of ``nan``. (:issue:`60229`) - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`) - Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. 
(:issue:`57775`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index f65fa2368967a..e9935256a2430 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -725,8 +725,12 @@ def group_sum( raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) - # the below is equivalent to `np.zeros_like(out)` but faster - sumx = np.zeros((out).shape, dtype=(out).base.dtype) + if sum_t is object: + # For object dtype, fill value should not be 0 (#60229) + sumx = np.full((out).shape, NAN, dtype=object) + else: + # the below is equivalent to `np.zeros_like(out)` but faster + sumx = np.zeros((out).shape, dtype=(out).base.dtype) compensation = np.zeros((out).shape, dtype=(out).base.dtype) N, K = (values).shape @@ -760,7 +764,10 @@ def group_sum( if uses_mask: isna_result = result_mask[lab, j] else: - isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + isna_result = ( + _treat_as_na(sumx[lab, j], is_datetimelike) and + nobs[lab, j] > 0 + ) if isna_result: # If sum is already NA, don't add to it. 
This is important for @@ -795,6 +802,7 @@ def group_sum( compensation[lab, j] = 0 sumx[lab, j] = t elif not skipna: + nobs[lab, j] += 1 if uses_mask: result_mask[lab, j] = True else: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d1dc0ff809497..e671106a82db9 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -638,6 +638,13 @@ def nansum( the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) + if dtype.kind == "O" and skipna and min_count == 0: + # GH#60229 For object dtype, sum of all-NA array should be nan + if isinstance(the_sum, np.ndarray): + the_sum[mask.sum(axis=axis) == mask.shape[axis]] = np.nan + elif mask.all(): + the_sum = np.nan + return the_sum diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 8239de3f39c20..c792bb69f18d4 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1186,7 +1186,7 @@ def test_frame_single_columns_object_sum_axis_1(): } df = DataFrame(data) result = df.sum(axis=1) - expected = Series(["A", 1.2, 0]) + expected = Series(["A", 1.2, np.nan]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 64e686d25faa7..1eeea60d184fa 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -420,10 +420,25 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis): assert df.values.dtype == np.object_ result = getattr(df, method)(axis=axis) expected = getattr(df.astype("f8"), method)(axis=axis).astype(object) + if method == "sum": + # GH#60229 in case of all-NA object array, sum should be nan + expected[df.isna().all(axis=axis)] = np.nan if axis in [1, "columns"] and method in ["min", "max"]: expected[expected.isna()] = None tm.assert_series_equal(result, expected) + def test_object_sum_allna(self): + # GH#60229 + df 
= DataFrame({"a": [np.nan] * 5, "b": [pd.NA] * 5}, dtype=object) + + result = df.sum(axis=0, skipna=True) + expected = Series([np.nan, np.nan], index=["a", "b"], dtype=object) + tm.assert_series_equal(result, expected) + + result = df.sum(axis=0, skipna=False) + expected = Series([np.nan, pd.NA], index=["a", "b"], dtype=object) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e49be8c00b426..db63dff553c7c 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -358,6 +358,7 @@ def test_observed(request, using_infer_string, observed): expected = cartesian_product_for_groupers( expected, [cat1, cat2], list("AB"), fill_value=0 ) + expected.loc[expected.C == 0, "C"] = np.nan tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index ea876cfdf4933..33306cc756fb1 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -514,6 +514,21 @@ def test_sum_skipna_object(skipna): tm.assert_series_equal(result, expected) +def test_sum_allnan_object(skipna): + # GH#60229 + df = DataFrame( + { + "val": [np.nan] * 10, + "cat": ["A", "B"] * 5, + } + ).astype({"val": object}) + expected = Series( + [np.nan, np.nan], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "func, values, dtype, result_dtype", [ diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 550efe9187fe8..63d0ca0de9bb1 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -113,7 +113,7 @@ def 
test_groupby_with_timegrouper(self): unit=df.index.unit, ) expected = DataFrame( - {"Buyer": 0, "Quantity": 0}, + {"Buyer": np.nan, "Quantity": 0}, index=exp_dti, ) # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl" diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 86ce60b1fc12b..b6f2d03405749 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -111,6 +111,20 @@ def test_prod_numpy16_bug(): assert not isinstance(result, Series) +@pytest.mark.parametrize("nan_val", [np.nan, pd.NA]) +def test_object_sum_allna(nan_val): + # GH#60229 + ser = Series([nan_val] * 5, dtype=object) + + result = ser.sum(axis=0, skipna=True) + expected = np.nan + tm.assert_equal(result, expected) + + result = ser.sum(axis=0, skipna=False) + expected = nan_val + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("func", [np.any, np.all]) @pytest.mark.parametrize("kwargs", [{"keepdims": True}, {"out": object()}]) def test_validate_any_all_out_keepdims_raises(kwargs, func):