From 61f95ff344d9be751958bf0400d03e202a10e6dd Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 30 Nov 2024 22:17:01 -0800 Subject: [PATCH 1/4] BUG: Groupby sum for object type should be None instead of 0 for all nan values --- pandas/_libs/groupby.pyx | 8 ++++++-- pandas/tests/groupby/test_categorical.py | 1 + pandas/tests/groupby/test_timegrouper.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d7e485f74e58b..7cca4bb793296 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -717,8 +717,12 @@ def group_sum( raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) - # the below is equivalent to `np.zeros_like(out)` but faster - sumx = np.zeros((out).shape, dtype=(out).base.dtype) + if sum_t is object: + # For object dtype, fill value should not be 0 (#60229) + sumx = np.empty((out).shape, dtype=object) + else: + # the below is equivalent to `np.zeros_like(out)` but faster + sumx = np.zeros((out).shape, dtype=(out).base.dtype) compensation = np.zeros((out).shape, dtype=(out).base.dtype) N, K = (values).shape diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 6d84dae1d25d8..809c960be20b6 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -361,6 +361,7 @@ def test_observed(request, using_infer_string, observed): expected = cartesian_product_for_groupers( expected, [cat1, cat2], list("AB"), fill_value=0 ) + expected.loc[expected.C == 0, "C"] = None tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index a7712d9dc6586..dd1680738a7c6 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -113,7 +113,7 @@ def test_groupby_with_timegrouper(self): unit=df.index.unit, ) expected = DataFrame( - 
{"Buyer": 0, "Quantity": 0}, + {"Buyer": None, "Quantity": 0}, index=exp_dti, ) # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl" From 15a762847e4dcbfed6ef53086a36e8b48b1006d3 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Mon, 2 Dec 2024 21:50:10 -0800 Subject: [PATCH 2/4] BUG: Add not to whatsnew/v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e74bd2f745b94..0e8b9e2da058d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -735,6 +735,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`) - Bug in :meth:`DataFrameGroupBy.cumsum` and :meth:`DataFrameGroupBy.cumprod` where ``numeric_only`` parameter was passed indirectly through kwargs instead of passing directly. (:issue:`58811`) - Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`) +- Bug in :meth:`DataFrameGroupBy.sum` and :math:`SeriesGroupBy.sum` where in case of all-nan values for object dtype the result is incorrectly set to 0 instead of ``None``. (:issue:`58811`) - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) - Bug in :meth:`Series.resample` could raise when the the date range ended shortly before a non-existent time. 
(:issue:`58380`) From fba9f78df084bf857cb040478128fadac29e902c Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Sat, 15 Feb 2025 23:53:14 -0800 Subject: [PATCH 3/4] Groupby sum for all-nan object array should be nan instead of None --- pandas/_libs/groupby.pyx | 8 ++++++-- pandas/tests/groupby/test_categorical.py | 2 +- pandas/tests/groupby/test_reductions.py | 15 +++++++++++++++ pandas/tests/groupby/test_timegrouper.py | 2 +- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 03ac344b2fcca..e9935256a2430 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -727,7 +727,7 @@ def group_sum( nobs = np.zeros((out).shape, dtype=np.int64) if sum_t is object: # For object dtype, fill value should not be 0 (#60229) - sumx = np.empty((out).shape, dtype=object) + sumx = np.full((out).shape, NAN, dtype=object) else: # the below is equivalent to `np.zeros_like(out)` but faster sumx = np.zeros((out).shape, dtype=(out).base.dtype) @@ -764,7 +764,10 @@ def group_sum( if uses_mask: isna_result = result_mask[lab, j] else: - isna_result = _treat_as_na(sumx[lab, j], is_datetimelike) + isna_result = ( + _treat_as_na(sumx[lab, j], is_datetimelike) and + nobs[lab, j] > 0 + ) if isna_result: # If sum is already NA, don't add to it. 
This is important for @@ -799,6 +802,7 @@ def group_sum( compensation[lab, j] = 0 sumx[lab, j] = t elif not skipna: + nobs[lab, j] += 1 if uses_mask: result_mask[lab, j] = True else: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e26c1574d0b4a..db63dff553c7c 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -358,7 +358,7 @@ def test_observed(request, using_infer_string, observed): expected = cartesian_product_for_groupers( expected, [cat1, cat2], list("AB"), fill_value=0 ) - expected.loc[expected.C == 0, "C"] = None + expected.loc[expected.C == 0, "C"] = np.nan tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index ea876cfdf4933..33306cc756fb1 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -514,6 +514,21 @@ def test_sum_skipna_object(skipna): tm.assert_series_equal(result, expected) +def test_sum_allnan_object(skipna): + # GH#60229 + df = DataFrame( + { + "val": [np.nan] * 10, + "cat": ["A", "B"] * 5, + } + ).astype({"val": object}) + expected = Series( + [np.nan, np.nan], index=pd.Index(["A", "B"], name="cat"), name="val" + ).astype(object) + result = df.groupby("cat")["val"].sum(skipna=skipna) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "func, values, dtype, result_dtype", [ diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index e1d7db3c0c04a..63d0ca0de9bb1 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -113,7 +113,7 @@ def test_groupby_with_timegrouper(self): unit=df.index.unit, ) expected = DataFrame( - {"Buyer": None, "Quantity": 0}, + {"Buyer": np.nan, "Quantity": 0}, index=exp_dti, ) # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl" From 
45eb9af36383070f042cd8bfbefc8ffaae63ff73 Mon Sep 17 00:00:00 2001 From: Nitish Satyavolu Date: Tue, 18 Feb 2025 14:36:44 -0800 Subject: [PATCH 4/4] Fix behavior for DataFrame.sum and Series.sum --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/nanops.py | 7 +++++++ pandas/tests/frame/test_arithmetic.py | 2 +- pandas/tests/frame/test_reductions.py | 15 +++++++++++++++ pandas/tests/series/test_reductions.py | 14 ++++++++++++++ 5 files changed, 38 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a43ce386cb877..f97d0048287b5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -750,13 +750,13 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) +- Bug in :meth:`DataFrame.sum`, :meth:`Series.sum`, :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` where in case of all-nan values for object dtype the result is incorrectly set to 0 instead of ``nan``. (:issue:`60229`) - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`) - Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys.
(:issue:`60471`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`) - Bug in :meth:`DataFrameGroupBy.cumsum` and :meth:`DataFrameGroupBy.cumprod` where ``numeric_only`` parameter was passed indirectly through kwargs instead of passing directly. (:issue:`58811`) - Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`) -- Bug in :meth:`DataFrameGroupBy.sum` and :math:`SeriesGroupBy.sum` where in case of all-nan values for object dtype the result is incorrectly set to 0 instead of ``None``. (:issue:`58811`) - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) - Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. 
(:issue:`58868`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d1dc0ff809497..e671106a82db9 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -638,6 +638,13 @@ def nansum( the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) + if dtype.kind == "O" and skipna and min_count == 0: + # GH#60229 For object dtype, sum of all-NA array should be nan + if isinstance(the_sum, np.ndarray): + the_sum[mask.sum(axis=axis) == mask.shape[axis]] = np.nan + elif mask.all(): + the_sum = np.nan + return the_sum diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 8239de3f39c20..c792bb69f18d4 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1186,7 +1186,7 @@ def test_frame_single_columns_object_sum_axis_1(): } df = DataFrame(data) result = df.sum(axis=1) - expected = Series(["A", 1.2, 0]) + expected = Series(["A", 1.2, np.nan]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 64e686d25faa7..1eeea60d184fa 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -420,10 +420,25 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis): assert df.values.dtype == np.object_ result = getattr(df, method)(axis=axis) expected = getattr(df.astype("f8"), method)(axis=axis).astype(object) + if method == "sum": + # GH#60229 in case of all-NA object array, sum should be nan + expected[df.isna().all(axis=axis)] = np.nan if axis in [1, "columns"] and method in ["min", "max"]: expected[expected.isna()] = None tm.assert_series_equal(result, expected) + def test_object_sum_allna(self): + # GH#60229 + df = DataFrame({"a": [np.nan] * 5, "b": [pd.NA] * 5}, dtype=object) + + result = df.sum(axis=0, skipna=True) + expected = Series([np.nan, np.nan], index=["a", "b"], 
dtype=object) + tm.assert_series_equal(result, expected) + + result = df.sum(axis=0, skipna=False) + expected = Series([np.nan, pd.NA], index=["a", "b"], dtype=object) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) def test_mixed_ops(self, op): # GH#16116 diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py index 86ce60b1fc12b..b6f2d03405749 100644 --- a/pandas/tests/series/test_reductions.py +++ b/pandas/tests/series/test_reductions.py @@ -111,6 +111,20 @@ def test_prod_numpy16_bug(): assert not isinstance(result, Series) +@pytest.mark.parametrize("nan_val", [np.nan, pd.NA]) +def test_object_sum_allna(nan_val): + # GH#60229 + ser = Series([nan_val] * 5, dtype=object) + + result = ser.sum(axis=0, skipna=True) + expected = np.nan + tm.assert_equal(result, expected) + + result = ser.sum(axis=0, skipna=False) + expected = nan_val + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("func", [np.any, np.all]) @pytest.mark.parametrize("kwargs", [{"keepdims": True}, {"out": object()}]) def test_validate_any_all_out_keepdims_raises(kwargs, func):