From 968a95906dff21661a914b8ee67f2be417862794 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 9 Aug 2022 21:32:51 +0200 Subject: [PATCH 1/4] ENH: Support mask in groupby sum --- pandas/_libs/groupby.pyx | 31 +++++++++++++++++++++++++++---- pandas/core/groupby/ops.py | 3 +++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 06830a1d84c6e..9776a8e2c57af 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -523,6 +523,8 @@ def group_sum( int64_t[::1] counts, ndarray[sum_t, ndim=2] values, const intp_t[::1] labels, + const uint8_t[:, :] mask, + uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint is_datetimelike=False, ) -> None: @@ -535,6 +537,9 @@ def group_sum( sum_t[:, ::1] sumx, compensation int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) + bint uses_mask = mask is not None + bint isna_entry + bint isna_entry_nogil if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -558,7 +563,12 @@ def group_sum( val = values[i, j] # not nan - if not checknull(val): + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = checknull(val) + + if not isna_entry: nobs[lab, j] += 1 if nobs[lab, j] == 1: @@ -572,7 +582,11 @@ def group_sum( for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - out[i, j] = NAN + if uses_mask: + result_mask[i, j] = True + else: + out[i, j] = NAN + else: out[i, j] = sumx[i, j] else: @@ -590,7 +604,12 @@ def group_sum( # With dt64/td64 values, values have been cast to float64 # instead if int64 for group_sum, but the logic # is otherwise the same as in _treat_as_na - if val == val and not ( + if uses_mask: + isna_entry_nogil = mask[i, j] + else: + isna_entry_nogil = False + + if not isna_entry_nogil and not ( sum_t is float64_t and is_datetimelike and val == NPY_NAT @@ -604,7 +623,11 @@ def group_sum( for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - out[i, j] = NAN + if uses_mask: + result_mask[i, j] = True + else: + out[i, j] = NAN + else: out[i, j] = sumx[i, j] diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5feed98cbc75b..b8655aef86bcd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -155,6 +155,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "last", "first", "rank", + "sum", } _cython_arity = {"ohlc": 4} # OHLC @@ -578,6 +579,8 @@ def _call_cython_op( counts=counts, values=values, labels=comp_ids, + mask=mask, + result_mask=result_mask, min_count=min_count, is_datetimelike=is_datetimelike, ) From 9184025c0d0cca24ff2e8ee04687bb8460dda2b7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 9 Aug 2022 23:05:33 +0200 Subject: [PATCH 2/4] ENH: Support mask in groupby sum --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/groupby.pyi | 6 ++-- pandas/_libs/groupby.pyx | 50 ++++++++++++++++++---------- pandas/core/groupby/ops.py | 5 +-- pandas/tests/groupby/test_groupby.py | 21 ++++++++++++ 5 files changed, 62 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 502e37705abfb..e2cd7cb472ea5 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1047,6 +1047,7 @@ Groupby/resample/rolling - Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`) - Bug in :meth:`.DataFrameGroupBy.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`) - Bug in :meth:`DataFrameGroupBy.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`) +- Bug in :meth:`GroupBy.sum` with integer dtypes losing precision (:issue:`37493`) - Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`) - Bug in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`) - Bug in :meth:`DataFrame.groupby` raising error when ``None`` is in first level of :class:`MultiIndex` (:issue:`47348`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index dfae1bff91ac8..d5ceb651e9893 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -51,10 +51,12 @@ def group_any_all( skipna: bool, ) -> None: ... def group_sum( - out: np.ndarray, # complexfloating_t[:, ::1] + out: np.ndarray, # complexfloatingintuint_t[:, ::1] counts: np.ndarray, # int64_t[::1] - values: np.ndarray, # ndarray[complexfloating_t, ndim=2] + values: np.ndarray, # ndarray[complexfloatingintuint_t, ndim=2] labels: np.ndarray, # const intp_t[:] + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., ) -> None: ... diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9776a8e2c57af..e1080e4161fb5 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -513,6 +513,15 @@ ctypedef fused mean_t: ctypedef fused sum_t: mean_t + int8_t + int16_t + int32_t + int64_t + + uint8_t + uint16_t + uint32_t + uint64_t object @@ -539,7 +548,6 @@ def group_sum( Py_ssize_t len_values = len(values), len_labels = len(labels) bint uses_mask = mask is not None bint isna_entry - bint isna_entry_nogil if len_values != len_labels: raise ValueError("len(index) != len(labels)") @@ -563,12 +571,7 @@ def group_sum( val = values[i, j] # not nan - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = checknull(val) - - if not isna_entry: + if not checknull(val): nobs[lab, j] += 1 if nobs[lab, j] == 1: @@ -582,10 +585,7 @@ def group_sum( for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: - if uses_mask: - result_mask[i, j] = True - else: - out[i, j] = NAN + out[i, j] = None else: out[i, j] = sumx[i, j] @@ -605,14 +605,18 @@ def group_sum( # instead if int64 for group_sum, but the logic # is otherwise the same as in _treat_as_na if uses_mask: - isna_entry_nogil = mask[i, j] + isna_entry = mask[i, j] + elif (sum_t is float32_t or sum_t is float64_t + or sum_t is complex64_t or sum_t is complex64_t): + # avoid warnings because of equality comparison + isna_entry = not val == val else: - isna_entry_nogil = False + isna_entry = False - if not isna_entry_nogil and not ( - sum_t is float64_t + if not isna_entry and not ( + sum_t is int64_t and is_datetimelike - and val == NPY_NAT + and val == NPY_NAT ): nobs[lab, j] += 1 y = val - compensation[lab, j] @@ -623,10 +627,22 @@ def group_sum( for i in range(ncounts): for j in range(K): if nobs[i, j] < min_count: + # if we are integer dtype, not is_datetimelike, and + # not uses_mask, then getting here implies that + # counts[i] < min_count, which means we will + # be cast to float64 and masked at the end + # of WrappedCythonOp._call_cython_op. So we can safely + # set a placeholder value in out[i, j]. if uses_mask: result_mask[i, j] = True - else: + elif (sum_t is float32_t or sum_t is float64_t + or sum_t is complex64_t or sum_t is complex64_t): out[i, j] = NAN + elif sum_t is int64_t: + out[i, j] = NPY_NAT + else: + # placeholder, see above + out[i, j] = 0 else: out[i, j] = sumx[i, j] diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b8655aef86bcd..7617ca5074c9c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -218,7 +218,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: values = ensure_float64(values) elif values.dtype.kind in ["i", "u"]: - if how in ["sum", "var", "prod", "mean", "ohlc"] or ( + if how in ["var", "prod", "mean", "ohlc"] or ( self.kind == "transform" and self.has_dropped_na ): # result may still include NaN, so we have to cast @@ -616,7 +616,8 @@ def _call_cython_op( # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: - cutoff = max(1, min_count) + # Neutral value for sum is 0, so don't fill empty groups with nan + cutoff = max(0 if self.how == "sum" else 1, min_count) empty_groups = counts < cutoff if empty_groups.any(): if result_mask is not None and self.uses_mask(): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a6ab13270c4dc..a7c5b85e365ae 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2808,3 +2808,24 @@ def test_single_element_list_grouping(): ) with tm.assert_produces_warning(FutureWarning, match=msg): values, _ = next(iter(df.groupby(["a"]))) + + +def test_groupby_sum_avoid_casting_to_float(): + # GH#37493 + val = 922337203685477580 + df = DataFrame({"a": 1, "b": [val]}) + result = df.groupby("a").sum() - val + expected = DataFrame({"b": [0]}, index=Index([1], name="a")) + tm.assert_frame_equal(result, expected) + + +def test_groupby_sum_support_mask(any_numeric_ea_dtype): + # GH#37493 + df = DataFrame({"a": 1, "b": [1, 2, pd.NA]}, dtype=any_numeric_ea_dtype) + result = df.groupby("a").sum() + expected = DataFrame( + {"b": [3]}, + index=Index([1], name="a", dtype=any_numeric_ea_dtype), + dtype=any_numeric_ea_dtype, + ) + tm.assert_frame_equal(result, expected) From 5df22bcb3000d866633fcd82e96a6baefdd90bf9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 9 Aug 2022 23:41:49 +0200 Subject: [PATCH 3/4] Fix mypy --- pandas/_libs/groupby.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index d5ceb651e9893..3ec37718eb652 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -55,7 +55,7 @@ def group_sum( counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[complexfloatingintuint_t, ndim=2] labels: np.ndarray, # const intp_t[:] - mask: np.ndarray | None = ..., + mask: np.ndarray | None, result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., From 95fde3317cf6c490e5b080f67e481dd305a7f005 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 10 Aug 2022 08:28:40 +0200 Subject: [PATCH 4/4] Refactor if condition --- pandas/_libs/groupby.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e1080e4161fb5..e4314edecfa7e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -610,14 +610,12 @@ def group_sum( or sum_t is complex64_t or sum_t is complex64_t): # avoid warnings because of equality comparison isna_entry = not val == val + elif sum_t is int64_t and is_datetimelike and val == NPY_NAT: + isna_entry = True else: isna_entry = False - if not isna_entry and not ( - sum_t is int64_t - and is_datetimelike - and val == NPY_NAT - ): + if not isna_entry: nobs[lab, j] += 1 y = val - compensation[lab, j] t = sumx[lab, j] + y