
Commit 6f4c382

BUG: Groupby min/max with nullable dtypes (#42567)

1 parent b1f7e58 · commit 6f4c382

6 files changed: +105 -8 lines changed

doc/source/whatsnew/v1.4.0.rst (+1)

@@ -394,6 +394,7 @@ Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 - Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`)
 - Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`)
+- Bug in :meth:`GroupBy.max` and :meth:`GroupBy.min` with nullable integer dtypes losing precision (:issue:`41743`)
 - Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`)
 - Bug in :meth:`GroupBy.shift` that would return the grouping columns if ``fill_value`` was not None (:issue:`41556`)
 - Bug in :meth:`SeriesGroupBy.nlargest` and :meth:`SeriesGroupBy.nsmallest` would have an inconsistent index when the input Series was sorted and ``n`` was greater than or equal to all group sizes (:issue:`15272`, :issue:`16345`, :issue:`29129`)
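The whatsnew entry refers to GH#41743: before this commit, nullable integer columns were routed through a float64 conversion during groupby min/max (so that missing entries could be held as NaN), and float64 cannot represent large int64 values exactly. A minimal sketch of that precision issue in plain NumPy, not pandas internals:

import numpy as np

ts = 1618556707013635762            # same value the new test below uses
roundtrip = int(np.float64(ts))     # what a float64 round-trip yields
print(ts == roundtrip)              # False: this int64 cannot survive float64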

pandas/_libs/groupby.pyi (+4)

@@ -123,13 +123,17 @@ def group_max(
     values: np.ndarray,  # ndarray[groupby_t, ndim=2]
     labels: np.ndarray,  # const int64_t[:]
     min_count: int = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_min(
     out: np.ndarray,  # groupby_t[:, ::1]
     counts: np.ndarray,  # int64_t[::1]
     values: np.ndarray,  # ndarray[groupby_t, ndim=2]
     labels: np.ndarray,  # const int64_t[:]
     min_count: int = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_cummin(
     out: np.ndarray,  # groupby_t[:, ::1]

pandas/_libs/groupby.pyx (+31 -5)

@@ -1182,7 +1182,9 @@ cdef group_min_max(groupby_t[:, ::1] out,
                    const intp_t[::1] labels,
                    Py_ssize_t min_count=-1,
                    bint is_datetimelike=False,
-                   bint compute_max=True):
+                   bint compute_max=True,
+                   const uint8_t[:, ::1] mask=None,
+                   uint8_t[:, ::1] result_mask=None):
     """
     Compute minimum/maximum of columns of `values`, in row groups `labels`.
 
@@ -1203,6 +1205,12 @@ cdef group_min_max(groupby_t[:, ::1] out,
         True if `values` contains datetime-like entries.
     compute_max : bint, default True
         True to compute group-wise max, False to compute min
+    mask : ndarray[bool, ndim=2], optional
+        If not None, indices represent missing values,
+        otherwise the mask will not be used
+    result_mask : ndarray[bool, ndim=2], optional
+        If not None, these specify locations in the output that are NA.
+        Modified in-place.
 
     Notes
     -----
@@ -1215,6 +1223,8 @@ cdef group_min_max(groupby_t[:, ::1] out,
         ndarray[groupby_t, ndim=2] group_min_or_max
         bint runtime_error = False
         int64_t[:, ::1] nobs
+        bint uses_mask = mask is not None
+        bint isna_entry
 
     # TODO(cython 3.0):
     # Instead of `labels.shape[0]` use `len(labels)`
@@ -1249,7 +1259,12 @@ cdef group_min_max(groupby_t[:, ::1] out,
             for j in range(K):
                 val = values[i, j]
 
-                if not _treat_as_na(val, is_datetimelike):
+                if uses_mask:
+                    isna_entry = mask[i, j]
+                else:
+                    isna_entry = _treat_as_na(val, is_datetimelike)
+
+                if not isna_entry:
                     nobs[lab, j] += 1
                     if compute_max:
                         if val > group_min_or_max[lab, j]:
@@ -1265,7 +1280,10 @@ cdef group_min_max(groupby_t[:, ::1] out,
                         runtime_error = True
                         break
                     else:
-                        out[i, j] = nan_val
+                        if uses_mask:
+                            result_mask[i, j] = True
+                        else:
+                            out[i, j] = nan_val
                 else:
                     out[i, j] = group_min_or_max[i, j]
 
@@ -1282,7 +1300,9 @@ def group_max(groupby_t[:, ::1] out,
               ndarray[groupby_t, ndim=2] values,
               const intp_t[::1] labels,
              Py_ssize_t min_count=-1,
-              bint is_datetimelike=False) -> None:
+              bint is_datetimelike=False,
+              const uint8_t[:, ::1] mask=None,
+              uint8_t[:, ::1] result_mask=None) -> None:
     """See group_min_max.__doc__"""
     group_min_max(
         out,
@@ -1292,6 +1312,8 @@ def group_max(groupby_t[:, ::1] out,
         min_count=min_count,
         is_datetimelike=is_datetimelike,
         compute_max=True,
+        mask=mask,
+        result_mask=result_mask,
     )
 
 
@@ -1302,7 +1324,9 @@ def group_min(groupby_t[:, ::1] out,
               ndarray[groupby_t, ndim=2] values,
              const intp_t[::1] labels,
              Py_ssize_t min_count=-1,
-              bint is_datetimelike=False) -> None:
+              bint is_datetimelike=False,
+              const uint8_t[:, ::1] mask=None,
+              uint8_t[:, ::1] result_mask=None) -> None:
     """See group_min_max.__doc__"""
     group_min_max(
         out,
@@ -1312,6 +1336,8 @@ def group_min(groupby_t[:, ::1] out,
         min_count=min_count,
         is_datetimelike=is_datetimelike,
         compute_max=False,
+        mask=mask,
+        result_mask=result_mask,
    )
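In words: when a mask is supplied, NA entries are read from the mask instead of being inferred from sentinel values, and a group with no valid entries sets result_mask rather than writing nan_val into out. A rough pure-NumPy sketch of that logic, assuming 1-D integer values (the helper name masked_group_max is illustrative only, not pandas API):

import numpy as np

def masked_group_max(values, labels, ngroups, mask):
    # values: 1-D integer array; labels: group id per row; mask: True where NA
    out = np.empty(ngroups, dtype=values.dtype)
    result_mask = np.zeros(ngroups, dtype=bool)
    nobs = np.zeros(ngroups, dtype=np.int64)
    running = np.full(ngroups, np.iinfo(values.dtype).min, dtype=values.dtype)

    for i, lab in enumerate(labels):
        if lab < 0 or mask[i]:        # skip unassigned rows and NA entries
            continue
        nobs[lab] += 1
        if values[i] > running[lab]:
            running[lab] = values[i]

    for g in range(ngroups):
        if nobs[g] == 0:
            result_mask[g] = True     # empty group: flag NA instead of writing a sentinel
        else:
            out[g] = running[g]
    return out, result_mask

Because the reduction never leaves the integer dtype, the large timestamps from GH#41743 survive intact; all NA bookkeeping lives in the boolean masks.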

pandas/core/arrays/masked.py (+2)

@@ -123,6 +123,8 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
             raise ValueError("values must be a 1D array")
         if mask.ndim != 1:
             raise ValueError("mask must be a 1D array")
+        if values.shape != mask.shape:
+            raise ValueError("values and mask must have same shape")
 
         if copy:
             values = values.copy()
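A small usage sketch of the new guard, assuming this commit is applied (IntegerArray subclasses the BaseMaskedArray edited here, so it inherits the check):

import numpy as np
from pandas.arrays import IntegerArray

values = np.array([1, 2, 3], dtype="int64")
mask = np.array([False, True], dtype=bool)   # deliberately the wrong length

try:
    IntegerArray(values, mask)
except ValueError as err:
    print(err)   # values and mask must have same shape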

pandas/core/groupby/ops.py (+19 -3)

@@ -138,7 +138,7 @@ def __init__(self, kind: str, how: str):
         },
     }
 
-    _MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax"}
+    _MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax", "min", "max"}
 
     _cython_arity = {"ohlc": 4}  # OHLC
 
@@ -404,6 +404,7 @@ def _masked_ea_wrap_cython_operation(
 
         # Copy to ensure input and result masks don't end up shared
         mask = values._mask.copy()
+        result_mask = np.zeros(ngroups, dtype=bool)
         arr = values._data
 
         res_values = self._cython_op_ndim_compat(
@@ -412,13 +413,18 @@ def _masked_ea_wrap_cython_operation(
             ngroups=ngroups,
             comp_ids=comp_ids,
             mask=mask,
+            result_mask=result_mask,
             **kwargs,
         )
+
         dtype = self._get_result_dtype(orig_values.dtype)
         assert isinstance(dtype, BaseMaskedDtype)
         cls = dtype.construct_array_type()
 
-        return cls(res_values.astype(dtype.type, copy=False), mask)
+        if self.kind != "aggregate":
+            return cls(res_values.astype(dtype.type, copy=False), mask)
+        else:
+            return cls(res_values.astype(dtype.type, copy=False), result_mask)
 
     @final
     def _cython_op_ndim_compat(
@@ -428,20 +434,24 @@ def _cython_op_ndim_compat(
         min_count: int,
         ngroups: int,
         comp_ids: np.ndarray,
-        mask: np.ndarray | None,
+        mask: np.ndarray | None = None,
+        result_mask: np.ndarray | None = None,
         **kwargs,
     ) -> np.ndarray:
         if values.ndim == 1:
             # expand to 2d, dispatch, then squeeze if appropriate
             values2d = values[None, :]
             if mask is not None:
                 mask = mask[None, :]
+            if result_mask is not None:
+                result_mask = result_mask[None, :]
             res = self._call_cython_op(
                 values2d,
                 min_count=min_count,
                 ngroups=ngroups,
                 comp_ids=comp_ids,
                 mask=mask,
+                result_mask=result_mask,
                 **kwargs,
             )
             if res.shape[0] == 1:
@@ -456,6 +466,7 @@ def _cython_op_ndim_compat(
             ngroups=ngroups,
             comp_ids=comp_ids,
             mask=mask,
+            result_mask=result_mask,
             **kwargs,
         )
 
@@ -468,6 +479,7 @@ def _call_cython_op(
         ngroups: int,
         comp_ids: np.ndarray,
         mask: np.ndarray | None,
+        result_mask: np.ndarray | None,
         **kwargs,
     ) -> np.ndarray:  # np.ndarray[ndim=2]
         orig_values = values
@@ -493,6 +505,8 @@ def _call_cython_op(
         values = values.T
         if mask is not None:
             mask = mask.T
+        if result_mask is not None:
+            result_mask = result_mask.T
 
         out_shape = self._get_output_shape(ngroups, values)
         func, values = self.get_cython_func_and_vals(values, is_numeric)
@@ -508,6 +522,8 @@ def _call_cython_op(
                     values,
                     comp_ids,
                     min_count,
+                    mask=mask,
+                    result_mask=result_mask,
                     is_datetimelike=is_datetimelike,
                 )
             else:
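The substance of these changes: min/max now go through the masked (nullable-aware) Cython path, a per-group result_mask is allocated up front and threaded through to the kernel, and aggregations pair the output values with that result_mask while transforms such as cummin/cummax keep the row-aligned input mask. A condensed, hypothetical sketch of the wrap-up step only (not the real method body):

def wrap_masked_result(cls, res_values, dtype, kind, mask, result_mask):
    # cls is the masked array type (e.g. IntegerArray) for the result dtype
    values = res_values.astype(dtype.type, copy=False)
    if kind == "aggregate":
        # one row per group: use the result_mask the Cython kernel filled in
        return cls(values, result_mask)
    # transforms stay row-aligned, so the copied input mask is still correct
    return cls(values, mask)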

pandas/tests/groupby/test_min_max.py (+48)

@@ -177,3 +177,51 @@ def test_aggregate_categorical_lost_index(func: str):
     expected["B"] = expected["B"].astype(ds.dtype)
 
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"])
+def test_groupby_min_max_nullable(dtype):
+    if dtype == "Int64":
+        # GH#41743 avoid precision loss
+        ts = 1618556707013635762
+    elif dtype == "boolean":
+        ts = 0
+    else:
+        ts = 4.0
+
+    df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]})
+    df["ts"] = df["ts"].astype(dtype)
+
+    gb = df.groupby("id")
+
+    result = gb.min()
+    expected = df.iloc[:1].set_index("id")
+    tm.assert_frame_equal(result, expected)
+
+    res_max = gb.max()
+    expected_max = df.iloc[1:].set_index("id")
+    tm.assert_frame_equal(res_max, expected_max)
+
+    result2 = gb.min(min_count=3)
+    expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype)
+    tm.assert_frame_equal(result2, expected2)
+
+    res_max2 = gb.max(min_count=3)
+    tm.assert_frame_equal(res_max2, expected2)
+
+    # Case with NA values
+    df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]})
+    df2["ts"] = df2["ts"].astype(dtype)
+    gb2 = df2.groupby("id")
+
+    result3 = gb2.min()
+    tm.assert_frame_equal(result3, expected)
+
+    res_max3 = gb2.max()
+    tm.assert_frame_equal(res_max3, expected_max)
+
+    result4 = gb2.min(min_count=100)
+    tm.assert_frame_equal(result4, expected2)
+
+    res_max4 = gb2.max(min_count=100)
+    tm.assert_frame_equal(res_max4, expected2)
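A usage sketch of the behaviour this test pins down, assuming the commit is applied (same values as the Int64 case above):

import pandas as pd

df = pd.DataFrame(
    {
        "id": [2, 2],
        "ts": pd.array([1618556707013635762, 1618556707013635763], dtype="Int64"),
    }
)
gb = df.groupby("id")

print(gb.max())              # exact Int64 result, no float64 round-trip
print(gb.min(min_count=3))   # <NA> (still Int64) because the group has only 2 values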
