
ENH: Support skipna parameter in GroupBy min, max, prod, median, var, std and sem methods #60752


Merged (12 commits) on Feb 3, 2025
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
@@ -58,7 +58,7 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``prod``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
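A minimal usage sketch of the new parameter (illustrative data, not from the PR; with ``skipna=False`` a missing value makes its group's result NA, per the kernel changes below):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})

print(df.groupby("key")["val"].min())              # default skipna=True -> a: 1.0, b: 3.0
print(df.groupby("key")["val"].min(skipna=False))  # NaN propagates      -> a: NaN, b: 3.0
```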
2 changes: 2 additions & 0 deletions pandas/_libs/groupby.pyi
@@ -185,6 +185,7 @@ def group_max(
is_datetimelike: bool = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_min(
out: np.ndarray, # groupby_t[:, ::1]
Expand All @@ -195,6 +196,7 @@ def group_min(
is_datetimelike: bool = ...,
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
skipna: bool = ...,
) -> None: ...
def group_idxmin_idxmax(
out: npt.NDArray[np.intp],
81 changes: 49 additions & 32 deletions pandas/_libs/groupby.pyx
@@ -815,7 +815,7 @@ def group_prod(
int64float_t[:, ::1] prodx
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None

if len_values != len_labels:
raise ValueError("len(index) != len(labels)")
@@ -842,17 +842,16 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
_treat_as_na(prodx[lab, j], False)
):
# If prod is already NA, no need to update it
continue

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
else:
isna_entry = _treat_as_na(val, False)
isna_result = _treat_as_na(prodx[lab, j], False)

if not skipna and isna_result:
# If prod is already NA, no need to update it
continue

if not isna_entry:
nobs[lab, j] += 1
@@ -890,7 +889,7 @@ def group_var(
floating[:, ::1] mean
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None
bint is_std = name == "std"
bint is_sem = name == "sem"

@@ -917,25 +916,24 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and out[lab, j] == NPY_NAT) or
_treat_as_na(out[lab, j], False)
):
# If aggregate is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in a NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_var, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
isna_result = out[lab, j] == NPY_NAT
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(out[lab, j], is_datetimelike)

if not skipna and isna_result:
# If aggregate is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in a NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
@@ -1201,7 +1199,7 @@ def group_mean(
mean_t[:, ::1] sumx, compensation
int64_t[:, ::1] nobs
Py_ssize_t len_values = len(values), len_labels = len(labels)
bint isna_entry, uses_mask = mask is not None
bint isna_entry, isna_result, uses_mask = mask is not None

assert min_count == -1, "'min_count' only used in sum and prod"

@@ -1231,25 +1229,24 @@
for j in range(K):
val = values[i, j]

if not skipna and (
(uses_mask and result_mask[lab, j]) or
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
_treat_as_na(sumx[lab, j], False)
):
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
elif is_datetimelike:
# With group_mean, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
isna_result = sumx[lab, j] == NPY_NAT
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)

if not skipna and isna_result:
# If sum is already NA, don't add to it. This is important for
# datetimelike because adding a value to NPY_NAT may not result
# in NPY_NAT
continue

if not isna_entry:
nobs[lab, j] += 1
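The ``group_prod``, ``group_var``, and ``group_mean`` hunks above share one restructuring: the NA flag of the running aggregate (``isna_result``) is now computed in the same branch that computes the entry flag, and the early ``continue`` fires only once both are known. A rough pure-Python sketch of the resulting per-element update for ``group_prod`` (names are illustrative, not the Cython API; the NaN-poisoning branch is assumed from the unshown remainder of the loop, consistent with the min/max hunk below):

```python
import math

def update_prod(agg: float, nobs: int, val: float, skipna: bool = True):
    # Mirrors the restructured loop body: compute both NA flags first.
    isna_entry = math.isnan(val)
    isna_result = math.isnan(agg)
    if not skipna and isna_result:
        return agg, nobs          # product already NA: no need to update it
    if not isna_entry:
        return agg * val, nobs + 1
    if not skipna:
        return math.nan, nobs     # NaN entry poisons the running product
    return agg, nobs              # skipna=True: ignore the NaN

agg, n = 1.0, 0
for v in [2.0, float("nan"), 4.0]:
    agg, n = update_prod(agg, n, v, skipna=False)
print(agg, n)  # nan 1
```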
@@ -1843,6 +1840,7 @@ cdef group_min_max(
bint compute_max=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
):
"""
Compute minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1870,6 +1868,8 @@
result_mask : ndarray[bool, ndim=2], optional
If not None, these specify locations in the output that are NA.
Modified in-place.
skipna : bool, default True
If True, ignore nans in `values`.

Notes
-----
@@ -1878,17 +1878,18 @@
"""
cdef:
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
numeric_t val
numeric_t val, nan_val
numeric_t[:, ::1] group_min_or_max
int64_t[:, ::1] nobs
bint uses_mask = mask is not None
bint isna_entry
bint isna_entry, isna_result

if not len(values) == len(labels):
raise AssertionError("len(index) != len(labels)")

min_count = max(min_count, 1)
nobs = np.zeros((<object>out).shape, dtype=np.int64)
nan_val = _get_na_val(<numeric_t>0, is_datetimelike)

group_min_or_max = np.empty_like(out)
group_min_or_max[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike)
@@ -1907,8 +1908,15 @@

if uses_mask:
isna_entry = mask[i, j]
isna_result = result_mask[lab, j]
else:
isna_entry = _treat_as_na(val, is_datetimelike)
isna_result = _treat_as_na(group_min_or_max[lab, j],
is_datetimelike)

if not skipna and isna_result:
# If current min/max is already NA, it will always be NA
continue

if not isna_entry:
nobs[lab, j] += 1
@@ -1918,6 +1926,11 @@
else:
if val < group_min_or_max[lab, j]:
group_min_or_max[lab, j] = val
elif not skipna:
if uses_mask:
result_mask[lab, j] = True
else:
group_min_or_max[lab, j] = nan_val

_check_below_mincount(
out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
@@ -2049,6 +2062,7 @@ def group_max(
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_min_max.__doc__"""
group_min_max(
@@ -2061,6 +2075,7 @@
compute_max=True,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


@@ -2075,6 +2090,7 @@ def group_min(
bint is_datetimelike=False,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_min_max.__doc__"""
group_min_max(
@@ -2087,6 +2103,7 @@
compute_max=False,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


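Taken together, the ``group_min_max`` changes give min/max the same semantics: once a group's running result is NA it stays NA, and with ``skipna=False`` a NaN entry poisons its group. A runnable pure-Python sketch under those assumptions (hypothetical helper, ignoring ``min_count`` and masked arrays):

```python
import math

def group_min_sketch(values, labels, ngroups, skipna=True):
    out = [math.inf] * ngroups               # running minimum per group
    for val, lab in zip(values, labels):
        if not skipna and math.isnan(out[lab]):
            continue                         # result already NaN; stays NaN
        if math.isnan(val):
            if not skipna:
                out[lab] = math.nan          # NaN poisons the group
            continue                         # skipna=True: ignore the NaN
        if val < out[lab]:
            out[lab] = val
    return out

vals, labs = [1.0, float("nan"), 3.0], [0, 0, 1]
print(group_min_sketch(vals, labs, 2))                # [1.0, 3.0]
print(group_min_sketch(vals, labs, 2, skipna=False))  # [nan, 3.0]
```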
10 changes: 7 additions & 3 deletions pandas/core/_numba/kernels/min_max_.py
@@ -80,14 +80,15 @@ def sliding_min_max(
return output, na_pos


@numba.jit(nopython=True, nogil=True, parallel=False)
def grouped_min_max(
values: np.ndarray,
result_dtype: np.dtype,
labels: npt.NDArray[np.intp],
ngroups: int,
min_periods: int,
is_max: bool,
skipna: bool = True,
) -> tuple[np.ndarray, list[int]]:
N = len(labels)
nobs = np.zeros(ngroups, dtype=np.int64)
@@ -97,13 +98,16 @@
for i in range(N):
lab = labels[i]
val = values[i]
if lab < 0:
if lab < 0 or (nobs[lab] >= 1 and np.isnan(output[lab])):
continue

if values.dtype.kind == "i" or not np.isnan(val):
nobs[lab] += 1
else:
# NaN value cannot be a min/max value
if not skipna:
# If skipna is False and we encounter a NaN,
# both min and max of the group will be NaN
output[lab] = np.nan
continue

if nobs[lab] == 1:
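The Numba kernel above implements the same rule for the ``engine="numba"`` path. A hedged end-to-end check (assumes ``numba`` is installed; both engines should agree on the result):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})
gb = df.groupby("key")["val"]

print(gb.max(skipna=False))                  # Cython path: a -> NaN, b -> 3.0
print(gb.max(skipna=False, engine="numba"))  # Numba kernel above, same result
```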
12 changes: 10 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -3068,12 +3068,13 @@ def prod(

@final
@doc(
_groupby_agg_method_engine_template,
_groupby_agg_method_skipna_engine_template,
fname="min",
no=False,
mc=-1,
e=None,
ek=None,
s=True,
example=dedent(
"""\
For SeriesGroupBy:
@@ -3113,6 +3114,7 @@ def min(
self,
numeric_only: bool = False,
min_count: int = -1,
skipna: bool = True,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
@@ -3125,23 +3127,26 @@
engine_kwargs,
min_periods=min_count,
is_max=False,
skipna=skipna,
)
else:
return self._agg_general(
numeric_only=numeric_only,
min_count=min_count,
skipna=skipna,
alias="min",
npfunc=np.min,
)

@final
@doc(
_groupby_agg_method_engine_template,
_groupby_agg_method_skipna_engine_template,
fname="max",
no=False,
mc=-1,
e=None,
ek=None,
s=True,
example=dedent(
"""\
For SeriesGroupBy:
@@ -3181,6 +3186,7 @@ def max(
self,
numeric_only: bool = False,
min_count: int = -1,
skipna: bool = True,
engine: Literal["cython", "numba"] | None = None,
engine_kwargs: dict[str, bool] | None = None,
):
@@ -3193,11 +3199,13 @@
engine_kwargs,
min_periods=min_count,
is_max=True,
skipna=skipna,
)
else:
return self._agg_general(
numeric_only=numeric_only,
min_count=min_count,
skipna=skipna,
alias="max",
npfunc=np.max,
)
4 changes: 2 additions & 2 deletions pandas/tests/groupby/test_api.py
@@ -174,7 +174,7 @@ def test_frame_consistency(groupby_func):
elif groupby_func in ("nunique",):
exclude_expected = {"axis"}
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_expected = {"axis", "kwargs"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("sum", "mean", "std", "var"):
exclude_expected = {"axis", "kwargs"}
@@ -234,7 +234,7 @@ def test_series_consistency(request, groupby_func):
if groupby_func in ("any", "all"):
exclude_expected = {"kwargs", "bool_only", "axis"}
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_expected = {"axis", "kwargs"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("sum", "mean", "std", "var"):
exclude_expected = {"axis", "kwargs"}