diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 0c7cd31a10acb..54582674e5895 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -252,6 +252,7 @@ Other enhancements - :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`) - :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`) - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`) +- Add support for ``min_count`` keyword for :meth:`DataFrame.groupby` and :meth:`DataFrame.resample` for functions ``min``, ``max``, ``first`` and ``last`` (:issue:`37821`, :issue:`37768`) - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 438d9fa625737..24156c88f0d76 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -903,13 +903,12 @@ def group_last(rank_t[:, :] out, ndarray[int64_t, ndim=2] nobs bint runtime_error = False - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) if rank_t is object: resx = np.empty((out).shape, dtype=object) @@ -939,7 +938,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -961,7 +960,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -986,7 +985,8 @@ def group_last(rank_t[:, :] out, def group_nth(rank_t[:, :] out, int64_t[:] counts, ndarray[rank_t, ndim=2] values, - const int64_t[:] labels, int64_t rank=1 + const int64_t[:] labels, + int64_t min_count=-1, int64_t rank=1 ): """ Only aggregates on axis=0 @@ -1003,6 +1003,7 @@ def group_nth(rank_t[:, :] out, if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) if rank_t is object: resx = np.empty((out).shape, dtype=object) @@ -1033,7 +1034,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = resx[i, j] @@ -1057,7 +1058,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -1294,13 +1295,12 @@ def group_max(groupby_t[:, :] out, bint runtime_error = False int64_t[:, :] nobs - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) maxx = np.empty_like(out) @@ -1337,11 +1337,12 @@ def group_max(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if groupby_t is uint64_t: runtime_error = True break else: + out[i, j] = nan_val else: out[i, j] = maxx[i, j] @@ -1369,13 +1370,12 @@ def group_min(groupby_t[:, :] out, bint runtime_error = False int64_t[:, :] nobs - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) minx = np.empty_like(out) @@ -1411,7 +1411,7 @@ def group_min(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if groupby_t is uint64_t: runtime_error = True break diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fc80852f00c95..3b33b7e5ecd00 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -603,7 +603,7 @@ def _aggregate( ): if agg_func is libgroupby.group_nth: # different signature from the others - agg_func(result, counts, values, comp_ids, rank=1) + agg_func(result, counts, values, comp_ids, min_count, rank=1) else: agg_func(result, counts, values, comp_ids, min_count) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index fccedd75c4531..e5589b0dae837 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -950,7 +950,7 @@ def quantile(self, q=0.5, **kwargs): # downsample methods -for method in ["sum", "prod"]: +for method in ["sum", "prod", "min", "max", "first", "last"]: def f(self, _method=method, min_count=0, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) @@ -961,7 +961,7 @@ def f(self, _method=method, min_count=0, *args, **kwargs): # downsample methods -for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]: +for method in ["mean", "sem", "median", "ohlc"]: def g(self, _method=method, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py index 580148cb2a3a3..56cf400258f0f 100644 --- a/pandas/tests/groupby/test_missing.py +++ b/pandas/tests/groupby/test_missing.py @@ -116,3 +116,13 @@ def test_ffill_handles_nan_groups(dropna, method, has_nan_group): expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)]) +@pytest.mark.parametrize("func", ["first", "last", "max", "min"]) +def test_min_count(func, min_count, value): + # GH#37821 + df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3}) + result = getattr(df.groupby("a"), func)(min_count=min_count) + expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a")) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 5d75c22c8b795..65b50e829478d 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1785,3 +1785,16 @@ def test_resample_calendar_day_with_dst( 1.0, pd.date_range(first, exp_last, freq=freq_out, tz="Europe/Amsterdam") ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max", "first", "last"]) +def test_resample_aggregate_functions_min_count(func): + # GH#37768 + index = date_range(start="2020", freq="M", periods=3) + ser = Series([1, np.nan, np.nan], index) + result = getattr(ser.resample("Q"), func)(min_count=2) + expected = Series( + [np.nan], + index=DatetimeIndex(["2020-03-31"], dtype="datetime64[ns]", freq="Q-DEC"), + ) + tm.assert_series_equal(result, expected)