Skip to content

ENH: Add support for min_count keyword for Resample and Groupby functions #37870

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged — 8 commits were merged on Nov 26, 2020.
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ Other enhancements
- Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`)
- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`)
- Improved error reporting for subsetting columns of a :class:`DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
- Add support for ``min_count`` keyword for :meth:`DataFrame.groupby` and :meth:`DataFrame.resample` for functions ``min``, ``max``, ``first`` and ``last`` (:issue:`37821`, :issue:`37768`)

.. ---------------------------------------------------------------------------

Expand Down
22 changes: 9 additions & 13 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -903,8 +903,6 @@ def group_last(rank_t[:, :] out,
ndarray[int64_t, ndim=2] nobs
bint runtime_error = False

assert min_count == -1, "'min_count' only used in add and prod"

# TODO(cython 3.0):
# Instead of `labels.shape[0]` use `len(labels)`
if not len(values) == labels.shape[0]:
Expand Down Expand Up @@ -939,7 +937,7 @@ def group_last(rank_t[:, :] out,

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if nobs[i, j] < min_count or nobs[i, j] == 0:
out[i, j] = NAN
else:
out[i, j] = resx[i, j]
Expand All @@ -961,7 +959,7 @@ def group_last(rank_t[:, :] out,

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if nobs[i, j] < min_count or nobs[i, j] == 0:
if rank_t is int64_t:
out[i, j] = NPY_NAT
elif rank_t is uint64_t:
Expand All @@ -986,7 +984,8 @@ def group_last(rank_t[:, :] out,
def group_nth(rank_t[:, :] out,
int64_t[:] counts,
ndarray[rank_t, ndim=2] values,
const int64_t[:] labels, int64_t rank=1
const int64_t[:] labels,
int64_t min_count=-1, int64_t rank=1
):
"""
Only aggregates on axis=0
Expand Down Expand Up @@ -1033,7 +1032,7 @@ def group_nth(rank_t[:, :] out,

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if nobs[i, j] < min_count or nobs[i, j] == 0:
out[i, j] = NAN
else:
out[i, j] = resx[i, j]
Expand All @@ -1057,7 +1056,7 @@ def group_nth(rank_t[:, :] out,

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if nobs[i, j] < min_count or nobs[i, j] == 0:
if rank_t is int64_t:
out[i, j] = NPY_NAT
elif rank_t is uint64_t:
Expand Down Expand Up @@ -1294,8 +1293,6 @@ def group_max(groupby_t[:, :] out,
bint runtime_error = False
int64_t[:, :] nobs

assert min_count == -1, "'min_count' only used in add and prod"

# TODO(cython 3.0):
# Instead of `labels.shape[0]` use `len(labels)`
if not len(values) == labels.shape[0]:
Expand Down Expand Up @@ -1337,11 +1334,12 @@ def group_max(groupby_t[:, :] out,

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if nobs[i, j] < min_count or nobs[i, j] == 0:
if groupby_t is uint64_t:
runtime_error = True
break
else:

out[i, j] = nan_val
else:
out[i, j] = maxx[i, j]
Expand Down Expand Up @@ -1369,8 +1367,6 @@ def group_min(groupby_t[:, :] out,
bint runtime_error = False
int64_t[:, :] nobs

assert min_count == -1, "'min_count' only used in add and prod"

# TODO(cython 3.0):
# Instead of `labels.shape[0]` use `len(labels)`
if not len(values) == labels.shape[0]:
Expand Down Expand Up @@ -1411,7 +1407,7 @@ def group_min(groupby_t[:, :] out,

for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
if nobs[i, j] < min_count or nobs[i, j] == 0:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you avoid the double lookup/comparison here by setting min_count = max(min_count, 1) outside the loop?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

if groupby_t is uint64_t:
runtime_error = True
break
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,7 +603,7 @@ def _aggregate(
):
if agg_func is libgroupby.group_nth:
# different signature from the others
agg_func(result, counts, values, comp_ids, rank=1)
agg_func(result, counts, values, comp_ids, min_count, rank=1)
else:
agg_func(result, counts, values, comp_ids, min_count)

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -950,7 +950,7 @@ def quantile(self, q=0.5, **kwargs):


# downsample methods
for method in ["sum", "prod"]:
for method in ["sum", "prod", "min", "max", "first", "last"]:

def f(self, _method=method, min_count=0, *args, **kwargs):
nv.validate_resampler_func(_method, args, kwargs)
Expand All @@ -961,7 +961,7 @@ def f(self, _method=method, min_count=0, *args, **kwargs):


# downsample methods
for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]:
for method in ["mean", "sem", "median", "ohlc"]:

def g(self, _method=method, *args, **kwargs):
nv.validate_resampler_func(_method, args, kwargs)
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,3 +638,12 @@ def weird_func(x):

result = df["decimals"].groupby(df["id1"]).agg(weird_func)
tm.assert_series_equal(result, expected, check_names=False)


@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
def test_min_count_implementation_min_max_first_last(func):
    # GH#37821: min/max/first/last must honor the min_count keyword.
    # Group "a" has three rows but only one non-NA value in "b", so
    # min_count=2 forces the aggregated result to NaN.
    frame = DataFrame({"a": [1, 1, 1], "b": [1, np.nan, np.nan]})
    grouped = frame.groupby("a")
    result = getattr(grouped, func)(min_count=2)
    expected = DataFrame({"b": [np.nan]}, index=Index([1], name="a"))
    tm.assert_frame_equal(result, expected)
13 changes: 13 additions & 0 deletions pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1785,3 +1785,16 @@ def test_resample_calendar_day_with_dst(
1.0, pd.date_range(first, exp_last, freq=freq_out, tz="Europe/Amsterdam")
)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("func", ["min", "max", "first", "last"])
def test_resample_aggregate_functions_min_count(func):
    # GH#37768: resample aggregations min/max/first/last must honor the
    # min_count keyword.  The single quarterly bin holds only one non-NA
    # value, so min_count=2 forces the result to NaN.
    idx = date_range(start="2020", freq="M", periods=3)
    series = Series([1, np.nan, np.nan], idx)
    resampled = series.resample("Q")
    result = getattr(resampled, func)(min_count=2)
    expected_index = DatetimeIndex(
        ["2020-03-31"], dtype="datetime64[ns]", freq="Q-DEC"
    )
    expected = Series([np.nan], index=expected_index)
    tm.assert_series_equal(result, expected)