From 2b1808bf9acf46f94e0dba5cd2d1a99e572bb2c0 Mon Sep 17 00:00:00 2001 From: 0xF4D3C0D3 Date: Wed, 25 Sep 2019 12:04:57 +0900 Subject: [PATCH] BUG: value_counts can handle the case even with empty groups (#28479) * If applying rep to recons_labels fails, use ids with consecutive duplicates removed instead. --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/groupby/generic.py | 9 +++++++- pandas/tests/groupby/test_value_counts.py | 27 ++++++++++++++++++++++- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5b9e3a7dbad06..c1d47c690b548 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -243,6 +243,7 @@ Other - Using :meth:`DataFrame.replace` with overlapping keys in a nested dictionary will no longer raise, now matching the behavior of a flat dictionary (:issue:`27660`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) +- :meth:`SeriesGroupBy.value_counts` can now handle the case where the :class:`Grouper` produces empty groups (:issue:`28479`) .. 
_whatsnew_1000.contributors: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e731cffea0671..5a20bc8aee3ae 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1266,7 +1266,14 @@ def value_counts( rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] + try: + labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] + except ValueError: + # If applying rep to recons_labels fails, use ids with consecutive + # duplicates removed instead. + _ids_idx = np.ones(len(ids), dtype=bool) + _ids_idx[1:] = ids[1:] != ids[:-1] + labels = list(map(rep, [ids[_ids_idx]])) + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index c7b28822092a8..e4a8d66a28f9b 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex, Series, date_range +from pandas import DataFrame, MultiIndex, Series, date_range, Grouper from pandas.util import testing as tm @@ -78,3 +78,28 @@ def rebuild_index(df): # have to sort on index because of unstable sort on values left, right = map(rebuild_index, (left, right)) # xref GH9212 tm.assert_series_equal(left.sort_index(), right.sort_index()) + + +@pytest.mark.parametrize( + "freq, size, frac", product(["1D", "2D", "1W", "1Y"], [100, 1000], [0.1, 0.5, 1]) +) +def test_series_groupby_value_counts_with_grouper(freq, size, frac): + np.random.seed(42) + + df = DataFrame.from_dict( + { + "date": date_range("2019-09-25", periods=size), + "name": np.random.choice(list("abcd"), size), + } + ).sample(frac=frac) + + gr = df.groupby(Grouper(key="date", freq=freq))["name"] 
+ + # have to sort on index because of unstable sort on values xref GH9212 + result = gr.value_counts().sort_index() + expected = gr.apply(Series.value_counts).sort_index() + expected.index.names = ( + result.index.names + ) # .apply(Series.value_counts) can't create all names + + tm.assert_series_equal(result, expected)