diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 22f6659367683..1d733566a0e0b 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -945,6 +945,7 @@ Groupby/resample/rolling
 - Bug in :meth:`.SeriesGroupBy.nunique` would incorrectly raise when the grouper was an empty categorical and ``observed=True`` (:issue:`21334`)
 - Bug in :meth:`.SeriesGroupBy.nth` would raise when grouper contained NA values after subsetting from a :class:`DataFrameGroupBy` (:issue:`26454`)
 - Bug in :meth:`DataFrame.groupby` would not include a :class:`.Grouper` specified by ``key`` in the result when ``as_index=False`` (:issue:`50413`)
+- Bug in :meth:`.DataFrameGroupBy.value_counts` would raise when used with a :class:`.TimeGrouper` (:issue:`50486`)
 -
 
 Reshaping
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index e323877a512b0..66ad1b3ea7196 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -425,14 +425,22 @@ class Grouping:
         If we are a Categorical, use the observed values
     in_axis : if the Grouping is a column in self.obj and hence among
         Groupby.exclusions list
+    dropna : bool, default True
+        Whether to drop NA groups.
+    uniques : Array-like, optional
+        When specified, will be used for unique values. Enables including empty
+        groups in the result for a BinGrouper. Must not contain duplicates.
 
-    Returns
-    -------
-    **Attributes**:
-      * indices : dict of {group -> index_list}
-      * codes : ndarray, group codes
-      * group_index : unique groups
-      * groups : dict of {group -> label_list}
+    Attributes
+    ----------
+    indices : dict
+        Mapping of {group -> index_list}
+    codes : ndarray
+        Group codes
+    group_index : Index or None
+        Unique groups
+    groups : dict
+        Mapping of {group -> label_list}
     """
 
     _codes: npt.NDArray[np.signedinteger] | None = None
@@ -452,6 +460,7 @@ def __init__(
         observed: bool = False,
         in_axis: bool = False,
         dropna: bool = True,
+        uniques: ArrayLike | None = None,
     ) -> None:
         self.level = level
         self._orig_grouper = grouper
@@ -464,6 +473,7 @@ def __init__(
         self._observed = observed
         self.in_axis = in_axis
         self._dropna = dropna
+        self._uniques = uniques
 
         self._passed_categorical = False
 
@@ -653,6 +663,7 @@ def group_index(self) -> Index:
 
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
+        uniques: ArrayLike
        if self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper
             # preserving the categories / ordered attributes;
@@ -697,11 +708,13 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
         elif isinstance(self.grouping_vector, ops.BaseGrouper):
             # we have a list of groupers
             codes = self.grouping_vector.codes_info
-            # error: Incompatible types in assignment (expression has type "Union
-            # [ExtensionArray, ndarray[Any, Any]]", variable has type "Categorical")
-            uniques = (
-                self.grouping_vector.result_index._values  # type: ignore[assignment]
-            )
+            uniques = self.grouping_vector.result_index._values
+        elif self._uniques is not None:
+            # GH#50486 Code grouping_vector using _uniques; allows
+            # including uniques that are not present in grouping_vector.
+            cat = Categorical(self.grouping_vector, categories=self._uniques)
+            codes = cat.codes
+            uniques = self._uniques
         else:
             # GH35667, replace dropna=False with use_na_sentinel=False
             # error: Incompatible types in assignment (expression has type "Union[
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index c20fe34a178f5..ea902800cf7e0 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -1214,7 +1214,11 @@ def names(self) -> list[Hashable]:
     @property
     def groupings(self) -> list[grouper.Grouping]:
         lev = self.binlabels
-        ping = grouper.Grouping(lev, lev, in_axis=False, level=None)
+        codes = self.group_info[0]
+        labels = lev.take(codes)
+        ping = grouper.Grouping(
+            labels, labels, in_axis=False, level=None, uniques=lev._values
+        )
         return [ping]
 
     def _aggregate_series_fast(self, obj: Series, func: Callable) -> NoReturn:
diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py
index 8255fbab40dce..56aa121cd48c2 100644
--- a/pandas/tests/groupby/test_frame_value_counts.py
+++ b/pandas/tests/groupby/test_frame_value_counts.py
@@ -4,9 +4,11 @@
 from pandas import (
     CategoricalIndex,
     DataFrame,
+    Grouper,
     Index,
     MultiIndex,
     Series,
+    to_datetime,
 )
 import pandas._testing as tm
 
@@ -781,3 +783,39 @@ def test_subset_duplicate_columns():
         ),
     )
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("utc", [True, False])
+def test_value_counts_time_grouper(utc):
+    # GH#50486
+    df = DataFrame(
+        {
+            "Timestamp": [
+                1565083561,
+                1565083561 + 86400,
+                1565083561 + 86500,
+                1565083561 + 86400 * 2,
+                1565083561 + 86400 * 3,
+                1565083561 + 86500 * 3,
+                1565083561 + 86400 * 4,
+            ],
+            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
+        }
+    ).drop([3])
+
+    df["Datetime"] = to_datetime(
+        df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s"
+    )
+    gb = df.groupby(Grouper(freq="1D", key="Datetime"))
+    result = gb.value_counts()
+    dates = to_datetime(
+        ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
+    )
+    timestamps = df["Timestamp"].unique()
+    index = MultiIndex(
+        levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
+        codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
+        names=["Datetime", "Timestamp", "Food"],
+    )
+    expected = Series(1, index=index)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py
index 577a72d3f5090..11ee13ec05fd6 100644
--- a/pandas/tests/groupby/test_value_counts.py
+++ b/pandas/tests/groupby/test_value_counts.py
@@ -114,7 +114,8 @@ def rebuild_index(df):
     tm.assert_series_equal(left.sort_index(), right.sort_index())
 
 
-def test_series_groupby_value_counts_with_grouper():
+@pytest.mark.parametrize("utc", [True, False])
+def test_series_groupby_value_counts_with_grouper(utc):
     # GH28479
     df = DataFrame(
         {
@@ -131,7 +132,9 @@ def test_series_groupby_value_counts_with_grouper():
         }
     ).drop([3])
 
-    df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s")
+    df["Datetime"] = to_datetime(
+        df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s"
+    )
     dfg = df.groupby(Grouper(freq="1D", key="Datetime"))
 
     # have to sort on index because of unstable sort on values xref GH9212
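
A minimal usage sketch (not part of the patch) of the behavior this change fixes, adapted from the new test; the data here is illustrative only:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "Datetime": pd.to_datetime(
            ["2019-08-06", "2019-08-06", "2019-08-07", "2019-08-09"]
        ),
        "Food": ["apple", "apple", "banana", "pear"],
    }
)

# Before this patch, calling value_counts on a DataFrameGroupBy produced by
# a time-based Grouper raised (GH#50486); with the patch it returns the
# per-bin counts as a Series indexed by (Datetime, Food).
gb = df.groupby(pd.Grouper(freq="1D", key="Datetime"))
print(gb.value_counts())
```

The mechanism behind the fix is the new `uniques` argument threaded from `BinGrouper.groupings` into `Grouping._codes_and_uniques`: coding the grouping vector as a `Categorical` against a fixed category list keeps the codes aligned with bin labels that never occur in the data. A standalone sketch of that idea:

```python
from pandas import Categorical

# "b" never appears in the values but is kept as a category, so the codes
# stay aligned with the full category list (analogous to empty time bins).
cat = Categorical(["a", "a", "c"], categories=["a", "b", "c"])
print(cat.codes)  # [0 0 2]
```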