Skip to content

Commit 3ea04c3

Browse files
authored
BUG: DataFrameGroupBy.value_counts fails with a TimeGrouper (#50507)
* BUG: DataFrameGroupBy.value_counts fails with a TimeGrouper; * mypy fixup; * .values -> ._values
1 parent 14c8336 commit 3ea04c3

File tree

5 files changed

+73
-14
lines changed

5 files changed

+73
-14
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -948,6 +948,7 @@ Groupby/resample/rolling
948948
- Bug in :meth:`.SeriesGroupBy.nunique` would incorrectly raise when the grouper was an empty categorical and ``observed=True`` (:issue:`21334`)
949949
- Bug in :meth:`.SeriesGroupBy.nth` would raise when grouper contained NA values after subsetting from a :class:`DataFrameGroupBy` (:issue:`26454`)
950950
- Bug in :meth:`DataFrame.groupby` would not include a :class:`.Grouper` specified by ``key`` in the result when ``as_index=False`` (:issue:`50413`)
951+
- Bug in :meth:`.DataFrameGroupBy.value_counts` would raise when used with a :class:`.TimeGrouper` (:issue:`50486`)
951952
-
952953

953954
Reshaping

pandas/core/groupby/grouper.py

+24-11
Original file line numberDiff line numberDiff line change
@@ -425,14 +425,22 @@ class Grouping:
425425
If we are a Categorical, use the observed values
426426
in_axis : if the Grouping is a column in self.obj and hence among
427427
Groupby.exclusions list
428+
dropna : bool, default True
429+
Whether to drop NA groups.
430+
uniques : Array-like, optional
431+
When specified, will be used for unique values. Enables including empty groups
432+
in the result for a BinGrouper. Must not contain duplicates.
428433
429-
Returns
434+
Attributes
430435
-------
431-
**Attributes**:
432-
* indices : dict of {group -> index_list}
433-
* codes : ndarray, group codes
434-
* group_index : unique groups
435-
* groups : dict of {group -> label_list}
436+
indices : dict
437+
Mapping of {group -> index_list}
438+
codes : ndarray
439+
Group codes
440+
group_index : Index or None
441+
Unique groups
442+
groups : dict
443+
Mapping of {group -> label_list}
436444
"""
437445

438446
_codes: npt.NDArray[np.signedinteger] | None = None
@@ -452,6 +460,7 @@ def __init__(
452460
observed: bool = False,
453461
in_axis: bool = False,
454462
dropna: bool = True,
463+
uniques: ArrayLike | None = None,
455464
) -> None:
456465
self.level = level
457466
self._orig_grouper = grouper
@@ -464,6 +473,7 @@ def __init__(
464473
self._observed = observed
465474
self.in_axis = in_axis
466475
self._dropna = dropna
476+
self._uniques = uniques
467477

468478
self._passed_categorical = False
469479

@@ -653,6 +663,7 @@ def group_index(self) -> Index:
653663

654664
@cache_readonly
655665
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
666+
uniques: ArrayLike
656667
if self._passed_categorical:
657668
# we make a CategoricalIndex out of the cat grouper
658669
# preserving the categories / ordered attributes;
@@ -697,11 +708,13 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
697708
elif isinstance(self.grouping_vector, ops.BaseGrouper):
698709
# we have a list of groupers
699710
codes = self.grouping_vector.codes_info
700-
# error: Incompatible types in assignment (expression has type "Union
701-
# [ExtensionArray, ndarray[Any, Any]]", variable has type "Categorical")
702-
uniques = (
703-
self.grouping_vector.result_index._values # type: ignore[assignment]
704-
)
711+
uniques = self.grouping_vector.result_index._values
712+
elif self._uniques is not None:
713+
# GH#50486 Code grouping_vector using _uniques; allows
714+
# including uniques that are not present in grouping_vector.
715+
cat = Categorical(self.grouping_vector, categories=self._uniques)
716+
codes = cat.codes
717+
uniques = self._uniques
705718
else:
706719
# GH35667, replace dropna=False with use_na_sentinel=False
707720
# error: Incompatible types in assignment (expression has type "Union[

pandas/core/groupby/ops.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1214,7 +1214,11 @@ def names(self) -> list[Hashable]:
12141214
@property
12151215
def groupings(self) -> list[grouper.Grouping]:
12161216
lev = self.binlabels
1217-
ping = grouper.Grouping(lev, lev, in_axis=False, level=None)
1217+
codes = self.group_info[0]
1218+
labels = lev.take(codes)
1219+
ping = grouper.Grouping(
1220+
labels, labels, in_axis=False, level=None, uniques=lev._values
1221+
)
12181222
return [ping]
12191223

12201224
def _aggregate_series_fast(self, obj: Series, func: Callable) -> NoReturn:

pandas/tests/groupby/test_frame_value_counts.py

+38
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
from pandas import (
55
CategoricalIndex,
66
DataFrame,
7+
Grouper,
78
Index,
89
MultiIndex,
910
Series,
11+
to_datetime,
1012
)
1113
import pandas._testing as tm
1214

@@ -781,3 +783,39 @@ def test_subset_duplicate_columns():
781783
),
782784
)
783785
tm.assert_series_equal(result, expected)
786+
787+
788+
@pytest.mark.parametrize("utc", [True, False])
789+
def test_value_counts_time_grouper(utc):
790+
# GH#50486
791+
df = DataFrame(
792+
{
793+
"Timestamp": [
794+
1565083561,
795+
1565083561 + 86400,
796+
1565083561 + 86500,
797+
1565083561 + 86400 * 2,
798+
1565083561 + 86400 * 3,
799+
1565083561 + 86500 * 3,
800+
1565083561 + 86400 * 4,
801+
],
802+
"Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
803+
}
804+
).drop([3])
805+
806+
df["Datetime"] = to_datetime(
807+
df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s"
808+
)
809+
gb = df.groupby(Grouper(freq="1D", key="Datetime"))
810+
result = gb.value_counts()
811+
dates = to_datetime(
812+
["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
813+
)
814+
timestamps = df["Timestamp"].unique()
815+
index = MultiIndex(
816+
levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
817+
codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
818+
names=["Datetime", "Timestamp", "Food"],
819+
)
820+
expected = Series(1, index=index)
821+
tm.assert_series_equal(result, expected)

pandas/tests/groupby/test_value_counts.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,8 @@ def rebuild_index(df):
114114
tm.assert_series_equal(left.sort_index(), right.sort_index())
115115

116116

117-
def test_series_groupby_value_counts_with_grouper():
117+
@pytest.mark.parametrize("utc", [True, False])
118+
def test_series_groupby_value_counts_with_grouper(utc):
118119
# GH28479
119120
df = DataFrame(
120121
{
@@ -131,7 +132,9 @@ def test_series_groupby_value_counts_with_grouper():
131132
}
132133
).drop([3])
133134

134-
df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s")
135+
df["Datetime"] = to_datetime(
136+
df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s"
137+
)
135138
dfg = df.groupby(Grouper(freq="1D", key="Datetime"))
136139

137140
# have to sort on index because of unstable sort on values xref GH9212

0 commit comments

Comments
 (0)