From a3fb1a834faa165c1bb0ed07b006123c24992800 Mon Sep 17 00:00:00 2001
From: richard
Date: Thu, 29 Dec 2022 11:02:10 -0500
Subject: [PATCH 1/3] BUG: DataFrameGroupBy.value_counts fails with a TimeGrouper

---
 doc/source/whatsnew/v2.0.0.rst                 |  1 +
 pandas/core/groupby/grouper.py                 | 28 ++++++++++++----
 pandas/core/groupby/ops.py                     |  4 ++-
 .../tests/groupby/test_frame_value_counts.py   | 33 +++++++++++++++++++
 4 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 9957ccb4fde50..9731bf789fed7 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -932,6 +932,7 @@ Groupby/resample/rolling
 - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` with ``dropna=False`` would drop NA values when the grouper was categorical (:issue:`36327`)
 - Bug in :meth:`.SeriesGroupBy.nunique` would incorrectly raise when the grouper was an empty categorical and ``observed=True`` (:issue:`21334`)
 - Bug in :meth:`.SeriesGroupBy.nth` would raise when grouper contained NA values after subsetting from a :class:`DataFrameGroupBy` (:issue:`26454`)
+- Bug in :meth:`.DataFrameGroupBy.value_counts` would raise when used with a :class:`.TimeGrouper` (:issue:`50486`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index d4818eabeac1b..7e7d376478468 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -425,14 +425,22 @@ class Grouping:
         If we are a Categorical, use the observed values
     in_axis : if the Grouping is a column in self.obj and hence among
         Groupby.exclusions list
+    dropna : bool, default True
+        Whether to drop NA groups.
+    uniques : Array-like, optional
+        When specified, will be used for unique values. Enables including empty groups
+        in the result for a BinGrouper. Must not contain duplicates.
 
-    Returns
+    Attributes
     -------
-    **Attributes**:
-    * indices : dict of {group -> index_list}
-    * codes : ndarray, group codes
-    * group_index : unique groups
-    * groups : dict of {group -> label_list}
+    indices : dict
+        Mapping of {group -> index_list}
+    codes : ndarray
+        Group codes
+    group_index : Index or None
+        unique groups
+    groups : dict
+        Mapping of {group -> label_list}
     """
 
     _codes: npt.NDArray[np.signedinteger] | None = None
@@ -452,6 +460,7 @@ def __init__(
         observed: bool = False,
         in_axis: bool = False,
         dropna: bool = True,
+        uniques: ArrayLike | None = None,
     ) -> None:
         self.level = level
         self._orig_grouper = grouper
@@ -464,6 +473,7 @@ def __init__(
         self._observed = observed
         self.in_axis = in_axis
         self._dropna = dropna
+        self._uniques = uniques
 
         self._passed_categorical = False
 
@@ -702,6 +712,12 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
             uniques = (
                 self.grouping_vector.result_index._values  # type: ignore[assignment]
             )
+        elif self._uniques is not None:
+            # GH#50486 Code grouping_vector using _uniques; allows
+            # including uniques that are not present in grouping_vector.
+            cat = Categorical(self.grouping_vector, categories=self._uniques)
+            codes = cat.codes
+            uniques = self._uniques
         else:
             # GH35667, replace dropna=False with use_na_sentinel=False
             # error: Incompatible types in assignment (expression has type "Union[
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index c20fe34a178f5..224cf1cd9c965 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -1214,7 +1214,9 @@ def names(self) -> list[Hashable]:
     @property
     def groupings(self) -> list[grouper.Grouping]:
         lev = self.binlabels
-        ping = grouper.Grouping(lev, lev, in_axis=False, level=None)
+        codes = self.group_info[0]
+        labels = lev.take(codes)
+        ping = grouper.Grouping(labels, labels, in_axis=False, level=None, uniques=lev)
         return [ping]
 
     def _aggregate_series_fast(self, obj: Series, func: Callable) -> NoReturn:
diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py
index 8255fbab40dce..f9ccd05d625e5 100644
--- a/pandas/tests/groupby/test_frame_value_counts.py
+++ b/pandas/tests/groupby/test_frame_value_counts.py
@@ -4,9 +4,11 @@
 from pandas import (
     CategoricalIndex,
     DataFrame,
+    Grouper,
     Index,
     MultiIndex,
     Series,
+    to_datetime,
 )
 import pandas._testing as tm
 
@@ -781,3 +783,34 @@ def test_subset_duplicate_columns():
         ),
     )
     tm.assert_series_equal(result, expected)
+
+
+def test_value_counts_time_grouper():
+    # GH#50486
+    df = DataFrame(
+        {
+            "Timestamp": [
+                1565083561,
+                1565083561 + 86400,
+                1565083561 + 86500,
+                1565083561 + 86400 * 2,
+                1565083561 + 86400 * 3,
+                1565083561 + 86500 * 3,
+                1565083561 + 86400 * 4,
+            ],
+            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
+        }
+    ).drop([3])
+
+    df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s")
+    gb = df.groupby(Grouper(freq="1D", key="Datetime"))
+    result = gb.value_counts()
+    dates = to_datetime(["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"])
+    timestamps = df["Timestamp"].unique()
+    index = MultiIndex(
+        levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
+        codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
+        names=["Datetime", "Timestamp", "Food"],
+    )
+    expected = Series(1, index=index)
+    tm.assert_series_equal(result, expected)

From 71a29fd15d7b3a061f97e8b92d4630ccba149081 Mon Sep 17 00:00:00 2001
From: richard
Date: Fri, 30 Dec 2022 13:57:56 -0500
Subject: [PATCH 2/3] mypy fixup

---
 pandas/core/groupby/grouper.py | 7 ++-----
 pandas/core/groupby/ops.py     | 4 +++-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 3681307319808..66ad1b3ea7196 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -663,6 +663,7 @@ def group_index(self) -> Index:
 
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
+        uniques: ArrayLike
         if self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper
             # preserving the categories / ordered attributes;
@@ -707,11 +708,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
         elif isinstance(self.grouping_vector, ops.BaseGrouper):
             # we have a list of groupers
             codes = self.grouping_vector.codes_info
-            # error: Incompatible types in assignment (expression has type "Union
-            # [ExtensionArray, ndarray[Any, Any]]", variable has type "Categorical")
-            uniques = (
-                self.grouping_vector.result_index._values  # type: ignore[assignment]
-            )
+            uniques = self.grouping_vector.result_index._values
         elif self._uniques is not None:
             # GH#50486 Code grouping_vector using _uniques; allows
             # including uniques that are not present in grouping_vector.
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 224cf1cd9c965..b9f67ec0e5808 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -1216,7 +1216,9 @@ def groupings(self) -> list[grouper.Grouping]:
         lev = self.binlabels
         codes = self.group_info[0]
         labels = lev.take(codes)
-        ping = grouper.Grouping(labels, labels, in_axis=False, level=None, uniques=lev)
+        ping = grouper.Grouping(
+            labels, labels, in_axis=False, level=None, uniques=lev.values
+        )
         return [ping]
 
     def _aggregate_series_fast(self, obj: Series, func: Callable) -> NoReturn:

From 692d9b7fb18213986e80cc5dc7ad55248d14c54f Mon Sep 17 00:00:00 2001
From: richard
Date: Mon, 2 Jan 2023 23:16:10 -0500
Subject: [PATCH 3/3] .values -> ._values

---
 pandas/core/groupby/ops.py                      |  2 +-
 pandas/tests/groupby/test_frame_value_counts.py | 11 ++++++++---
 pandas/tests/groupby/test_value_counts.py       |  7 +++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index b9f67ec0e5808..ea902800cf7e0 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -1217,7 +1217,7 @@ def groupings(self) -> list[grouper.Grouping]:
         codes = self.group_info[0]
         labels = lev.take(codes)
         ping = grouper.Grouping(
-            labels, labels, in_axis=False, level=None, uniques=lev.values
+            labels, labels, in_axis=False, level=None, uniques=lev._values
         )
         return [ping]
 
diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py
index f9ccd05d625e5..56aa121cd48c2 100644
--- a/pandas/tests/groupby/test_frame_value_counts.py
+++ b/pandas/tests/groupby/test_frame_value_counts.py
@@ -785,7 +785,8 @@ def test_subset_duplicate_columns():
     tm.assert_series_equal(result, expected)
 
 
-def test_value_counts_time_grouper():
+@pytest.mark.parametrize("utc", [True, False])
+def test_value_counts_time_grouper(utc):
     # GH#50486
     df = DataFrame(
         {
@@ -802,10 +803,14 @@ def test_value_counts_time_grouper(utc):
         }
     ).drop([3])
 
-    df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s")
+    df["Datetime"] = to_datetime(
+        df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s"
+    )
     gb = df.groupby(Grouper(freq="1D", key="Datetime"))
     result = gb.value_counts()
-    dates = to_datetime(["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"])
+    dates = to_datetime(
+        ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
+    )
     timestamps = df["Timestamp"].unique()
     index = MultiIndex(
         levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py
index 577a72d3f5090..11ee13ec05fd6 100644
--- a/pandas/tests/groupby/test_value_counts.py
+++ b/pandas/tests/groupby/test_value_counts.py
@@ -114,7 +114,8 @@ def rebuild_index(df):
     tm.assert_series_equal(left.sort_index(), right.sort_index())
 
 
-def test_series_groupby_value_counts_with_grouper():
+@pytest.mark.parametrize("utc", [True, False])
+def test_series_groupby_value_counts_with_grouper(utc):
     # GH28479
     df = DataFrame(
         {
@@ -131,7 +132,9 @@ def test_series_groupby_value_counts_with_grouper():
         }
     ).drop([3])
 
-    df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s")
+    df["Datetime"] = to_datetime(
+        df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s"
+    )
     dfg = df.groupby(Grouper(freq="1D", key="Datetime"))
 
     # have to sort on index because of unstable sort on values xref GH9212
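
A minimal usage sketch of the behaviour this series fixes, based on the new
test_value_counts_time_grouper above but not taken verbatim from the patches:
grouping with a time-based Grouper and then calling
DataFrameGroupBy.value_counts, which previously raised (GH#50486). All names
below are existing pandas API; only the cut-down three-row frame is
illustrative.

    import pandas as pd

    df = pd.DataFrame(
        {
            "Timestamp": [1565083561, 1565083561 + 86400, 1565083561 + 86500],
            "Food": ["apple", "apple", "banana"],
        }
    )
    # Convert epoch seconds to datetimes and bin the rows into 1-day groups.
    df["Datetime"] = pd.to_datetime(df["Timestamp"], unit="s")
    gb = df.groupby(pd.Grouper(freq="1D", key="Datetime"))
    # Before this fix the call below raised; with BinGrouper.groupings now
    # passing its bin labels as `uniques`, it counts the value combinations
    # of the remaining columns within each daily bin.
    print(gb.value_counts())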