Skip to content

Commit 3ea04c3

Browse files
authored
BUG: DataFrameGroupBy.value_counts fails with a TimeGrouper (#50507)
* BUG: DataFrameGroupBy.value_counts fails with a TimeGrouper; * mypy fixup; * .values -> ._values
1 parent 14c8336 commit 3ea04c3

File tree

5 files changed

+73
-14
lines changed

5 files changed

+73
-14
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -948,6 +948,7 @@ Groupby/resample/rolling
948948
- Bug in :meth:`.SeriesGroupBy.nunique` would incorrectly raise when the grouper was an empty categorical and ``observed=True`` (:issue:`21334`)
949949
- Bug in :meth:`.SeriesGroupBy.nth` would raise when grouper contained NA values after subsetting from a :class:`DataFrameGroupBy` (:issue:`26454`)
950950
- Bug in :meth:`DataFrame.groupby` would not include a :class:`.Grouper` specified by ``key`` in the result when ``as_index=False`` (:issue:`50413`)
951+
- Bug in :meth:`.DataFrameGroupBy.value_counts` would raise when used with a :class:`.TimeGrouper` (:issue:`50486`)
951952
-
952953

953954
Reshaping

pandas/core/groupby/grouper.py

+24-11
Original file line numberDiff line numberDiff line change
@@ -425,14 +425,22 @@ class Grouping:
425425
If we are a Categorical, use the observed values
426426
in_axis : if the Grouping is a column in self.obj and hence among
427427
Groupby.exclusions list
428+
dropna : bool, default True
429+
Whether to drop NA groups.
430+
uniques : Array-like, optional
431+
When specified, will be used for unique values. Enables including empty groups
432+
in the result for a BinGrouper. Must not contain duplicates.
428433
429-
Returns
434+
Attributes
430435
-------
431-
**Attributes**:
432-
* indices : dict of {group -> index_list}
433-
* codes : ndarray, group codes
434-
* group_index : unique groups
435-
* groups : dict of {group -> label_list}
436+
indices : dict
437+
Mapping of {group -> index_list}
438+
codes : ndarray
439+
Group codes
440+
group_index : Index or None
441+
Unique groups
442+
groups : dict
443+
Mapping of {group -> label_list}
436444
"""
437445

438446
_codes: npt.NDArray[np.signedinteger] | None = None
@@ -452,6 +460,7 @@ def __init__(
452460
observed: bool = False,
453461
in_axis: bool = False,
454462
dropna: bool = True,
463+
uniques: ArrayLike | None = None,
455464
) -> None:
456465
self.level = level
457466
self._orig_grouper = grouper
@@ -464,6 +473,7 @@ def __init__(
464473
self._observed = observed
465474
self.in_axis = in_axis
466475
self._dropna = dropna
476+
self._uniques = uniques
467477

468478
self._passed_categorical = False
469479

@@ -653,6 +663,7 @@ def group_index(self) -> Index:
653663

654664
@cache_readonly
655665
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
666+
uniques: ArrayLike
656667
if self._passed_categorical:
657668
# we make a CategoricalIndex out of the cat grouper
658669
# preserving the categories / ordered attributes;
@@ -697,11 +708,13 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
697708
elif isinstance(self.grouping_vector, ops.BaseGrouper):
698709
# we have a list of groupers
699710
codes = self.grouping_vector.codes_info
700-
# error: Incompatible types in assignment (expression has type "Union
701-
# [ExtensionArray, ndarray[Any, Any]]", variable has type "Categorical")
702-
uniques = (
703-
self.grouping_vector.result_index._values # type: ignore[assignment]
704-
)
711+
uniques = self.grouping_vector.result_index._values
712+
elif self._uniques is not None:
713+
# GH#50486 Code grouping_vector using _uniques; allows
714+
# including uniques that are not present in grouping_vector.
715+
cat = Categorical(self.grouping_vector, categories=self._uniques)
716+
codes = cat.codes
717+
uniques = self._uniques
705718
else:
706719
# GH35667, replace dropna=False with use_na_sentinel=False
707720
# error: Incompatible types in assignment (expression has type "Union[

pandas/core/groupby/ops.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1214,7 +1214,11 @@ def names(self) -> list[Hashable]:
12141214
@property
12151215
def groupings(self) -> list[grouper.Grouping]:
12161216
lev = self.binlabels
1217-
ping = grouper.Grouping(lev, lev, in_axis=False, level=None)
1217+
codes = self.group_info[0]
1218+
labels = lev.take(codes)
1219+
ping = grouper.Grouping(
1220+
labels, labels, in_axis=False, level=None, uniques=lev._values
1221+
)
12181222
return [ping]
12191223

12201224
def _aggregate_series_fast(self, obj: Series, func: Callable) -> NoReturn:

pandas/tests/groupby/test_frame_value_counts.py

+38
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
from pandas import (
55
CategoricalIndex,
66
DataFrame,
7+
Grouper,
78
Index,
89
MultiIndex,
910
Series,
11+
to_datetime,
1012
)
1113
import pandas._testing as tm
1214

@@ -781,3 +783,39 @@ def test_subset_duplicate_columns():
781783
),
782784
)
783785
tm.assert_series_equal(result, expected)
786+
787+
788+
@pytest.mark.parametrize("utc", [True, False])
789+
def test_value_counts_time_grouper(utc):
790+
# GH#50486
791+
df = DataFrame(
792+
{
793+
"Timestamp": [
794+
1565083561,
795+
1565083561 + 86400,
796+
1565083561 + 86500,
797+
1565083561 + 86400 * 2,
798+
1565083561 + 86400 * 3,
799+
1565083561 + 86500 * 3,
800+
1565083561 + 86400 * 4,
801+
],
802+
"Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
803+
}
804+
).drop([3])
805+
806+
df["Datetime"] = to_datetime(
807+
df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s"
808+
)
809+
gb = df.groupby(Grouper(freq="1D", key="Datetime"))
810+
result = gb.value_counts()
811+
dates = to_datetime(
812+
["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
813+
)
814+
timestamps = df["Timestamp"].unique()
815+
index = MultiIndex(
816+
levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
817+
codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
818+
names=["Datetime", "Timestamp", "Food"],
819+
)
820+
expected = Series(1, index=index)
821+
tm.assert_series_equal(result, expected)

pandas/tests/groupby/test_value_counts.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,8 @@ def rebuild_index(df):
114114
tm.assert_series_equal(left.sort_index(), right.sort_index())
115115

116116

117-
def test_series_groupby_value_counts_with_grouper():
117+
@pytest.mark.parametrize("utc", [True, False])
118+
def test_series_groupby_value_counts_with_grouper(utc):
118119
# GH28479
119120
df = DataFrame(
120121
{
@@ -131,7 +132,9 @@ def test_series_groupby_value_counts_with_grouper():
131132
}
132133
).drop([3])
133134

134-
df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s")
135+
df["Datetime"] = to_datetime(
136+
df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s"
137+
)
135138
dfg = df.groupby(Grouper(freq="1D", key="Datetime"))
136139

137140
# have to sort on index because of unstable sort on values xref GH9212

0 commit comments

Comments
 (0)