
BUG: DataFrameGroupBy.value_counts fails with a TimeGrouper #50507

Merged: 5 commits, Jan 3, 2023
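
For orientation before the diffs, here is a minimal reproducer of GH#50486, adapted from the tests added below (a sketch; the exact exception raised before the fix is not shown here):

import pandas as pd

df = pd.DataFrame(
    {
        "Timestamp": [1565083561, 1565083561 + 86400, 1565083561 + 86500],
        "Food": ["apple", "apple", "banana"],
    }
)
df["Datetime"] = pd.to_datetime(df["Timestamp"], unit="s")

gb = df.groupby(pd.Grouper(freq="1D", key="Datetime"))
print(gb.value_counts())  # raised on affected versions; returns per-day counts with this PR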
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
@@ -945,6 +945,7 @@ Groupby/resample/rolling
 - Bug in :meth:`.SeriesGroupBy.nunique` would incorrectly raise when the grouper was an empty categorical and ``observed=True`` (:issue:`21334`)
 - Bug in :meth:`.SeriesGroupBy.nth` would raise when grouper contained NA values after subsetting from a :class:`DataFrameGroupBy` (:issue:`26454`)
 - Bug in :meth:`DataFrame.groupby` would not include a :class:`.Grouper` specified by ``key`` in the result when ``as_index=False`` (:issue:`50413`)
+- Bug in :meth:`.DataFrameGroupBy.value_counts` would raise when used with a :class:`.TimeGrouper` (:issue:`50486`)
 -
 
 Reshaping
35 changes: 24 additions & 11 deletions pandas/core/groupby/grouper.py
@@ -425,14 +425,22 @@ class Grouping:
         If we are a Categorical, use the observed values
     in_axis : if the Grouping is a column in self.obj and hence among
         Groupby.exclusions list
+    dropna : bool, default True
+        Whether to drop NA groups.
+    uniques : Array-like, optional
+        When specified, will be used for unique values. Enables including empty groups
+        in the result for a BinGrouper. Must not contain duplicates.
 
-    Returns
+    Attributes
     -------
-    **Attributes**:
-      * indices : dict of {group -> index_list}
-      * codes : ndarray, group codes
-      * group_index : unique groups
-      * groups : dict of {group -> label_list}
+    indices : dict
+        Mapping of {group -> index_list}
+    codes : ndarray
+        Group codes
+    group_index : Index or None
+        unique groups
+    groups : dict
+        Mapping of {group -> label_list}
     """
 
     _codes: npt.NDArray[np.signedinteger] | None = None
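
An aside on the new ``uniques`` parameter documented above: a BinGrouper built from a TimeGrouper can contain bins with no observations, so the unique groups cannot be recovered from the row-level values alone. A small user-level illustration of that (assumed data, output abbreviated):

import pandas as pd

df = pd.DataFrame(
    {"Datetime": pd.to_datetime(["2019-01-01", "2019-01-03"]), "x": [1, 2]}
)
# The daily grouper creates a bin for 2019-01-02 even though no row falls on
# that day, so that bin label never appears among the row-level values.
print(df.groupby(pd.Grouper(freq="1D", key="Datetime")).size())
# 2019-01-01    1
# 2019-01-02    0
# 2019-01-03    1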
@@ -452,6 +460,7 @@ def __init__(
         observed: bool = False,
         in_axis: bool = False,
         dropna: bool = True,
+        uniques: ArrayLike | None = None,
     ) -> None:
         self.level = level
         self._orig_grouper = grouper
@@ -464,6 +473,7 @@
         self._observed = observed
         self.in_axis = in_axis
         self._dropna = dropna
+        self._uniques = uniques
 
         self._passed_categorical = False
 
@@ -653,6 +663,7 @@ def group_index(self) -> Index:
 
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
+        uniques: ArrayLike
         if self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper
             # preserving the categories / ordered attributes;
@@ -697,11 +708,13 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
         elif isinstance(self.grouping_vector, ops.BaseGrouper):
             # we have a list of groupers
             codes = self.grouping_vector.codes_info
-            # error: Incompatible types in assignment (expression has type "Union
-            # [ExtensionArray, ndarray[Any, Any]]", variable has type "Categorical")
-            uniques = (
-                self.grouping_vector.result_index._values  # type: ignore[assignment]
-            )
+            uniques = self.grouping_vector.result_index._values
+        elif self._uniques is not None:
+            # GH#50486 Code grouping_vector using _uniques; allows
+            # including uniques that are not present in grouping_vector.
+            cat = Categorical(self.grouping_vector, categories=self._uniques)
+            codes = cat.codes
+            uniques = self._uniques
         else:
             # GH35667, replace dropna=False with use_na_sentinel=False
             # error: Incompatible types in assignment (expression has type "Union[
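
A standalone sketch of what the new ``elif self._uniques is not None`` branch does: coding values against a fixed category list gives every category a code slot, including categories absent from the data, which is what lets empty bins survive into the result (toy values, not from the PR):

import pandas as pd

values = ["a", "c", "a"]   # grouping vector; "b" never occurs
uniques = ["a", "b", "c"]  # all bin labels, including the empty bin "b"
cat = pd.Categorical(values, categories=uniques)
print(cat.codes)             # [0 2 0] -- code 1 ("b") is simply unused
print(list(cat.categories))  # ['a', 'b', 'c']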
6 changes: 5 additions & 1 deletion pandas/core/groupby/ops.py
@@ -1214,7 +1214,11 @@ def names(self) -> list[Hashable]:
     @property
     def groupings(self) -> list[grouper.Grouping]:
         lev = self.binlabels
-        ping = grouper.Grouping(lev, lev, in_axis=False, level=None)
+        codes = self.group_info[0]
+        labels = lev.take(codes)
+        ping = grouper.Grouping(
+            labels, labels, in_axis=False, level=None, uniques=lev._values
+        )
         return [ping]
 
     def _aggregate_series_fast(self, obj: Series, func: Callable) -> NoReturn:
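
The ``groupings`` change above replaces the per-bin labels with per-row labels while preserving the full set of bins via ``uniques``. A toy version of the expansion (hypothetical values):

import pandas as pd

lev = pd.Index(["2019-08-06", "2019-08-07"])  # one label per bin (self.binlabels)
codes = [0, 0, 1]                             # bin code for each row (group_info[0])
labels = lev.take(codes)                      # one label per row
print(list(labels))  # ['2019-08-06', '2019-08-06', '2019-08-07']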
38 changes: 38 additions & 0 deletions pandas/tests/groupby/test_frame_value_counts.py
@@ -4,9 +4,11 @@
 from pandas import (
     CategoricalIndex,
     DataFrame,
+    Grouper,
     Index,
     MultiIndex,
     Series,
+    to_datetime,
 )
 import pandas._testing as tm

@@ -781,3 +783,39 @@ def test_subset_duplicate_columns():
         ),
     )
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("utc", [True, False])
+def test_value_counts_time_grouper(utc):
+    # GH#50486
+    df = DataFrame(
+        {
+            "Timestamp": [
+                1565083561,
+                1565083561 + 86400,
+                1565083561 + 86500,
+                1565083561 + 86400 * 2,
+                1565083561 + 86400 * 3,
+                1565083561 + 86500 * 3,
+                1565083561 + 86400 * 4,
+            ],
+            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
+        }
+    ).drop([3])
+
+    df["Datetime"] = to_datetime(
+        df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s"
+    )
+    gb = df.groupby(Grouper(freq="1D", key="Datetime"))
+    result = gb.value_counts()
+    dates = to_datetime(
+        ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
+    )
+    timestamps = df["Timestamp"].unique()
+    index = MultiIndex(
+        levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
+        codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
+        names=["Datetime", "Timestamp", "Food"],
+    )
+    expected = Series(1, index=index)
+    tm.assert_series_equal(result, expected)
7 changes: 5 additions & 2 deletions pandas/tests/groupby/test_value_counts.py
@@ -114,7 +114,8 @@ def rebuild_index(df):
     tm.assert_series_equal(left.sort_index(), right.sort_index())
 
 
-def test_series_groupby_value_counts_with_grouper():
+@pytest.mark.parametrize("utc", [True, False])
+def test_series_groupby_value_counts_with_grouper(utc):
     # GH28479
     df = DataFrame(
         {
@@ -131,7 +132,9 @@
         }
     ).drop([3])
 
-    df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s")
+    df["Datetime"] = to_datetime(
+        df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s"
+    )
     dfg = df.groupby(Grouper(freq="1D", key="Datetime"))
 
     # have to sort on index because of unstable sort on values xref GH9212