Skip to content

Commit d924d0b

Browse files
authored
BUG: DataFrameGroupBy.value_counts includes non-observed categories of non-grouping columns (#46798)
1 parent 85738b3 commit d924d0b

File tree

3 files changed

+364
-29
lines changed

3 files changed

+364
-29
lines changed

doc/source/whatsnew/v1.5.0.rst

+44-3
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,48 @@ upon serialization. (Related issue :issue:`12997`)
393393
# Roundtripping now works
394394
pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
395395
396+
397+
.. _whatsnew_150.notable_bug_fixes.groupby_value_counts_categorical:
398+
399+
DataFrameGroupBy.value_counts with non-grouping categorical columns and ``observed=True``
400+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
401+
402+
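Calling :meth:`.DataFrameGroupBy.value_counts` with ``observed=True`` would incorrectly drop non-observed categories of non-grouping columns (:issue:`46357`). Non-observed categories of non-grouping columns are now included in the result with a count of zero, regardless of ``observed``.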
403+
404+
.. code-block:: ipython
405+
406+
In [6]: df = pd.DataFrame(["a", "b", "c"], dtype="category").iloc[0:2]
407+
In [7]: df
408+
Out[7]:
409+
0
410+
0 a
411+
1 b
412+
413+
*Old Behavior*
414+
415+
.. code-block:: ipython
416+
417+
In [8]: df.groupby(level=0, observed=True).value_counts()
418+
Out[8]:
419+
0 a 1
420+
1 b 1
421+
dtype: int64
422+
423+
424+
*New Behavior*
425+
426+
.. code-block:: ipython
427+
428+
In [9]: df.groupby(level=0, observed=True).value_counts()
429+
Out[9]:
430+
0 a 1
431+
1 a 0
432+
b 1
433+
0 b 0
434+
c 0
435+
1 c 0
436+
dtype: int64
437+
396438
.. ---------------------------------------------------------------------------
397439
.. _whatsnew_150.api_breaking:
398440

@@ -820,9 +862,8 @@ Bug fixes
820862

821863
Categorical
822864
^^^^^^^^^^^
823-
- Bug in :meth:`.Categorical.view` not accepting integer dtypes (:issue:`25464`)
824-
- Bug in :meth:`.CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`)
825-
-
865+
- Bug in :meth:`Categorical.view` not accepting integer dtypes (:issue:`25464`)
866+
- Bug in :meth:`CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`)
826867

827868
Datetimelike
828869
^^^^^^^^^^^^

pandas/core/groupby/generic.py

+19-2
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
reconstruct_func,
7171
validate_func_kwargs,
7272
)
73+
from pandas.core.arrays.categorical import Categorical
7374
import pandas.core.common as com
7475
from pandas.core.construction import create_series_with_explicit_dtype
7576
from pandas.core.frame import DataFrame
@@ -87,6 +88,7 @@
8788
MultiIndex,
8889
all_indexes_same,
8990
)
91+
from pandas.core.indexes.category import CategoricalIndex
9092
from pandas.core.series import Series
9193
from pandas.core.shared_docs import _shared_docs
9294
from pandas.core.util.numba_ import maybe_use_numba
@@ -1824,6 +1826,7 @@ def value_counts(
18241826
key=key,
18251827
axis=self.axis,
18261828
sort=self.sort,
1829+
observed=False,
18271830
dropna=dropna,
18281831
)
18291832
groupings += list(grouper.groupings)
@@ -1837,6 +1840,19 @@ def value_counts(
18371840
)
18381841
result_series = cast(Series, gb.size())
18391842

1843+
# GH-46357 Include non-observed categories
1844+
# of non-grouping columns regardless of `observed`
1845+
if any(
1846+
isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
1847+
and not grouping._observed
1848+
for grouping in groupings
1849+
):
1850+
levels_list = [ping.result_index for ping in groupings]
1851+
multi_index, _ = MultiIndex.from_product(
1852+
levels_list, names=[ping.name for ping in groupings]
1853+
).sortlevel()
1854+
result_series = result_series.reindex(multi_index, fill_value=0)
1855+
18401856
if normalize:
18411857
# Normalize the results by dividing by the original group sizes.
18421858
# We are guaranteed to have the first N levels be the
@@ -1847,12 +1863,13 @@ def value_counts(
18471863
indexed_group_size = result_series.groupby(
18481864
result_series.index.droplevel(levels),
18491865
sort=self.sort,
1850-
observed=self.observed,
18511866
dropna=self.dropna,
18521867
).transform("sum")
1853-
18541868
result_series /= indexed_group_size
18551869

1870+
# Handle groups of non-observed categories
1871+
result_series = result_series.fillna(0.0)
1872+
18561873
if sort:
18571874
# Sort the values and then resort by the main grouping
18581875
index_level = range(len(self.grouper.groupings))

0 commit comments

Comments
 (0)