Skip to content

Commit 6fa1755

Browse files
committed
DataFrameGroupBy.value_counts includes non-observed categories of non-grouping columns
1 parent d2a7eff commit 6fa1755

File tree

3 files changed

+107
-28
lines changed

3 files changed

+107
-28
lines changed

doc/source/whatsnew/v1.5.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -634,7 +634,8 @@ Categorical
634634
^^^^^^^^^^^
635635
- Bug in :meth:`Categorical.view` not accepting integer dtypes (:issue:`25464`)
636636
- Bug in :meth:`CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`)
637-
-
637+
- Bug in :meth:`DataFrameGroupBy.value_counts` where non-observed categories of non-grouping columns were dropped when ``observed=True``; they are now included regardless of ``observed`` (:issue:`46357`)
638+
638639

639640
Datetimelike
640641
^^^^^^^^^^^^

pandas/core/groupby/generic.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
reconstruct_func,
6969
validate_func_kwargs,
7070
)
71+
from pandas.core.arrays.categorical import Categorical
7172
from pandas.core.base import SpecificationError
7273
import pandas.core.common as com
7374
from pandas.core.construction import create_series_with_explicit_dtype
@@ -87,6 +88,7 @@
8788
MultiIndex,
8889
all_indexes_same,
8990
)
91+
from pandas.core.indexes.category import CategoricalIndex
9092
from pandas.core.series import Series
9193
from pandas.core.shared_docs import _shared_docs
9294
from pandas.core.util.numba_ import maybe_use_numba
@@ -1786,6 +1788,7 @@ def value_counts(
17861788
key=key,
17871789
axis=self.axis,
17881790
sort=self.sort,
1791+
observed=False,
17891792
dropna=dropna,
17901793
)
17911794
groupings += list(grouper.groupings)
@@ -1799,6 +1802,19 @@ def value_counts(
17991802
)
18001803
result_series = cast(Series, gb.size())
18011804

1805+
# GH-46357 Include non-observed categories
1806+
# of non-grouping columns regardless of `observed`
1807+
if any(
1808+
isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
1809+
and not grouping._observed
1810+
for grouping in groupings
1811+
):
1812+
levels_list = [ping.group_index for ping in groupings]
1813+
multi_index, _ = MultiIndex.from_product(
1814+
levels_list, names=[ping.name for ping in groupings]
1815+
).sortlevel()
1816+
result_series = result_series.reindex(multi_index, fill_value=0)
1817+
18021818
if normalize:
18031819
# Normalize the results by dividing by the original group sizes.
18041820
# We are guaranteed to have the first N levels be the
@@ -1809,10 +1825,8 @@ def value_counts(
18091825
indexed_group_size = result_series.groupby(
18101826
result_series.index.droplevel(levels),
18111827
sort=self.sort,
1812-
observed=self.observed,
18131828
dropna=self.dropna,
18141829
).transform("sum")
1815-
18161830
result_series /= indexed_group_size
18171831

18181832
if sort:

pandas/tests/groupby/test_frame_value_counts.py

+89-25
Original file line numberDiff line numberDiff line change
@@ -308,66 +308,130 @@ def test_data_frame_value_counts_dropna(
308308
tm.assert_series_equal(result_frame_groupby, expected)
309309

310310

311-
@pytest.mark.parametrize("as_index", [False, True])
311+
@pytest.mark.parametrize("as_index", [True, False])
312312
@pytest.mark.parametrize(
313313
"observed, expected_index",
314314
[
315315
(
316316
False,
317317
[
318-
("FR", "male", "low"),
319-
("FR", "female", "high"),
320-
("FR", "male", "medium"),
321-
("FR", "female", "low"),
322-
("FR", "female", "medium"),
323-
("FR", "male", "high"),
324-
("US", "female", "high"),
325-
("US", "male", "low"),
326-
("US", "female", "low"),
327-
("US", "female", "medium"),
328-
("US", "male", "high"),
329-
("US", "male", "medium"),
318+
("FR", "high", "female"),
319+
("FR", "high", "male"),
320+
("FR", "low", "male"),
321+
("FR", "low", "female"),
322+
("FR", "medium", "male"),
323+
("FR", "medium", "female"),
324+
("US", "high", "female"),
325+
("US", "high", "male"),
326+
("US", "low", "male"),
327+
("US", "low", "female"),
328+
("US", "medium", "female"),
329+
("US", "medium", "male"),
330330
],
331331
),
332332
(
333333
True,
334334
[
335-
("FR", "male", "low"),
336-
("FR", "female", "high"),
337-
("FR", "male", "medium"),
338-
("US", "female", "high"),
339-
("US", "male", "low"),
335+
("FR", "high", "female"),
336+
("FR", "low", "male"),
337+
("FR", "medium", "male"),
338+
("US", "high", "female"),
339+
("US", "low", "male"),
340340
],
341341
),
342342
],
343343
)
344344
@pytest.mark.parametrize(
345345
"normalize, expected_data",
346346
[
347-
(False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)),
347+
(False, np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64)),
348348
(
349349
True,
350-
np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
350+
# NaN values correspond to non-observed groups
351+
np.array(
352+
[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, np.nan, np.nan]
353+
),
351354
),
352355
],
353356
)
354-
def test_categorical(
357+
def test_categorical_groupers(
355358
education_df, as_index, observed, expected_index, normalize, expected_data
356359
):
357-
# Test categorical data whether or not observed
358-
gp = education_df.astype("category").groupby(
359-
"country", as_index=as_index, observed=observed
360+
education_df = education_df.copy()
361+
education_df["country"] = education_df["country"].astype("category")
362+
education_df["education"] = education_df["education"].astype("category")
363+
364+
gp = education_df.groupby(
365+
["country", "education"], as_index=as_index, observed=observed
360366
)
361367
result = gp.value_counts(normalize=normalize)
362368

363369
expected_series = Series(
364370
data=expected_data[expected_data > 0.0] if observed else expected_data,
371+
index=MultiIndex.from_tuples(
372+
expected_index,
373+
names=["country", "education", "gender"],
374+
),
375+
)
376+
for i in range(2):
377+
expected_series.index = expected_series.index.set_levels(
378+
CategoricalIndex(expected_series.index.levels[i]), level=i
379+
)
380+
381+
if as_index:
382+
tm.assert_series_equal(result, expected_series)
383+
else:
384+
expected = expected_series.reset_index(
385+
name="proportion" if normalize else "count"
386+
)
387+
tm.assert_frame_equal(result, expected)
388+
389+
390+
@pytest.mark.parametrize("as_index", [False, True])
391+
@pytest.mark.parametrize("observed", [False, True])
392+
@pytest.mark.parametrize(
393+
"normalize, expected_data",
394+
[
395+
(False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)),
396+
(
397+
True,
398+
# Non-observed categories of non-grouping columns get a proportion of 0.0
399+
np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
400+
),
401+
],
402+
)
403+
def test_categorical_values(education_df, as_index, observed, normalize, expected_data):
404+
# Test non-observed categories are included in the result,
405+
# regardless of `observed`
406+
education_df = education_df.copy()
407+
education_df["gender"] = education_df["gender"].astype("category")
408+
education_df["education"] = education_df["education"].astype("category")
409+
410+
gp = education_df.groupby("country", as_index=as_index, observed=observed)
411+
result = gp.value_counts(normalize=normalize)
412+
413+
expected_index = [
414+
("FR", "male", "low"),
415+
("FR", "female", "high"),
416+
("FR", "male", "medium"),
417+
("FR", "female", "low"),
418+
("FR", "female", "medium"),
419+
("FR", "male", "high"),
420+
("US", "female", "high"),
421+
("US", "male", "low"),
422+
("US", "female", "low"),
423+
("US", "female", "medium"),
424+
("US", "male", "high"),
425+
("US", "male", "medium"),
426+
]
427+
expected_series = Series(
428+
data=expected_data,
365429
index=MultiIndex.from_tuples(
366430
expected_index,
367431
names=["country", "gender", "education"],
368432
),
369433
)
370-
for i in range(3):
434+
for i in range(1, 3):
371435
expected_series.index = expected_series.index.set_levels(
372436
CategoricalIndex(expected_series.index.levels[i]), level=i
373437
)

0 commit comments

Comments
 (0)