Skip to content

Commit db359a2

Browse files
committed
DataFrameGroupBy.value_counts includes non-observed categories of non-grouping columns
1 parent ec75436 commit db359a2

File tree

3 files changed

+107
-28
lines changed

3 files changed

+107
-28
lines changed

doc/source/whatsnew/v1.5.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,8 @@ Categorical
463463
^^^^^^^^^^^
464464
- Bug in :meth:`Categorical.view` not accepting integer dtypes (:issue:`25464`)
465465
- Bug in :meth:`CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`)
466-
-
466+
- :meth:`DataFrameGroupBy.value_counts` now includes non-observed categories of non-grouping columns regardless of ``observed`` (:issue:`46357`)
467+
467468

468469
Datetimelike
469470
^^^^^^^^^^^^

pandas/core/groupby/generic.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
reconstruct_func,
6565
validate_func_kwargs,
6666
)
67+
from pandas.core.arrays.categorical import Categorical
6768
from pandas.core.base import SpecificationError
6869
import pandas.core.common as com
6970
from pandas.core.construction import create_series_with_explicit_dtype
@@ -83,6 +84,7 @@
8384
MultiIndex,
8485
all_indexes_same,
8586
)
87+
from pandas.core.indexes.category import CategoricalIndex
8688
from pandas.core.series import Series
8789
from pandas.core.shared_docs import _shared_docs
8890
from pandas.core.util.numba_ import maybe_use_numba
@@ -1747,6 +1749,7 @@ def value_counts(
17471749
key=key,
17481750
axis=self.axis,
17491751
sort=self.sort,
1752+
observed=False,
17501753
dropna=dropna,
17511754
)
17521755
groupings += list(grouper.groupings)
@@ -1760,6 +1763,19 @@ def value_counts(
17601763
)
17611764
result_series = cast(Series, gb.size())
17621765

1766+
# GH-46357 Include non-observed categories
1767+
# of non-grouping columns regardless of `observed`
1768+
if any(
1769+
isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
1770+
and not grouping._observed
1771+
for grouping in groupings
1772+
):
1773+
levels_list = [ping.group_index for ping in groupings]
1774+
multi_index, _ = MultiIndex.from_product(
1775+
levels_list, names=[ping.name for ping in groupings]
1776+
).sortlevel()
1777+
result_series = result_series.reindex(multi_index, fill_value=0)
1778+
17631779
if normalize:
17641780
# Normalize the results by dividing by the original group sizes.
17651781
# We are guaranteed to have the first N levels be the
@@ -1770,10 +1786,8 @@ def value_counts(
17701786
indexed_group_size = result_series.groupby(
17711787
result_series.index.droplevel(levels),
17721788
sort=self.sort,
1773-
observed=self.observed,
17741789
dropna=self.dropna,
17751790
).transform("sum")
1776-
17771791
result_series /= indexed_group_size
17781792

17791793
if sort:

pandas/tests/groupby/test_frame_value_counts.py

+89-25
Original file line numberDiff line numberDiff line change
@@ -308,66 +308,130 @@ def test_data_frame_value_counts_dropna(
308308
tm.assert_series_equal(result_frame_groupby, expected)
309309

310310

311-
@pytest.mark.parametrize("as_index", [False, True])
311+
@pytest.mark.parametrize("as_index", [True, False])
312312
@pytest.mark.parametrize(
313313
"observed, expected_index",
314314
[
315315
(
316316
False,
317317
[
318-
("FR", "male", "low"),
319-
("FR", "female", "high"),
320-
("FR", "male", "medium"),
321-
("FR", "female", "low"),
322-
("FR", "female", "medium"),
323-
("FR", "male", "high"),
324-
("US", "female", "high"),
325-
("US", "male", "low"),
326-
("US", "female", "low"),
327-
("US", "female", "medium"),
328-
("US", "male", "high"),
329-
("US", "male", "medium"),
318+
("FR", "high", "female"),
319+
("FR", "high", "male"),
320+
("FR", "low", "male"),
321+
("FR", "low", "female"),
322+
("FR", "medium", "male"),
323+
("FR", "medium", "female"),
324+
("US", "high", "female"),
325+
("US", "high", "male"),
326+
("US", "low", "male"),
327+
("US", "low", "female"),
328+
("US", "medium", "female"),
329+
("US", "medium", "male"),
330330
],
331331
),
332332
(
333333
True,
334334
[
335-
("FR", "male", "low"),
336-
("FR", "female", "high"),
337-
("FR", "male", "medium"),
338-
("US", "female", "high"),
339-
("US", "male", "low"),
335+
("FR", "high", "female"),
336+
("FR", "low", "male"),
337+
("FR", "medium", "male"),
338+
("US", "high", "female"),
339+
("US", "low", "male"),
340340
],
341341
),
342342
],
343343
)
344344
@pytest.mark.parametrize(
345345
"normalize, expected_data",
346346
[
347-
(False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)),
347+
(False, np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64)),
348348
(
349349
True,
350-
np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
350+
# NaN values correspond to non-observed groups
351+
np.array(
352+
[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, np.nan, np.nan]
353+
),
351354
),
352355
],
353356
)
354-
def test_categorical(
357+
def test_categorical_groupers(
355358
education_df, as_index, observed, expected_index, normalize, expected_data
356359
):
357-
# Test categorical data whether or not observed
358-
gp = education_df.astype("category").groupby(
359-
"country", as_index=as_index, observed=observed
360+
education_df = education_df.copy()
361+
education_df["country"] = education_df["country"].astype("category")
362+
education_df["education"] = education_df["education"].astype("category")
363+
364+
gp = education_df.groupby(
365+
["country", "education"], as_index=as_index, observed=observed
360366
)
361367
result = gp.value_counts(normalize=normalize)
362368

363369
expected_series = Series(
364370
data=expected_data[expected_data > 0.0] if observed else expected_data,
371+
index=MultiIndex.from_tuples(
372+
expected_index,
373+
names=["country", "education", "gender"],
374+
),
375+
)
376+
for i in range(2):
377+
expected_series.index = expected_series.index.set_levels(
378+
CategoricalIndex(expected_series.index.levels[i]), level=i
379+
)
380+
381+
if as_index:
382+
tm.assert_series_equal(result, expected_series)
383+
else:
384+
expected = expected_series.reset_index(
385+
name="proportion" if normalize else "count"
386+
)
387+
tm.assert_frame_equal(result, expected)
388+
389+
390+
@pytest.mark.parametrize("as_index", [False, True])
391+
@pytest.mark.parametrize("observed", [False, True])
392+
@pytest.mark.parametrize(
393+
"normalize, expected_data",
394+
[
395+
(False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)),
396+
(
397+
True,
398+
# Zero proportions correspond to non-observed combinations of category values
399+
np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
400+
),
401+
],
402+
)
403+
def test_categorical_values(education_df, as_index, observed, normalize, expected_data):
404+
# Test non-observed categories are included in the result,
405+
# regardless of `observed`
406+
education_df = education_df.copy()
407+
education_df["gender"] = education_df["gender"].astype("category")
408+
education_df["education"] = education_df["education"].astype("category")
409+
410+
gp = education_df.groupby("country", as_index=as_index, observed=observed)
411+
result = gp.value_counts(normalize=normalize)
412+
413+
expected_index = [
414+
("FR", "male", "low"),
415+
("FR", "female", "high"),
416+
("FR", "male", "medium"),
417+
("FR", "female", "low"),
418+
("FR", "female", "medium"),
419+
("FR", "male", "high"),
420+
("US", "female", "high"),
421+
("US", "male", "low"),
422+
("US", "female", "low"),
423+
("US", "female", "medium"),
424+
("US", "male", "high"),
425+
("US", "male", "medium"),
426+
]
427+
expected_series = Series(
428+
data=expected_data,
365429
index=MultiIndex.from_tuples(
366430
expected_index,
367431
names=["country", "gender", "education"],
368432
),
369433
)
370-
for i in range(3):
434+
for i in range(1, 3):
371435
expected_series.index = expected_series.index.set_levels(
372436
CategoricalIndex(expected_series.index.levels[i]), level=i
373437
)

0 commit comments

Comments
 (0)