Skip to content

Commit ddf50d4

Browse files
author
Lucas
committed
Fix unobserved categories dtype and normalize behavior + add tests for single grouper
1 parent 270ef3e commit ddf50d4

File tree

3 files changed

+164
-8
lines changed

3 files changed

+164
-8
lines changed

doc/source/whatsnew/v1.5.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ Categorical
584584
^^^^^^^^^^^
585585
- Bug in :meth:`Categorical.view` not accepting integer dtypes (:issue:`25464`)
586586
- Bug in :meth:`CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`)
587-
- Bug in :meth:`DataFrameGroupBy.value_counts` includes non-observed categories of non-grouping columns regardless of ``observed`` (:issue:`46357`)
587+
- Bug in :meth:`.DataFrameGroupBy.value_counts` includes non-observed categories of non-grouping columns regardless of ``observed`` (:issue:`46357`)
588588

589589

590590
Datetimelike

pandas/core/groupby/generic.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1780,7 +1780,7 @@ def value_counts(
17801780
and not grouping._observed
17811781
for grouping in groupings
17821782
):
1783-
levels_list = [ping.group_index for ping in groupings]
1783+
levels_list = [ping.result_index for ping in groupings]
17841784
multi_index, _ = MultiIndex.from_product(
17851785
levels_list, names=[ping.name for ping in groupings]
17861786
).sortlevel()
@@ -1800,6 +1800,9 @@ def value_counts(
18001800
).transform("sum")
18011801
result_series /= indexed_group_size
18021802

1803+
# Handle empty groups of non-observed categories
1804+
result_series = result_series.fillna(0.0)
1805+
18031806
if sort:
18041807
# Sort the values and then resort by the main grouping
18051808
index_level = range(len(self.grouper.groupings))

pandas/tests/groupby/test_frame_value_counts.py

+159-6
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,156 @@ def test_data_frame_value_counts_dropna(
308308
tm.assert_series_equal(result_frame_groupby, expected)
309309

310310

311+
def _test_categorical_single_grouper(
312+
education_df, as_index, observed, expected_index, normalize, expected_data
313+
):
314+
# Test single categorical grouper when non-groupers are also categorical
315+
education_df = education_df.copy().astype("category")
316+
317+
# Add non-observed grouping categories
318+
education_df["country"] = education_df["country"].cat.add_categories(["ASIA"])
319+
320+
gp = education_df.groupby("country", as_index=as_index, observed=observed)
321+
result = gp.value_counts(normalize=normalize)
322+
323+
expected_series = Series(
324+
data=expected_data,
325+
index=MultiIndex.from_tuples(
326+
expected_index,
327+
names=["country", "gender", "education"],
328+
),
329+
)
330+
for i in range(3):
331+
index_level = CategoricalIndex(expected_series.index.levels[i])
332+
if i == 0:
333+
index_level = index_level.set_categories(
334+
education_df["country"].cat.categories
335+
)
336+
expected_series.index = expected_series.index.set_levels(index_level, level=i)
337+
338+
if as_index:
339+
tm.assert_series_equal(result, expected_series)
340+
else:
341+
expected = expected_series.reset_index(
342+
name="proportion" if normalize else "count"
343+
)
344+
tm.assert_frame_equal(result, expected)
345+
346+
347+
@pytest.mark.parametrize("as_index", [True, False])
348+
@pytest.mark.parametrize(
349+
"normalize, expected_data",
350+
[
351+
(False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)),
352+
(
353+
True,
354+
np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
355+
),
356+
],
357+
)
358+
def test_categorical_single_grouper_observed_true(
359+
education_df, as_index, normalize, expected_data
360+
):
361+
# GH#46357
362+
363+
expected_index = [
364+
("FR", "male", "low"),
365+
("FR", "female", "high"),
366+
("FR", "male", "medium"),
367+
("FR", "female", "low"),
368+
("FR", "female", "medium"),
369+
("FR", "male", "high"),
370+
("US", "female", "high"),
371+
("US", "male", "low"),
372+
("US", "female", "low"),
373+
("US", "female", "medium"),
374+
("US", "male", "high"),
375+
("US", "male", "medium"),
376+
]
377+
378+
_test_categorical_single_grouper(
379+
education_df=education_df,
380+
as_index=as_index,
381+
observed=True,
382+
expected_index=expected_index,
383+
normalize=normalize,
384+
expected_data=expected_data,
385+
)
386+
387+
388+
@pytest.mark.parametrize("as_index", [True, False])
389+
@pytest.mark.parametrize(
390+
"normalize, expected_data",
391+
[
392+
(
393+
False,
394+
np.array(
395+
[2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64
396+
),
397+
),
398+
(
399+
True,
400+
np.array(
401+
[
402+
0.5,
403+
0.25,
404+
0.25,
405+
0.0,
406+
0.0,
407+
0.0,
408+
0.5,
409+
0.5,
410+
0.0,
411+
0.0,
412+
0.0,
413+
0.0,
414+
0.0,
415+
0.0,
416+
0.0,
417+
0.0,
418+
0.0,
419+
0.0,
420+
]
421+
),
422+
),
423+
],
424+
)
425+
def test_categorical_single_grouper_observed_false(
426+
education_df, as_index, normalize, expected_data
427+
):
428+
# GH#46357
429+
430+
expected_index = [
431+
("FR", "male", "low"),
432+
("FR", "female", "high"),
433+
("FR", "male", "medium"),
434+
("FR", "female", "low"),
435+
("FR", "male", "high"),
436+
("FR", "female", "medium"),
437+
("US", "female", "high"),
438+
("US", "male", "low"),
439+
("US", "male", "medium"),
440+
("US", "male", "high"),
441+
("US", "female", "medium"),
442+
("US", "female", "low"),
443+
("ASIA", "male", "low"),
444+
("ASIA", "male", "high"),
445+
("ASIA", "female", "medium"),
446+
("ASIA", "female", "low"),
447+
("ASIA", "female", "high"),
448+
("ASIA", "male", "medium"),
449+
]
450+
451+
_test_categorical_single_grouper(
452+
education_df=education_df,
453+
as_index=as_index,
454+
observed=False,
455+
expected_index=expected_index,
456+
normalize=normalize,
457+
expected_data=expected_data,
458+
)
459+
460+
311461
@pytest.mark.parametrize("as_index", [True, False])
312462
@pytest.mark.parametrize(
313463
"observed, expected_index",
@@ -348,15 +498,16 @@ def test_data_frame_value_counts_dropna(
348498
(
349499
True,
350500
# The last two values correspond to non-observed groups
351-
np.array(
352-
[1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, np.nan, np.nan]
353-
),
501+
np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]),
354502
),
355503
],
356504
)
357-
def test_categorical_groupers(
505+
def test_categorical_multiple_groupers(
358506
education_df, as_index, observed, expected_index, normalize, expected_data
359507
):
508+
# GH#46357
509+
510+
# Test multiple categorical groupers when non-groupers are non-categorical
360511
education_df = education_df.copy()
361512
education_df["country"] = education_df["country"].astype("category")
362513
education_df["education"] = education_df["education"].astype("category")
@@ -400,8 +551,10 @@ def test_categorical_groupers(
400551
),
401552
],
402553
)
403-
def test_categorical_values(education_df, as_index, observed, normalize, expected_data):
404-
# Test non-observed categories are included in the result,
554+
def test_categorical_non_groupers(
555+
education_df, as_index, observed, normalize, expected_data
556+
):
557+
# GH#46357 Test non-observed categories are included in the result,
405558
# regardless of `observed`
406559
education_df = education_df.copy()
407560
education_df["gender"] = education_df["gender"].astype("category")

0 commit comments

Comments
 (0)