Skip to content

Commit 81aa308

Browse files
author
Lucas
committed
Restore original single grouper test + add notable fix sub-section
1 parent 8cc3e49 commit 81aa308

File tree

2 files changed

+100
-0
lines changed

2 files changed

+100
-0
lines changed

doc/source/whatsnew/v1.5.0.rst

+40
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,46 @@ upon serialization. (Related issue :issue:`12997`)
389389
# Roundtripping now works
390390
pd.read_json(a.to_json(date_format='iso'), typ="series").index == a.index
391391
392+
393+
DataFrameGroupBy.value_counts with non-grouping categorical columns and ``observed=True``
394+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
395+
396+
Previously, calling :meth:`.DataFrameGroupBy.value_counts` with ``observed=True`` incorrectly dropped non-observed categories of non-grouping columns; these categories are now kept in the result with a count of zero (:issue:`46357`).
397+
398+
.. code-block:: ipython
399+
400+
In [6]: df = pd.DataFrame(["a", "b", "c"], dtype="category").iloc[0:2]
401+
In [7]: df
402+
Out[7]:
403+
0
404+
0 a
405+
1 b
406+
407+
*Old Behavior*
408+
409+
.. code-block:: ipython
410+
411+
In [8]: df.groupby(level=0, observed=True).value_counts()
412+
Out[8]:
413+
0 a 1
414+
1 b 1
415+
dtype: int64
416+
417+
418+
*New Behavior*
419+
420+
.. code-block:: ipython
421+
422+
In [9]: df.groupby(level=0, observed=True).value_counts()
423+
Out[9]:
424+
0 a 1
425+
1 a 0
426+
b 1
427+
0 b 0
428+
c 0
429+
1 c 0
430+
dtype: int64
431+
392432
.. ---------------------------------------------------------------------------
393433
.. _whatsnew_150.api_breaking:
394434

pandas/tests/groupby/test_frame_value_counts.py

+60
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,66 @@ def test_data_frame_value_counts_dropna(
308308
tm.assert_series_equal(result_frame_groupby, expected)
309309

310310

311+
@pytest.mark.parametrize("as_index", [False, True])
@pytest.mark.parametrize("observed", [False, True])
@pytest.mark.parametrize(
    "normalize, expected_data",
    [
        (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)),
        (
            True,
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_single_grouper_with_only_observed_categories(
    education_df, as_index, observed, normalize, expected_data
):
    # Group by a single categorical column ("country") while the remaining,
    # non-grouping columns are categorical as well.
    # The same expected values are used for both settings of ``observed``:
    # every (gender, education) combination is listed per country, with
    # zeros for the combinations that do not occur.
    grouped = education_df.astype("category").groupby(
        "country", as_index=as_index, observed=observed
    )
    result = grouped.value_counts(normalize=normalize)

    index_rows = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "female", "medium"),
        ("FR", "male", "high"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "female", "low"),
        ("US", "female", "medium"),
        ("US", "male", "high"),
        ("US", "male", "medium"),
    ]
    expected_index = MultiIndex.from_tuples(
        index_rows, names=["country", "gender", "education"]
    )

    # Convert every level to a CategoricalIndex so the expected index uses
    # categorical levels, as the frame was cast with astype("category").
    for level_number in range(expected_index.nlevels):
        expected_index = expected_index.set_levels(
            CategoricalIndex(expected_index.levels[level_number]), level=level_number
        )

    expected_series = Series(data=expected_data, index=expected_index)

    if not as_index:
        # as_index=False yields a DataFrame; the values column is named after
        # the kind of result that was requested.
        value_column = "proportion" if normalize else "count"
        expected = expected_series.reset_index(name=value_column)
        tm.assert_frame_equal(result, expected)
        return

    tm.assert_series_equal(result, expected_series)
369+
370+
311371
def assert_categorical_single_grouper(
312372
education_df, as_index, observed, expected_index, normalize, expected_data
313373
):

0 commit comments

Comments
 (0)