DataFrameGroupBy.value_counts includes non-observed categories of non-grouping columns

LucasG0 · LucasG0 · commit 95c141cfd17b · 2022-04-18T17:24:34.000+02:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -463,7 +463,8 @@ Categorical
 ^^^^^^^^^^^
 - Bug in :meth:`Categorical.view` not accepting integer dtypes (:issue:`25464`)
 - Bug in :meth:`CategoricalIndex.union` when the index's categories are integer-dtype and the index contains ``NaN`` values incorrectly raising instead of casting to ``float64`` (:issue:`45362`)
--
+- Bug in :meth:`DataFrameGroupBy.value_counts` includes non-observed categories of non-grouping columns regardless of ``observed`` (:issue:`46357`)
+
 
 Datetimelike
 ^^^^^^^^^^^^
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -64,6 +64,7 @@
     reconstruct_func,
     validate_func_kwargs,
 )
+from pandas.core.arrays.categorical import Categorical
 from pandas.core.base import SpecificationError
 import pandas.core.common as com
 from pandas.core.construction import create_series_with_explicit_dtype
@@ -83,6 +84,7 @@
     MultiIndex,
     all_indexes_same,
 )
+from pandas.core.indexes.category import CategoricalIndex
 from pandas.core.series import Series
 from pandas.core.shared_docs import _shared_docs
 from pandas.core.util.numba_ import maybe_use_numba
@@ -1747,6 +1749,7 @@ def value_counts(
                     key=key,
                     axis=self.axis,
                     sort=self.sort,
+                    observed=False,
                     dropna=dropna,
                 )
                 groupings += list(grouper.groupings)
@@ -1760,6 +1763,19 @@ def value_counts(
             )
             result_series = cast(Series, gb.size())
 
+            # GH-46357 Include non-observed categories
+            # of non-grouping columns regardless of `observed`
+            if any(
+                isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
+                and not grouping._observed
+                for grouping in groupings
+            ):
+                levels_list = [ping.group_index for ping in groupings]
+                index, _ = MultiIndex.from_product(
+                    levels_list, names=[ping.name for ping in groupings]
+                ).sortlevel()
+                result_series = result_series.reindex(index, fill_value=0)
+
             if normalize:
                 # Normalize the results by dividing by the original group sizes.
                 # We are guaranteed to have the first N levels be the
@@ -1770,10 +1786,8 @@ def value_counts(
                 indexed_group_size = result_series.groupby(
                     result_series.index.droplevel(levels),
                     sort=self.sort,
-                    observed=self.observed,
                     dropna=self.dropna,
                 ).transform("sum")
-
                 result_series /= indexed_group_size
 
             if sort:
diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py
@@ -308,66 +308,130 @@ def test_data_frame_value_counts_dropna(
     tm.assert_series_equal(result_frame_groupby, expected)
 
 
-@pytest.mark.parametrize("as_index", [False, True])
+@pytest.mark.parametrize("as_index", [True, False])
 @pytest.mark.parametrize(
     "observed, expected_index",
     [
         (
             False,
             [
-                ("FR", "male", "low"),
-                ("FR", "female", "high"),
-                ("FR", "male", "medium"),
-                ("FR", "female", "low"),
-                ("FR", "female", "medium"),
-                ("FR", "male", "high"),
-                ("US", "female", "high"),
-                ("US", "male", "low"),
-                ("US", "female", "low"),
-                ("US", "female", "medium"),
-                ("US", "male", "high"),
-                ("US", "male", "medium"),
+                ("FR", "high", "female"),
+                ("FR", "high", "male"),
+                ("FR", "low", "male"),
+                ("FR", "low", "female"),
+                ("FR", "medium", "male"),
+                ("FR", "medium", "female"),
+                ("US", "high", "female"),
+                ("US", "high", "male"),
+                ("US", "low", "male"),
+                ("US", "low", "female"),
+                ("US", "medium", "female"),
+                ("US", "medium", "male"),
             ],
         ),
         (
             True,
             [
-                ("FR", "male", "low"),
-                ("FR", "female", "high"),
-                ("FR", "male", "medium"),
-                ("US", "female", "high"),
-                ("US", "male", "low"),
+                ("FR", "high", "female"),
+                ("FR", "low", "male"),
+                ("FR", "medium", "male"),
+                ("US", "high", "female"),
+                ("US", "low", "male"),
             ],
         ),
     ],
 )
 @pytest.mark.parametrize(
     "normalize, expected_data",
     [
-        (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)),
+        (False, np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64)),
         (
             True,
-            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
+            # NaN values corresponds to non-observed groups
+            np.array(
+                [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, np.nan, np.nan]
+            ),
         ),
     ],
 )
-def test_categorical(
+def test_categorical_groupers(
     education_df, as_index, observed, expected_index, normalize, expected_data
 ):
-    # Test categorical data whether or not observed
-    gp = education_df.astype("category").groupby(
-        "country", as_index=as_index, observed=observed
+    education_df = education_df.copy()
+    education_df["country"] = education_df["country"].astype("category")
+    education_df["education"] = education_df["education"].astype("category")
+
+    gp = education_df.groupby(
+        ["country", "education"], as_index=as_index, observed=observed
     )
     result = gp.value_counts(normalize=normalize)
 
     expected_series = Series(
         data=expected_data[expected_data > 0.0] if observed else expected_data,
+        index=MultiIndex.from_tuples(
+            expected_index,
+            names=["country", "education", "gender"],
+        ),
+    )
+    for i in range(2):
+        expected_series.index = expected_series.index.set_levels(
+            CategoricalIndex(expected_series.index.levels[i]), level=i
+        )
+
+    if as_index:
+        tm.assert_series_equal(result, expected_series)
+    else:
+        expected = expected_series.reset_index(
+            name="proportion" if normalize else "count"
+        )
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("as_index", [False, True])
+@pytest.mark.parametrize("observed", [False, True])
+@pytest.mark.parametrize(
+    "normalize, expected_data",
+    [
+        (False, np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64)),
+        (
+            True,
+            # NaN values corresponds to non-observed groups
+            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
+        ),
+    ],
+)
+def test_categorical_values(education_df, as_index, observed, normalize, expected_data):
+    # Test non-observed categories are included in the result,
+    # regardless of `observed`
+    education_df = education_df.copy()
+    education_df["gender"] = education_df["gender"].astype("category")
+    education_df["education"] = education_df["education"].astype("category")
+
+    gp = education_df.groupby("country", as_index=as_index, observed=observed)
+    result = gp.value_counts(normalize=normalize)
+
+    expected_index = [
+        ("FR", "male", "low"),
+        ("FR", "female", "high"),
+        ("FR", "male", "medium"),
+        ("FR", "female", "low"),
+        ("FR", "female", "medium"),
+        ("FR", "male", "high"),
+        ("US", "female", "high"),
+        ("US", "male", "low"),
+        ("US", "female", "low"),
+        ("US", "female", "medium"),
+        ("US", "male", "high"),
+        ("US", "male", "medium"),
+    ]
+    expected_series = Series(
+        data=expected_data,
         index=MultiIndex.from_tuples(
             expected_index,
             names=["country", "gender", "education"],
         ),
     )
-    for i in range(3):
+    for i in range(1, 3):
         expected_series.index = expected_series.index.set_levels(
             CategoricalIndex(expected_series.index.levels[i]), level=i
         )