BUG: SeriesGroupBy.value_counts - index name missing in categorical columns (#45625)

NumberPiOso · web-flow · commit c80b14511cc2 · 2022-02-05T15:19:27.000-08:00
* BUG: SeriesGroupBy.value_counts index name missing Issue #44324 * TST: Change test to correct categorical naming Value counts tend to preserve index names #45625 Change test test_sorting_with_different_categoricals to comply to this change * REF: Refactor conditionals in value_counts() * RFT: correct mistake introduced via RFT In line with 44324 * RFT: Change variable names and comment #38672 * BUG: Update conditional to is None to consider series
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -227,6 +227,7 @@ Datetimelike
 - Bug in :func:`to_datetime` with sequences of ``np.str_`` objects incorrectly raising (:issue:`32264`)
 - Bug in :class:`Timestamp` construction when passing datetime components as positional arguments and ``tzinfo`` as a keyword argument incorrectly raising (:issue:`31929`)
 - Bug in :meth:`Index.astype` when casting from object dtype to ``timedelta64[ns]`` dtype incorrectly casting ``np.datetime64("NaT")`` values to ``np.timedelta64("NaT")`` instead of raising (:issue:`45722`)
+- Bug in :meth:`SeriesGroupBy.value_counts` index when passing categorical column (:issue:`44324`)
 -
 
 Timedelta
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -602,23 +602,23 @@ def value_counts(
         ids, _, _ = self.grouper.group_info
         val = self.obj._values
 
-        def apply_series_value_counts():
-            return self.apply(
+        names = self.grouper.names + [self.obj.name]
+
+        if is_categorical_dtype(val.dtype) or (
+            bins is not None and not np.iterable(bins)
+        ):
+            # scalar bins cannot be done at top level
+            # in a backward compatible way
+            # GH38672 relates to categorical dtype
+            ser = self.apply(
                 Series.value_counts,
                 normalize=normalize,
                 sort=sort,
                 ascending=ascending,
                 bins=bins,
             )
-
-        if bins is not None:
-            if not np.iterable(bins):
-                # scalar bins cannot be done at top level
-                # in a backward compatible way
-                return apply_series_value_counts()
-        elif is_categorical_dtype(val.dtype):
-            # GH38672
-            return apply_series_value_counts()
+            ser.index.names = names
+            return ser
 
         # groupby removes null keys from groupings
         mask = ids != -1
@@ -683,7 +683,6 @@ def apply_series_value_counts():
         levels = [ping.group_index for ping in self.grouper.groupings] + [
             lev  # type: ignore[list-item]
         ]
-        names = self.grouper.names + [self.obj.name]
 
         if dropna:
             mask = codes[-1] != -1
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -271,7 +271,7 @@ def test_sorting_with_different_categoricals():
     index = ["low", "med", "high", "low", "med", "high"]
     index = Categorical(index, categories=["low", "med", "high"], ordered=True)
     index = [["A", "A", "A", "B", "B", "B"], CategoricalIndex(index)]
-    index = MultiIndex.from_arrays(index, names=["group", None])
+    index = MultiIndex.from_arrays(index, names=["group", "dose"])
     expected = Series([2] * 6, index=index, name="dose")
     tm.assert_series_equal(result, expected)
 
diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py
@@ -22,6 +22,26 @@
 import pandas._testing as tm
 
 
+def tests_value_counts_index_names_category_column():
+    # GH44324 Missing name of index category column
+    df = DataFrame(
+        {
+            "gender": ["female"],
+            "country": ["US"],
+        }
+    )
+    df["gender"] = df["gender"].astype("category")
+    result = df.groupby("country")["gender"].value_counts()
+
+    # Construct expected, very specific multiindex
+    df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"])
+    df_mi_expected["gender"] = df_mi_expected["gender"].astype("category")
+    mi_expected = MultiIndex.from_frame(df_mi_expected)
+    expected = Series([1], index=mi_expected, name="gender")
+
+    tm.assert_series_equal(result, expected)
+
+
 # our starting frame
 def seed_df(seed_nans, n, m):
     np.random.seed(1234)

Original file line number	Diff line number	Diff line change
`@@ -227,6 +227,7 @@ Datetimelike`
`227`	`227`	- Bug in :func:`to_datetime` with sequences of ``np.str_`` objects incorrectly raising (:issue:`32264`)
`228`	`228`	- Bug in :class:`Timestamp` construction when passing datetime components as positional arguments and ``tzinfo`` as a keyword argument incorrectly raising (:issue:`31929`)
`229`	`229`	- Bug in :meth:`Index.astype` when casting from object dtype to ``timedelta64[ns]`` dtype incorrectly casting ``np.datetime64("NaT")`` values to ``np.timedelta64("NaT")`` instead of raising (:issue:`45722`)
	`230`	+- Bug in :meth:`SeriesGroupBy.value_counts` index when passing categorical column (:issue:`44324`)
`230`	`231`	`-`
`231`	`232`
`232`	`233`	`Timedelta`