Skip to content

Commit c80b145

Browse files
authored
BUG: SeriesGroupBy.value_counts - index name missing in categorical columns (#45625)
* BUG: SeriesGroupBy.value_counts index name missing (issue #44324)
* TST: Change test to correct categorical naming. Value counts tend to preserve index names (#45625). Change test test_sorting_with_different_categoricals to comply with this change
* REF: Refactor conditionals in value_counts()
* RFT: Correct mistake introduced via RFT, in line with #44324
* RFT: Change variable names and comment (#38672)
* BUG: Update conditional to `is None` to consider series
1 parent 21bbee6 commit c80b145

File tree

4 files changed

+33
-13
lines changed

4 files changed

+33
-13
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ Datetimelike
227227
- Bug in :func:`to_datetime` with sequences of ``np.str_`` objects incorrectly raising (:issue:`32264`)
228228
- Bug in :class:`Timestamp` construction when passing datetime components as positional arguments and ``tzinfo`` as a keyword argument incorrectly raising (:issue:`31929`)
229229
- Bug in :meth:`Index.astype` when casting from object dtype to ``timedelta64[ns]`` dtype incorrectly casting ``np.datetime64("NaT")`` values to ``np.timedelta64("NaT")`` instead of raising (:issue:`45722`)
230+
- Bug in :meth:`SeriesGroupBy.value_counts` where the result index was missing the level name for a categorical column (:issue:`44324`)
230231
-
231232

232233
Timedelta

pandas/core/groupby/generic.py

+11-12
Original file line numberDiff line numberDiff line change
@@ -602,23 +602,23 @@ def value_counts(
602602
ids, _, _ = self.grouper.group_info
603603
val = self.obj._values
604604

605-
def apply_series_value_counts():
606-
return self.apply(
605+
names = self.grouper.names + [self.obj.name]
606+
607+
if is_categorical_dtype(val.dtype) or (
608+
bins is not None and not np.iterable(bins)
609+
):
610+
# scalar bins cannot be done at top level
611+
# in a backward compatible way
612+
# GH38672 relates to categorical dtype
613+
ser = self.apply(
607614
Series.value_counts,
608615
normalize=normalize,
609616
sort=sort,
610617
ascending=ascending,
611618
bins=bins,
612619
)
613-
614-
if bins is not None:
615-
if not np.iterable(bins):
616-
# scalar bins cannot be done at top level
617-
# in a backward compatible way
618-
return apply_series_value_counts()
619-
elif is_categorical_dtype(val.dtype):
620-
# GH38672
621-
return apply_series_value_counts()
620+
ser.index.names = names
621+
return ser
622622

623623
# groupby removes null keys from groupings
624624
mask = ids != -1
@@ -683,7 +683,6 @@ def apply_series_value_counts():
683683
levels = [ping.group_index for ping in self.grouper.groupings] + [
684684
lev # type: ignore[list-item]
685685
]
686-
names = self.grouper.names + [self.obj.name]
687686

688687
if dropna:
689688
mask = codes[-1] != -1

pandas/tests/groupby/test_categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ def test_sorting_with_different_categoricals():
271271
index = ["low", "med", "high", "low", "med", "high"]
272272
index = Categorical(index, categories=["low", "med", "high"], ordered=True)
273273
index = [["A", "A", "A", "B", "B", "B"], CategoricalIndex(index)]
274-
index = MultiIndex.from_arrays(index, names=["group", None])
274+
index = MultiIndex.from_arrays(index, names=["group", "dose"])
275275
expected = Series([2] * 6, index=index, name="dose")
276276
tm.assert_series_equal(result, expected)
277277

pandas/tests/groupby/test_value_counts.py

+20
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,26 @@
2222
import pandas._testing as tm
2323

2424

25+
def tests_value_counts_index_names_category_column():
26+
# GH44324 Missing name of index category column
27+
df = DataFrame(
28+
{
29+
"gender": ["female"],
30+
"country": ["US"],
31+
}
32+
)
33+
df["gender"] = df["gender"].astype("category")
34+
result = df.groupby("country")["gender"].value_counts()
35+
36+
# Construct expected, very specific multiindex
37+
df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"])
38+
df_mi_expected["gender"] = df_mi_expected["gender"].astype("category")
39+
mi_expected = MultiIndex.from_frame(df_mi_expected)
40+
expected = Series([1], index=mi_expected, name="gender")
41+
42+
tm.assert_series_equal(result, expected)
43+
44+
2545
# our starting frame
2646
def seed_df(seed_nans, n, m):
2747
np.random.seed(1234)

0 commit comments

Comments
 (0)