From 32cf991d23f2c6bb08bb3b1f45280c9333d573c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Osorio=20L=C3=B3pez?= Date: Tue, 25 Jan 2022 17:14:16 -0500 Subject: [PATCH 1/6] BUG: SeriesGroupBy.value_counts index name missing Issue #44324 --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/groupby/generic.py | 7 +++++-- pandas/tests/groupby/test_value_counts.py | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4e73ea348dfde..61c60a1489694 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -220,6 +220,7 @@ Datetimelike - Bug in :meth:`DataFrame.quantile` with datetime-like dtypes and no rows incorrectly returning ``float64`` dtype instead of retaining datetime-like dtype (:issue:`41544`) - Bug in :func:`to_datetime` with sequences of ``np.str_`` objects incorrectly raising (:issue:`32264`) - Bug in :class:`Timestamp` construction when passing datetime components as positional arguments and ``tzinfo`` as a keyword argument incorrectly raising (:issue:`31929`) +- Bug in :meth:`SeriesGroupBy.value_counts` where the index name was missing when the grouped column has categorical dtype (:issue:`44324`) - Timedelta diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 175067b4b7c20..c0526dd736ed2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -602,14 +602,18 @@ def value_counts( ids, _, _ = self.grouper.group_info val = self.obj._values + names = self.grouper.names + [self.obj.name] + def apply_series_value_counts(): - return self.apply( + s = self.apply( Series.value_counts, normalize=normalize, sort=sort, ascending=ascending, bins=bins, ) + s.index.names = names + return s if bins is not None: if not np.iterable(bins): @@ -683,7 +687,6 @@ def apply_series_value_counts(): levels = [ping.group_index for ping in self.grouper.groupings] + [ lev # type: ignore[list-item] ] - names = self.grouper.names + 
[self.obj.name] if dropna: mask = codes[-1] != -1 diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 54f672cb69800..d38ba84cd1397 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -22,6 +22,26 @@ import pandas._testing as tm +def tests_value_counts_index_names_category_column(): + # GH44324 Missing name of index category column + df = DataFrame( + { + "gender": ["female"], + "country": ["US"], + } + ) + df["gender"] = df["gender"].astype("category") + result = df.groupby("country")["gender"].value_counts() + + # Construct expected, very specific multiindex + df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"]) + df_mi_expected["gender"] = df_mi_expected["gender"].astype("category") + mi_expected = MultiIndex.from_frame(df_mi_expected) + expected = Series([1], index=mi_expected, name="gender") + + tm.assert_series_equal(result, expected) + + # our starting frame def seed_df(seed_nans, n, m): np.random.seed(1234) From 8e8b37cdcde9b30d6cdfa9394c8531d5916bd193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Osorio=20L=C3=B3pez?= Date: Wed, 26 Jan 2022 11:10:34 -0500 Subject: [PATCH 2/6] TST: Change test to correct categorical naming Value counts tend to preserve index names #45625 Change test test_sorting_with_different_categoricals to comply with this change --- pandas/tests/groupby/test_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 3d5016b058c07..a77c3e122e42a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -271,7 +271,7 @@ def test_sorting_with_different_categoricals(): index = ["low", "med", "high", "low", "med", "high"] index = Categorical(index, categories=["low", "med", "high"], ordered=True) index = [["A", "A", "A", "B", "B", "B"], 
CategoricalIndex(index)] - index = MultiIndex.from_arrays(index, names=["group", None]) + index = MultiIndex.from_arrays(index, names=["group", "dose"]) expected = Series([2] * 6, index=index, name="dose") tm.assert_series_equal(result, expected) From 7692b368bf95093d8d5e40faef5868dc7b751f26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Osorio=20L=C3=B3pez?= Date: Mon, 31 Jan 2022 09:08:33 -0500 Subject: [PATCH 3/6] REF: Refactor conditionals in value_counts() --- pandas/core/groupby/generic.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c0526dd736ed2..33e491eee2f1a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -604,7 +604,10 @@ def value_counts( names = self.grouper.names + [self.obj.name] - def apply_series_value_counts(): + if is_categorical_dtype(val.dtype) or (bins and np.iterable(bins)): + # scalar bins cannot be done at top level + # in a backward compatible way + # GH38672 s = self.apply( Series.value_counts, normalize=normalize, @@ -615,15 +618,6 @@ def apply_series_value_counts(): s.index.names = names return s - if bins is not None: - if not np.iterable(bins): - # scalar bins cannot be done at top level - # in a backward compatible way - return apply_series_value_counts() - elif is_categorical_dtype(val.dtype): - # GH38672 - return apply_series_value_counts() - # groupby removes null keys from groupings mask = ids != -1 ids, val = ids[mask], val[mask] From d81d7b93cbe49c90215fc686bcab95e046b77a9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Osorio=20L=C3=B3pez?= Date: Mon, 31 Jan 2022 19:18:46 -0500 Subject: [PATCH 4/6] RFT: correct mistake introduced via RFT In line with 44324 --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index fe9b608e634e0..582bd8fb6e920 100644 --- 
a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -604,7 +604,7 @@ def value_counts( names = self.grouper.names + [self.obj.name] - if is_categorical_dtype(val.dtype) or (bins and np.iterable(bins)): + if is_categorical_dtype(val.dtype) or (bins and not np.iterable(bins)): # scalar bins cannot be done at top level # in a backward compatible way # GH38672 From 9d561b9e2756eb87535820cae6bcbca4b0f8329c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Osorio=20L=C3=B3pez?= Date: Tue, 1 Feb 2022 08:14:47 -0500 Subject: [PATCH 5/6] RFT: Change variable names and comment #38672 --- pandas/core/groupby/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 582bd8fb6e920..fdb08f3ef807d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -607,16 +607,16 @@ def value_counts( if is_categorical_dtype(val.dtype) or (bins and not np.iterable(bins)): # scalar bins cannot be done at top level # in a backward compatible way - # GH38672 - s = self.apply( + # GH38672 relates to categorical dtype + ser = self.apply( Series.value_counts, normalize=normalize, sort=sort, ascending=ascending, bins=bins, ) - s.index.names = names - return s + ser.index.names = names + return ser # groupby removes null keys from groupings mask = ids != -1 From 49a375d2740d92e460a55930d4535787a6ff4e42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Osorio=20L=C3=B3pez?= Date: Wed, 2 Feb 2022 19:23:20 -0500 Subject: [PATCH 6/6] BUG: Update conditional to `is not None` to consider Series bins --- pandas/core/groupby/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index fdb08f3ef807d..9d0305d0d1208 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -604,7 +604,9 @@ def value_counts( names = self.grouper.names + [self.obj.name] - if 
is_categorical_dtype(val.dtype) or (bins and not np.iterable(bins)): + if is_categorical_dtype(val.dtype) or ( + bins is not None and not np.iterable(bins) + ): # scalar bins cannot be done at top level # in a backward compatible way # GH38672 relates to categorical dtype