diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index c76555f9ef417..a24aae0855887 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -582,7 +582,8 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupBy.sample` raises ``ValueError`` when the object is empty (:issue:`48459`)
 - Bug in :meth:`Series.groupby` raises ``ValueError`` when an entry of the index is equal to the name of the index (:issue:`48567`)
 - Bug in :meth:`DataFrameGroupBy.resample` produces inconsistent results when passing empty DataFrame (:issue:`47705`)
--
+- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would not include unobserved categories in result when grouping by categorical indexes (:issue:`49354`)
+- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would change result order depending on the input index when grouping by categoricals (:issue:`49223`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py
index b11bbf35312c9..0a8e12caead1c 100644
--- a/pandas/core/groupby/categorical.py
+++ b/pandas/core/groupby/categorical.py
@@ -53,7 +53,7 @@ def recode_for_groupby(
         unique_codes = unique1d(c.codes)
 
         take_codes = unique_codes[unique_codes != -1]
-        if c.ordered:
+        if c.ordered or sort:
            take_codes = np.sort(take_codes)
 
         # we recode according to the uniques
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 1cc5e90f9a3a4..7da7ea119cea3 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -476,11 +476,15 @@ def __init__(
             # In extant tests, the new self.grouping_vector matches
             # `index.get_level_values(ilevel)` whenever
             # mapper is None and isinstance(index, MultiIndex)
+            if isinstance(index, MultiIndex):
+                index_level = index.get_level_values(ilevel)
+            else:
+                index_level = index
             (
                 self.grouping_vector,  # Index
                 self._codes,
                 self._group_index,
-            ) = index._get_grouper_for_level(mapper, level=ilevel, dropna=dropna)
+            ) = index_level._get_grouper_for_level(mapper, dropna=dropna)
 
         # a passed Grouper like, directly get the grouper in the same way
         # as single grouper groupby, use the group_info to get codes
@@ -504,15 +508,6 @@ def __init__(
                 # use Index instead of ndarray so we can recover the name
                 self.grouping_vector = Index(ng, name=newgrouper.result_index.name)
 
-        elif is_categorical_dtype(self.grouping_vector):
-            # a passed Categorical
-            self._passed_categorical = True
-
-            self._orig_cats = self.grouping_vector.categories
-            self.grouping_vector, self._all_grouper = recode_for_groupby(
-                self.grouping_vector, sort, observed
-            )
-
         elif not isinstance(
             self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
         ):
@@ -542,6 +537,14 @@ def __init__(
             # TODO 2022-10-08 we only have one test that gets here and
             #  values are already in nanoseconds in that case.
             self.grouping_vector = Series(self.grouping_vector).to_numpy()
+        elif is_categorical_dtype(self.grouping_vector):
+            # a passed Categorical
+            self._passed_categorical = True
+
+            self._orig_cats = self.grouping_vector.categories
+            self.grouping_vector, self._all_grouper = recode_for_groupby(
+                self.grouping_vector, sort, observed
+            )
 
     def __repr__(self) -> str:
         return f"Grouping({self.name})"
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 935c39af8af3a..3cdd87bd650a2 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -870,14 +870,20 @@ def test_apply_multi_level_name(category):
     b = [1, 2] * 5
     if category:
         b = pd.Categorical(b, categories=[1, 2, 3])
-        expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B")
+        expected_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3], name="B")
+        # GH#40669 - summing an empty frame gives float dtype
+        expected_values = [20.0, 25.0, 0.0]
     else:
         expected_index = Index([1, 2], name="B")
+        expected_values = [20, 25]
+    expected = DataFrame(
+        {"C": expected_values, "D": expected_values}, index=expected_index
+    )
+
     df = DataFrame(
         {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))}
     ).set_index(["A", "B"])
     result = df.groupby("B").apply(lambda x: x.sum())
-    expected = DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index)
     tm.assert_frame_equal(result, expected)
     assert df.index.names == ["A", "B"]
 
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 092fd4a4d6be0..f8c7cdf658ebf 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -487,6 +487,60 @@ def test_observed_groups(observed):
     tm.assert_dict_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "keys, expected_values, expected_index_levels",
+    [
+        ("a", [15, 9, 0], CategoricalIndex([1, 2, 3], name="a")),
+        (
+            ["a", "b"],
+            [7, 8, 0, 0, 0, 9, 0, 0, 0],
+            [CategoricalIndex([1, 2, 3], name="a"), Index([4, 5, 6])],
+        ),
+        (
+            ["a", "a2"],
+            [15, 0, 0, 0, 9, 0, 0, 0, 0],
+            [
+                CategoricalIndex([1, 2, 3], name="a"),
+                CategoricalIndex([1, 2, 3], name="a"),
+            ],
+        ),
+    ],
+)
+@pytest.mark.parametrize("test_series", [True, False])
+def test_unobserved_in_index(keys, expected_values, expected_index_levels, test_series):
+    # GH#49354 - ensure unobserved cats occur when grouping by index levels
+    df = DataFrame(
+        {
+            "a": Categorical([1, 1, 2], categories=[1, 2, 3]),
+            "a2": Categorical([1, 1, 2], categories=[1, 2, 3]),
+            "b": [4, 5, 6],
+            "c": [7, 8, 9],
+        }
+    ).set_index(["a", "a2"])
+    if "b" not in keys:
+        # Only keep b when it is used for grouping for consistent columns in the result
+        df = df.drop(columns="b")
+
+    gb = df.groupby(keys, observed=False)
+    if test_series:
+        gb = gb["c"]
+    result = gb.sum()
+
+    if len(keys) == 1:
+        index = expected_index_levels
+    else:
+        codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], 3 * [0, 1, 2]]
+        index = MultiIndex(
+            expected_index_levels,
+            codes=codes,
+            names=keys,
+        )
+    expected = DataFrame({"c": expected_values}, index=index)
+    if test_series:
+        expected = expected["c"]
+    tm.assert_equal(result, expected)
+
+
 def test_observed_groups_with_nan(observed):
     # GH 24740
     df = DataFrame(
@@ -1234,11 +1288,12 @@ def df_cat(df):
 
 @pytest.mark.parametrize("operation", ["agg", "apply"])
 def test_seriesgroupby_observed_true(df_cat, operation):
-    # GH 24880
-    lev_a = Index(["foo", "foo", "bar", "bar"], dtype=df_cat["A"].dtype, name="A")
-    lev_b = Index(["one", "two", "one", "three"], dtype=df_cat["B"].dtype, name="B")
"two", "one", "three"], dtype=df_cat["B"].dtype, name="B") + # GH#24880 + # GH#49223 - order of results was wrong when grouping by index levels + lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A") + lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B") index = MultiIndex.from_arrays([lev_a, lev_b]) - expected = Series(data=[1, 3, 2, 4], index=index, name="C") + expected = Series(data=[2, 4, 1, 3], index=index, name="C") grouped = df_cat.groupby(["A", "B"], observed=True)["C"] result = getattr(grouped, operation)(sum) @@ -1249,6 +1304,7 @@ def test_seriesgroupby_observed_true(df_cat, operation): @pytest.mark.parametrize("observed", [False, None]) def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): # GH 24880 + # GH#49223 - order of results was wrong when grouping by index levels index, _ = MultiIndex.from_product( [ CategoricalIndex(["bar", "foo"], ordered=False), @@ -1272,16 +1328,16 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): True, MultiIndex.from_arrays( [ - Index(["foo"] * 4 + ["bar"] * 4, dtype="category", name="A"), + Index(["bar"] * 4 + ["foo"] * 4, dtype="category", name="A"), Index( - ["one", "one", "two", "two", "one", "one", "three", "three"], + ["one", "one", "three", "three", "one", "one", "two", "two"], dtype="category", name="B", ), Index(["min", "max"] * 4), ] ), - [1, 1, 3, 3, 2, 2, 4, 4], + [2, 2, 4, 4, 1, 1, 3, 3], ), ( False, @@ -1857,7 +1913,7 @@ def test_category_order_reducer( if ( reduction_func in ("idxmax", "idxmin") and not observed - and index_kind == "range" + and index_kind != "multi" ): msg = "GH#10694 - idxmax/min fail with unused categories" request.node.add_marker(pytest.mark.xfail(reason=msg)) @@ -2005,10 +2061,13 @@ def test_category_order_apply(as_index, sort, observed, method, index_kind, orde @pytest.mark.parametrize("index_kind", ["range", "single", "multi"]) -def test_many_categories(as_index, sort, index_kind, ordered): +def test_many_categories(request, as_index, sort, index_kind, ordered): # GH#48749 - Test when the grouper has many categories if index_kind != "range" and not as_index: pytest.skip(reason="Result doesn't have categories, nothing to test") + if index_kind == "multi" and as_index and not sort and ordered: + msg = "GH#48749 - values are unsorted even though the Categorical is ordered" + request.node.add_marker(pytest.mark.xfail(reason=msg)) categories = np.arange(9999, -1, -1) grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered) df = DataFrame({"a": grouper, "b": range(4)}) @@ -2025,11 +2084,7 @@ def test_many_categories(as_index, sort, index_kind, ordered): result = gb.sum() # Test is setup so that data and index are the same values - # TODO: GH#49223 - Order of values should be the same for all index_kinds - if index_kind == "range": - data = [3, 2, 1] if ordered else [2, 1, 3] - else: - data = [3, 2, 1] if sort else [2, 1, 3] + data = [3, 2, 1] if sort or ordered else [2, 1, 3] index = CategoricalIndex( data, categories=grouper.categories, ordered=ordered, name="a"