DEPR: Some Grouper and Grouping attributes (#56149)

rhshadrach · web-flow · commit bd27a3e022c2 · 2023-11-25T20:35:43.000-08:00
* DEPR: Some Grouper and Grouping attributes

* GH#

* GH#

* Rework _group_index
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -393,6 +393,8 @@ Other Deprecations
 - Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
 - Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`)
 - Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`)
+- Deprecated the :class:`.BaseGrouper` attributes ``group_keys_seq`` and ``reconstructed_codes``; these will be removed in a future version of pandas (:issue:`56148`)
+- Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`)
 - Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
 - Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`)
 - Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`)
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -819,9 +819,9 @@ def value_counts(
         rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
 
         # multi-index components
-        codes = self.grouper.reconstructed_codes
+        codes = self.grouper._reconstructed_codes
         codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
-        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
+        levels = [ping._group_index for ping in self.grouper.groupings] + [lev]
 
         if dropna:
             mask = codes[-1] != -1
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -2820,7 +2820,7 @@ def _value_counts(
             and not grouping._observed
             for grouping in groupings
         ):
-            levels_list = [ping.result_index for ping in groupings]
+            levels_list = [ping._result_index for ping in groupings]
             multi_index = MultiIndex.from_product(
                 levels_list, names=[ping.name for ping in groupings]
             )
@@ -5573,7 +5573,7 @@ def _reindex_output(
         ):
             return output
 
-        levels_list = [ping.group_index for ping in groupings]
+        levels_list = [ping._group_index for ping in groupings]
         names = self.grouper.names
         if qs is not None:
             # error: Argument 1 to "append" of "list" has incompatible type
@@ -5795,7 +5795,7 @@ def _idxmax_idxmin(
             ping._passed_categorical for ping in self.grouper.groupings
         ):
             expected_len = np.prod(
-                [len(ping.group_index) for ping in self.grouper.groupings]
+                [len(ping._group_index) for ping in self.grouper.groupings]
             )
             if len(self.grouper.groupings) == 1:
                 result_len = len(self.grouper.groupings[0].grouping_vector.unique())
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -523,7 +523,6 @@ class Grouping:
     """
 
     _codes: npt.NDArray[np.signedinteger] | None = None
-    _group_index: Index | None = None
     _all_grouper: Categorical | None
     _orig_cats: Index | None
     _index: Index
@@ -679,7 +678,7 @@ def _ilevel(self) -> int | None:
 
     @property
     def ngroups(self) -> int:
-        return len(self.group_index)
+        return len(self._group_index)
 
     @cache_readonly
     def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
@@ -695,34 +694,58 @@ def codes(self) -> npt.NDArray[np.signedinteger]:
         return self._codes_and_uniques[0]
 
     @cache_readonly
-    def group_arraylike(self) -> ArrayLike:
+    def _group_arraylike(self) -> ArrayLike:
         """
         Analogous to result_index, but holding an ArrayLike to ensure
         we can retain ExtensionDtypes.
         """
         if self._all_grouper is not None:
             # retain dtype for categories, including unobserved ones
-            return self.result_index._values
+            return self._result_index._values
 
         elif self._passed_categorical:
-            return self.group_index._values
+            return self._group_index._values
 
         return self._codes_and_uniques[1]
 
+    @property
+    def group_arraylike(self) -> ArrayLike:
+        """
+        Analogous to result_index, but holding an ArrayLike to ensure
+        we can retain ExtensionDtypes.
+        """
+        warnings.warn(
+            "group_arraylike is deprecated and will be removed in a future "
+            "version of pandas",
+            category=FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+        return self._group_arraylike
+
     @cache_readonly
-    def result_index(self) -> Index:
+    def _result_index(self) -> Index:
         # result_index retains dtype for categories, including unobserved ones,
         #  which group_index does not
         if self._all_grouper is not None:
-            group_idx = self.group_index
+            group_idx = self._group_index
             assert isinstance(group_idx, CategoricalIndex)
             cats = self._orig_cats
             # set_categories is dynamically added
             return group_idx.set_categories(cats)  # type: ignore[attr-defined]
-        return self.group_index
+        return self._group_index
+
+    @property
+    def result_index(self) -> Index:
+        warnings.warn(
+            "result_index is deprecated and will be removed in a future "
+            "version of pandas",
+            category=FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+        return self._result_index
 
     @cache_readonly
-    def group_index(self) -> Index:
+    def _group_index(self) -> Index:
         codes, uniques = self._codes_and_uniques
         if not self._dropna and self._passed_categorical:
             assert isinstance(uniques, Categorical)
@@ -744,6 +767,16 @@ def group_index(self) -> Index:
                     )
         return Index._with_infer(uniques, name=self.name)
 
+    @property
+    def group_index(self) -> Index:
+        warnings.warn(
+            "group_index is deprecated and will be removed in a future "
+            "version of pandas",
+            category=FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+        return self._group_index
+
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
         uniques: ArrayLike
@@ -809,7 +842,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
 
     @cache_readonly
     def groups(self) -> dict[Hashable, np.ndarray]:
-        cats = Categorical.from_codes(self.codes, self.group_index, validate=False)
+        cats = Categorical.from_codes(self.codes, self._group_index, validate=False)
         return self._index.groupby(cats)
 
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -15,6 +15,7 @@
     Generic,
     final,
 )
+import warnings
 
 import numpy as np
 
@@ -32,6 +33,7 @@
 )
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly
+from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.cast import (
@@ -616,7 +618,7 @@ def get_iterator(
         for each group
         """
         splitter = self._get_splitter(data, axis=axis)
-        keys = self.group_keys_seq
+        keys = self._group_keys_seq
         yield from zip(keys, splitter)
 
     @final
@@ -638,7 +640,7 @@ def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter:
 
     @final
     @cache_readonly
-    def group_keys_seq(self):
+    def _group_keys_seq(self):
         if len(self.groupings) == 1:
             return self.levels[0]
         else:
@@ -647,14 +649,24 @@ def group_keys_seq(self):
             # provide "flattened" iterator for multi-group setting
             return get_flattened_list(ids, ngroups, self.levels, self.codes)
 
+    @property
+    def group_keys_seq(self):
+        warnings.warn(
+            "group_keys_seq is deprecated and will be removed in a future "
+            "version of pandas",
+            category=FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+        return self._group_keys_seq
+
     @cache_readonly
     def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
         """dict {group name -> group indices}"""
         if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex):
             # This shows unused categories in indices GH#38642
             return self.groupings[0].indices
         codes_list = [ping.codes for ping in self.groupings]
-        keys = [ping.group_index for ping in self.groupings]
+        keys = [ping._group_index for ping in self.groupings]
         return get_indexer_dict(codes_list, keys)
 
     @final
@@ -691,7 +703,7 @@ def codes(self) -> list[npt.NDArray[np.signedinteger]]:
 
     @property
     def levels(self) -> list[Index]:
-        return [ping.group_index for ping in self.groupings]
+        return [ping._group_index for ping in self.groupings]
 
     @property
     def names(self) -> list[Hashable]:
@@ -766,26 +778,36 @@ def _get_compressed_codes(
             # FIXME: compress_group_index's second return value is int64, not intp
 
         ping = self.groupings[0]
-        return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)
+        return ping.codes, np.arange(len(ping._group_index), dtype=np.intp)
 
     @final
     @cache_readonly
     def ngroups(self) -> int:
         return len(self.result_index)
 
     @property
-    def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
+    def _reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
         codes = self.codes
         ids, obs_ids, _ = self.group_info
         return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)
 
+    @property
+    def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
+        warnings.warn(
+            "reconstructed_codes is deprecated and will be removed in a future "
+            "version of pandas",
+            category=FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+        return self._reconstructed_codes
+
     @cache_readonly
     def result_index(self) -> Index:
         if len(self.groupings) == 1:
-            return self.groupings[0].result_index.rename(self.names[0])
+            return self.groupings[0]._result_index.rename(self.names[0])
 
-        codes = self.reconstructed_codes
-        levels = [ping.result_index for ping in self.groupings]
+        codes = self._reconstructed_codes
+        levels = [ping._result_index for ping in self.groupings]
         return MultiIndex(
             levels=levels, codes=codes, verify_integrity=False, names=self.names
         )
@@ -795,12 +817,12 @@ def get_group_levels(self) -> list[ArrayLike]:
         # Note: only called from _insert_inaxis_grouper, which
         #  is only called for BaseGrouper, never for BinGrouper
         if len(self.groupings) == 1:
-            return [self.groupings[0].group_arraylike]
+            return [self.groupings[0]._group_arraylike]
 
         name_list = []
-        for ping, codes in zip(self.groupings, self.reconstructed_codes):
+        for ping, codes in zip(self.groupings, self._reconstructed_codes):
             codes = ensure_platform_int(codes)
-            levels = ping.group_arraylike.take(codes)
+            levels = ping._group_arraylike.take(codes)
 
             name_list.append(levels)
 
@@ -907,7 +929,7 @@ def apply_groupwise(
     ) -> tuple[list, bool]:
         mutated = False
         splitter = self._get_splitter(data, axis=axis)
-        group_keys = self.group_keys_seq
+        group_keys = self._group_keys_seq
         result_values = []
 
         # This calls DataSplitter.__iter__
@@ -1087,7 +1109,7 @@ def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
         )
 
     @cache_readonly
-    def reconstructed_codes(self) -> list[np.ndarray]:
+    def _reconstructed_codes(self) -> list[np.ndarray]:
         # get unique result indices, and prepend 0 as groupby starts from the first
         return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]]
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -3303,3 +3303,13 @@ def test_groupby_ffill_with_duplicated_index():
     result = df.groupby(level=0).ffill()
     expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2])
     tm.assert_frame_equal(result, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize("attr", ["group_keys_seq", "reconstructed_codes"])
+def test_depr_grouper_attrs(attr):
+    # GH#56148
+    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
+    gb = df.groupby("a")
+    msg = f"{attr} is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        getattr(gb.grouper, attr)
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
@@ -1211,3 +1211,13 @@ def test_grouper_groups():
     msg = "Grouper.indexer is deprecated"
     with tm.assert_produces_warning(FutureWarning, match=msg):
         grper.indexer
+
+
+@pytest.mark.parametrize("attr", ["group_index", "result_index", "group_arraylike"])
+def test_depr_grouping_attrs(attr):
+    # GH#56148
+    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
+    gb = df.groupby("a")
+    msg = f"{attr} is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        getattr(gb.grouper.groupings[0], attr)
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
@@ -67,7 +67,9 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
     gb = df.groupby(tdg)
 
     # check we're testing the case we're interested in
-    assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
+    msg = "group_keys_seq is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
 
     return gb