Skip to content

Commit bd27a3e

Browse files
authored
DEPR: Some Grouper and Grouping attributes (#56149)
* DEPR: Some Grouper and Grouping attributes
* GH#
* GH#
* Rework _group_index
1 parent c07563e commit bd27a3e

File tree

8 files changed

+109
-30
lines changed

8 files changed

+109
-30
lines changed

doc/source/whatsnew/v2.2.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,8 @@ Other Deprecations
393393
- Deprecated strings ``H``, ``S``, ``U``, and ``N`` denoting units in :func:`to_timedelta` (:issue:`52536`)
394394
- Deprecated strings ``H``, ``T``, ``S``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`52536`)
395395
- Deprecated strings ``T``, ``S``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`52536`)
396+
- Deprecated the :class:`.BaseGrouper` attributes ``group_keys_seq`` and ``reconstructed_codes``; these will be removed in a future version of pandas (:issue:`56148`)
397+
- Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`)
396398
- Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
397399
- Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`)
398400
- Deprecated the ``ordinal`` keyword in :class:`PeriodIndex`, use :meth:`PeriodIndex.from_ordinals` instead (:issue:`55960`)

pandas/core/groupby/generic.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -819,9 +819,9 @@ def value_counts(
819819
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
820820

821821
# multi-index components
822-
codes = self.grouper.reconstructed_codes
822+
codes = self.grouper._reconstructed_codes
823823
codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
824-
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
824+
levels = [ping._group_index for ping in self.grouper.groupings] + [lev]
825825

826826
if dropna:
827827
mask = codes[-1] != -1

pandas/core/groupby/groupby.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -2820,7 +2820,7 @@ def _value_counts(
28202820
and not grouping._observed
28212821
for grouping in groupings
28222822
):
2823-
levels_list = [ping.result_index for ping in groupings]
2823+
levels_list = [ping._result_index for ping in groupings]
28242824
multi_index = MultiIndex.from_product(
28252825
levels_list, names=[ping.name for ping in groupings]
28262826
)
@@ -5573,7 +5573,7 @@ def _reindex_output(
55735573
):
55745574
return output
55755575

5576-
levels_list = [ping.group_index for ping in groupings]
5576+
levels_list = [ping._group_index for ping in groupings]
55775577
names = self.grouper.names
55785578
if qs is not None:
55795579
# error: Argument 1 to "append" of "list" has incompatible type
@@ -5795,7 +5795,7 @@ def _idxmax_idxmin(
57955795
ping._passed_categorical for ping in self.grouper.groupings
57965796
):
57975797
expected_len = np.prod(
5798-
[len(ping.group_index) for ping in self.grouper.groupings]
5798+
[len(ping._group_index) for ping in self.grouper.groupings]
57995799
)
58005800
if len(self.grouper.groupings) == 1:
58015801
result_len = len(self.grouper.groupings[0].grouping_vector.unique())

pandas/core/groupby/grouper.py

+43-10
Original file line numberDiff line numberDiff line change
@@ -523,7 +523,6 @@ class Grouping:
523523
"""
524524

525525
_codes: npt.NDArray[np.signedinteger] | None = None
526-
_group_index: Index | None = None
527526
_all_grouper: Categorical | None
528527
_orig_cats: Index | None
529528
_index: Index
@@ -679,7 +678,7 @@ def _ilevel(self) -> int | None:
679678

680679
@property
681680
def ngroups(self) -> int:
682-
return len(self.group_index)
681+
return len(self._group_index)
683682

684683
@cache_readonly
685684
def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
@@ -695,34 +694,58 @@ def codes(self) -> npt.NDArray[np.signedinteger]:
695694
return self._codes_and_uniques[0]
696695

697696
@cache_readonly
698-
def group_arraylike(self) -> ArrayLike:
697+
def _group_arraylike(self) -> ArrayLike:
699698
"""
700699
Analogous to result_index, but holding an ArrayLike to ensure
701700
we can retain ExtensionDtypes.
702701
"""
703702
if self._all_grouper is not None:
704703
# retain dtype for categories, including unobserved ones
705-
return self.result_index._values
704+
return self._result_index._values
706705

707706
elif self._passed_categorical:
708-
return self.group_index._values
707+
return self._group_index._values
709708

710709
return self._codes_and_uniques[1]
711710

711+
@property
712+
def group_arraylike(self) -> ArrayLike:
713+
"""
714+
Analogous to result_index, but holding an ArrayLike to ensure
715+
we can retain ExtensionDtypes.
716+
"""
717+
warnings.warn(
718+
"group_arraylike is deprecated and will be removed in a future "
719+
"version of pandas",
720+
category=FutureWarning,
721+
stacklevel=find_stack_level(),
722+
)
723+
return self._group_arraylike
724+
712725
@cache_readonly
713-
def result_index(self) -> Index:
726+
def _result_index(self) -> Index:
714727
# result_index retains dtype for categories, including unobserved ones,
715728
# which group_index does not
716729
if self._all_grouper is not None:
717-
group_idx = self.group_index
730+
group_idx = self._group_index
718731
assert isinstance(group_idx, CategoricalIndex)
719732
cats = self._orig_cats
720733
# set_categories is dynamically added
721734
return group_idx.set_categories(cats) # type: ignore[attr-defined]
722-
return self.group_index
735+
return self._group_index
736+
737+
@property
738+
def result_index(self) -> Index:
739+
warnings.warn(
740+
"result_index is deprecated and will be removed in a future "
741+
"version of pandas",
742+
category=FutureWarning,
743+
stacklevel=find_stack_level(),
744+
)
745+
return self._result_index
723746

724747
@cache_readonly
725-
def group_index(self) -> Index:
748+
def _group_index(self) -> Index:
726749
codes, uniques = self._codes_and_uniques
727750
if not self._dropna and self._passed_categorical:
728751
assert isinstance(uniques, Categorical)
@@ -744,6 +767,16 @@ def group_index(self) -> Index:
744767
)
745768
return Index._with_infer(uniques, name=self.name)
746769

770+
@property
771+
def group_index(self) -> Index:
772+
warnings.warn(
773+
"group_index is deprecated and will be removed in a future "
774+
"version of pandas",
775+
category=FutureWarning,
776+
stacklevel=find_stack_level(),
777+
)
778+
return self._group_index
779+
747780
@cache_readonly
748781
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
749782
uniques: ArrayLike
@@ -809,7 +842,7 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
809842

810843
@cache_readonly
811844
def groups(self) -> dict[Hashable, np.ndarray]:
812-
cats = Categorical.from_codes(self.codes, self.group_index, validate=False)
845+
cats = Categorical.from_codes(self.codes, self._group_index, validate=False)
813846
return self._index.groupby(cats)
814847

815848

pandas/core/groupby/ops.py

+36-14
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
Generic,
1616
final,
1717
)
18+
import warnings
1819

1920
import numpy as np
2021

@@ -32,6 +33,7 @@
3233
)
3334
from pandas.errors import AbstractMethodError
3435
from pandas.util._decorators import cache_readonly
36+
from pandas.util._exceptions import find_stack_level
3537

3638
from pandas.core.dtypes.base import ExtensionDtype
3739
from pandas.core.dtypes.cast import (
@@ -616,7 +618,7 @@ def get_iterator(
616618
for each group
617619
"""
618620
splitter = self._get_splitter(data, axis=axis)
619-
keys = self.group_keys_seq
621+
keys = self._group_keys_seq
620622
yield from zip(keys, splitter)
621623

622624
@final
@@ -638,7 +640,7 @@ def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter:
638640

639641
@final
640642
@cache_readonly
641-
def group_keys_seq(self):
643+
def _group_keys_seq(self):
642644
if len(self.groupings) == 1:
643645
return self.levels[0]
644646
else:
@@ -647,14 +649,24 @@ def group_keys_seq(self):
647649
# provide "flattened" iterator for multi-group setting
648650
return get_flattened_list(ids, ngroups, self.levels, self.codes)
649651

652+
@property
653+
def group_keys_seq(self):
654+
warnings.warn(
655+
"group_keys_seq is deprecated and will be removed in a future "
656+
"version of pandas",
657+
category=FutureWarning,
658+
stacklevel=find_stack_level(),
659+
)
660+
return self._group_keys_seq
661+
650662
@cache_readonly
651663
def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
652664
"""dict {group name -> group indices}"""
653665
if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex):
654666
# This shows unused categories in indices GH#38642
655667
return self.groupings[0].indices
656668
codes_list = [ping.codes for ping in self.groupings]
657-
keys = [ping.group_index for ping in self.groupings]
669+
keys = [ping._group_index for ping in self.groupings]
658670
return get_indexer_dict(codes_list, keys)
659671

660672
@final
@@ -691,7 +703,7 @@ def codes(self) -> list[npt.NDArray[np.signedinteger]]:
691703

692704
@property
693705
def levels(self) -> list[Index]:
694-
return [ping.group_index for ping in self.groupings]
706+
return [ping._group_index for ping in self.groupings]
695707

696708
@property
697709
def names(self) -> list[Hashable]:
@@ -766,26 +778,36 @@ def _get_compressed_codes(
766778
# FIXME: compress_group_index's second return value is int64, not intp
767779

768780
ping = self.groupings[0]
769-
return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)
781+
return ping.codes, np.arange(len(ping._group_index), dtype=np.intp)
770782

771783
@final
772784
@cache_readonly
773785
def ngroups(self) -> int:
774786
return len(self.result_index)
775787

776788
@property
777-
def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
789+
def _reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
778790
codes = self.codes
779791
ids, obs_ids, _ = self.group_info
780792
return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)
781793

794+
@property
795+
def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
796+
warnings.warn(
797+
"reconstructed_codes is deprecated and will be removed in a future "
798+
"version of pandas",
799+
category=FutureWarning,
800+
stacklevel=find_stack_level(),
801+
)
802+
return self._reconstructed_codes
803+
782804
@cache_readonly
783805
def result_index(self) -> Index:
784806
if len(self.groupings) == 1:
785-
return self.groupings[0].result_index.rename(self.names[0])
807+
return self.groupings[0]._result_index.rename(self.names[0])
786808

787-
codes = self.reconstructed_codes
788-
levels = [ping.result_index for ping in self.groupings]
809+
codes = self._reconstructed_codes
810+
levels = [ping._result_index for ping in self.groupings]
789811
return MultiIndex(
790812
levels=levels, codes=codes, verify_integrity=False, names=self.names
791813
)
@@ -795,12 +817,12 @@ def get_group_levels(self) -> list[ArrayLike]:
795817
# Note: only called from _insert_inaxis_grouper, which
796818
# is only called for BaseGrouper, never for BinGrouper
797819
if len(self.groupings) == 1:
798-
return [self.groupings[0].group_arraylike]
820+
return [self.groupings[0]._group_arraylike]
799821

800822
name_list = []
801-
for ping, codes in zip(self.groupings, self.reconstructed_codes):
823+
for ping, codes in zip(self.groupings, self._reconstructed_codes):
802824
codes = ensure_platform_int(codes)
803-
levels = ping.group_arraylike.take(codes)
825+
levels = ping._group_arraylike.take(codes)
804826

805827
name_list.append(levels)
806828

@@ -907,7 +929,7 @@ def apply_groupwise(
907929
) -> tuple[list, bool]:
908930
mutated = False
909931
splitter = self._get_splitter(data, axis=axis)
910-
group_keys = self.group_keys_seq
932+
group_keys = self._group_keys_seq
911933
result_values = []
912934

913935
# This calls DataSplitter.__iter__
@@ -1087,7 +1109,7 @@ def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
10871109
)
10881110

10891111
@cache_readonly
1090-
def reconstructed_codes(self) -> list[np.ndarray]:
1112+
def _reconstructed_codes(self) -> list[np.ndarray]:
10911113
# get unique result indices, and prepend 0 as groupby starts from the first
10921114
return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]]
10931115

pandas/tests/groupby/test_groupby.py

+10
Original file line numberDiff line numberDiff line change
@@ -3303,3 +3303,13 @@ def test_groupby_ffill_with_duplicated_index():
33033303
result = df.groupby(level=0).ffill()
33043304
expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2])
33053305
tm.assert_frame_equal(result, expected, check_dtype=False)
3306+
3307+
3308+
@pytest.mark.parametrize("attr", ["group_keys_seq", "reconstructed_codes"])
3309+
def test_depr_grouper_attrs(attr):
3310+
# GH#56148
3311+
df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
3312+
gb = df.groupby("a")
3313+
msg = f"{attr} is deprecated"
3314+
with tm.assert_produces_warning(FutureWarning, match=msg):
3315+
getattr(gb.grouper, attr)

pandas/tests/groupby/test_grouping.py

+10
Original file line numberDiff line numberDiff line change
@@ -1211,3 +1211,13 @@ def test_grouper_groups():
12111211
msg = "Grouper.indexer is deprecated"
12121212
with tm.assert_produces_warning(FutureWarning, match=msg):
12131213
grper.indexer
1214+
1215+
1216+
@pytest.mark.parametrize("attr", ["group_index", "result_index", "group_arraylike"])
1217+
def test_depr_grouping_attrs(attr):
1218+
# GH#56148
1219+
df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
1220+
gb = df.groupby("a")
1221+
msg = f"{attr} is deprecated"
1222+
with tm.assert_produces_warning(FutureWarning, match=msg):
1223+
getattr(gb.grouper.groupings[0], attr)

pandas/tests/groupby/test_timegrouper.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,9 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
6767
gb = df.groupby(tdg)
6868

6969
# check we're testing the case we're interested in
70-
assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
70+
msg = "group_keys_seq is deprecated"
71+
with tm.assert_produces_warning(FutureWarning, match=msg):
72+
assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
7173

7274
return gb
7375

0 commit comments

Comments
 (0)