
Commit ee05885

CLN: BaseGrouper (#59034)
1 parent 8395f98 commit ee05885

File tree

4 files changed
+29 -52 lines changed


pandas/core/groupby/generic.py

+2-1
@@ -686,7 +686,8 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame:
         b 1
         dtype: int64
         """
-        ids, ngroups = self._grouper.group_info
+        ids = self._grouper.ids
+        ngroups = self._grouper.ngroups
         val = self.obj._values
         codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False)
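
An illustrative sketch (not part of the commit): after this change, `ids` and `ngroups` are read as two separate cached properties instead of being unpacked from the removed `group_info` tuple. The snippet below assumes a pandas build that includes this change, uses the internal `_grouper` attribute only for demonstration, and approximates the `nunique` body with the public `pd.factorize`:

import numpy as np
import pandas as pd

s = pd.Series([1, 1, 2, 3, 3], index=list("aabba"))
gb = s.groupby(level=0)

# New access pattern: two properties instead of the old group_info tuple.
ids = gb._grouper.ids          # intp group code per original row
ngroups = gb._grouper.ngroups  # number of groups

# Rough equivalent of the nunique body: factorize the values, then count
# distinct (group id, value code) pairs per group.
codes, _ = pd.factorize(s.to_numpy(), sort=False)
pairs = set(zip(ids.tolist(), codes.tolist()))
counts = np.bincount([g for g, _ in pairs], minlength=ngroups)

print(counts)                    # [2 2]
print(gb.nunique().to_numpy())   # matches: [2 2]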

pandas/core/groupby/groupby.py

+11-6
@@ -1360,7 +1360,7 @@ def _wrap_applied_output(

     @final
     def _numba_prep(self, data: DataFrame):
-        ids, ngroups = self._grouper.group_info
+        ngroups = self._grouper.ngroups
         sorted_index = self._grouper.result_ilocs
         sorted_ids = self._grouper._sorted_ids

@@ -1969,7 +1969,8 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
         this is currently implementing sort=False
         (though the default is sort=True) for groupby in general
         """
-        ids, ngroups = self._grouper.group_info
+        ids = self._grouper.ids
+        ngroups = self._grouper.ngroups
         sorter = get_group_index_sorter(ids, ngroups)
         ids, count = ids[sorter], len(ids)

@@ -2185,7 +2186,8 @@ def count(self) -> NDFrameT:
         Freq: MS, dtype: int64
         """
         data = self._get_data_to_aggregate()
-        ids, ngroups = self._grouper.group_info
+        ids = self._grouper.ids
+        ngroups = self._grouper.ngroups
         mask = ids != -1

         is_series = data.ndim == 1

@@ -3840,7 +3842,8 @@ def _fill(self, direction: Literal["ffill", "bfill"], limit: int | None = None):
         if limit is None:
             limit = -1

-        ids, ngroups = self._grouper.group_info
+        ids = self._grouper.ids
+        ngroups = self._grouper.ngroups

         col_func = partial(
             libgroupby.group_fillna_indexer,

@@ -4361,7 +4364,8 @@ def post_processor(
             qs = np.array([q], dtype=np.float64)
             pass_qs = None

-        ids, ngroups = self._grouper.group_info
+        ids = self._grouper.ids
+        ngroups = self._grouper.ngroups
         if self.dropna:
             # splitter drops NA groups, we need to do the same
             ids = ids[ids >= 0]

@@ -5038,7 +5042,8 @@ def shift(
             else:
                 if fill_value is lib.no_default:
                     fill_value = None
-                ids, ngroups = self._grouper.group_info
+                ids = self._grouper.ids
+                ngroups = self._grouper.ngroups
                 res_indexer = np.zeros(len(ids), dtype=np.int64)

                 libgroupby.group_shift_indexer(res_indexer, ids, ngroups, period)
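
For context, a self-contained sketch (not part of the commit) of what `ids` and `ngroups` feed into in `shift`: a per-row indexer pointing at the previous row of the same group, analogous to what the Cython `group_shift_indexer` computes. The `_grouper` access and the variable names below are only for illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": list("ababa"), "val": [10, 20, 30, 40, 50]})
gb = df.groupby("key")

ids = gb._grouper.ids          # group code per row
ngroups = gb._grouper.ngroups  # number of groups

# Pure-Python analogue of group_shift_indexer for period=1: for each row,
# the positional index of the previous row in the same group, or -1.
last_seen = np.full(ngroups, -1, dtype=np.int64)
res_indexer = np.empty(len(ids), dtype=np.int64)
for i, gid in enumerate(ids):
    res_indexer[i] = last_seen[gid]
    last_seen[gid] = i

vals = df["val"].to_numpy()
shifted = np.where(res_indexer >= 0, vals[res_indexer].astype(float), np.nan)

print(shifted)                         # [nan nan 10. 20. 30.]
print(gb["val"].shift(1).to_numpy())   # matches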

pandas/core/groupby/ops.py

+13-44
@@ -73,7 +73,6 @@
         Generator,
         Hashable,
         Iterator,
-        Sequence,
     )

     from pandas.core.generic import NDFrame

@@ -581,25 +580,21 @@ class BaseGrouper:
     def __init__(
         self,
         axis: Index,
-        groupings: Sequence[grouper.Grouping],
+        groupings: list[grouper.Grouping],
         sort: bool = True,
         dropna: bool = True,
     ) -> None:
         assert isinstance(axis, Index), axis

         self.axis = axis
-        self._groupings: list[grouper.Grouping] = list(groupings)
+        self._groupings = groupings
         self._sort = sort
         self.dropna = dropna

     @property
     def groupings(self) -> list[grouper.Grouping]:
         return self._groupings

-    @property
-    def shape(self) -> Shape:
-        return tuple(ping.ngroups for ping in self.groupings)
-
     def __iter__(self) -> Iterator[Hashable]:
         return iter(self.indices)

@@ -628,11 +623,15 @@ def _get_splitter(self, data: NDFrame) -> DataSplitter:
         -------
         Generator yielding subsetted objects
         """
-        ids, ngroups = self.group_info
-        return _get_splitter(
+        if isinstance(data, Series):
+            klass: type[DataSplitter] = SeriesSplitter
+        else:
+            # i.e. DataFrame
+            klass = FrameSplitter
+
+        return klass(
             data,
-            ids,
-            ngroups,
+            self.ngroups,
             sorted_ids=self._sorted_ids,
             sort_idx=self.result_ilocs,
         )

@@ -692,7 +691,8 @@ def size(self) -> Series:
         """
         Compute group sizes.
         """
-        ids, ngroups = self.group_info
+        ids = self.ids
+        ngroups = self.ngroups
         out: np.ndarray | list
         if ngroups:
             out = np.bincount(ids[ids != -1], minlength=ngroups)

@@ -729,12 +729,6 @@ def has_dropped_na(self) -> bool:
         """
         return bool((self.ids < 0).any())

-    @cache_readonly
-    def group_info(self) -> tuple[npt.NDArray[np.intp], int]:
-        result_index, ids = self.result_index_and_ids
-        ngroups = len(result_index)
-        return ids, ngroups
-
     @cache_readonly
     def codes_info(self) -> npt.NDArray[np.intp]:
         # return the codes of items in original grouped axis

@@ -1123,10 +1117,6 @@ def indices(self):
             i = bin
         return indices

-    @cache_readonly
-    def group_info(self) -> tuple[npt.NDArray[np.intp], int]:
-        return self.ids, self.ngroups
-
     @cache_readonly
     def codes(self) -> list[npt.NDArray[np.intp]]:
         return [self.ids]

@@ -1191,29 +1181,25 @@ class DataSplitter(Generic[NDFrameT]):
     def __init__(
         self,
         data: NDFrameT,
-        labels: npt.NDArray[np.intp],
         ngroups: int,
         *,
         sort_idx: npt.NDArray[np.intp],
         sorted_ids: npt.NDArray[np.intp],
     ) -> None:
         self.data = data
-        self.labels = ensure_platform_int(labels)  # _should_ already be np.intp
         self.ngroups = ngroups

         self._slabels = sorted_ids
         self._sort_idx = sort_idx

     def __iter__(self) -> Iterator:
-        sdata = self._sorted_data
-
         if self.ngroups == 0:
             # we are inside a generator, rather than raise StopIteration
             # we merely return signal the end
             return

         starts, ends = lib.generate_slices(self._slabels, self.ngroups)
-
+        sdata = self._sorted_data
         for start, end in zip(starts, ends):
             yield self._chop(sdata, slice(start, end))

@@ -1241,20 +1227,3 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
         mgr = sdata._mgr.get_slice(slice_obj, axis=1)
         df = sdata._constructor_from_mgr(mgr, axes=mgr.axes)
         return df.__finalize__(sdata, method="groupby")
-
-
-def _get_splitter(
-    data: NDFrame,
-    labels: npt.NDArray[np.intp],
-    ngroups: int,
-    *,
-    sort_idx: npt.NDArray[np.intp],
-    sorted_ids: npt.NDArray[np.intp],
-) -> DataSplitter:
-    if isinstance(data, Series):
-        klass: type[DataSplitter] = SeriesSplitter
-    else:
-        # i.e. DataFrame
-        klass = FrameSplitter
-
-    return klass(data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids)

pandas/tests/test_sorting.py

+3-1
@@ -104,7 +104,9 @@ def test_int64_overflow_groupby_large_df_shuffled(self, agg):
         gr = df.groupby(list("abcde"))

         # verify this is testing what it is supposed to test!
-        assert is_int64_overflow_possible(gr._grouper.shape)
+        assert is_int64_overflow_possible(
+            tuple(ping.ngroups for ping in gr._grouper.groupings)
+        )

         mi = MultiIndex.from_arrays(
             [ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)],
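
The removed `BaseGrouper.shape` property had this test as its only caller; the test now rebuilds the tuple from the groupings directly. A minimal sketch of the replacement expression (illustrative only; `is_int64_overflow_possible` and `_grouper` are pandas internals, and the small frame below is made up):

import pandas as pd
from pandas.core.sorting import is_int64_overflow_possible

df = pd.DataFrame({"a": [1, 2, 1], "b": [3, 4, 4], "x": [1.0, 2.0, 3.0]})
gr = df.groupby(["a", "b"])

# Replacement for the removed gr._grouper.shape property: the number of
# groups along each grouping level.
shape = tuple(ping.ngroups for ping in gr._grouper.groupings)

print(shape)                              # (2, 2)
print(is_int64_overflow_possible(shape))  # False for such a small product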
