Skip to content

Commit 1a34d8f

Browse files
jbrockmendel authored and
TLouf committed
REF: make Grouping less stateful (pandas-dev#41529)
1 parent d9e39f7 commit 1a34d8f

File tree

2 files changed

+47
-35
lines changed

2 files changed

+47
-35
lines changed

pandas/core/groupby/grouper.py

+33-34
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
import numpy as np
1111

1212
from pandas._typing import (
13+
ArrayLike,
1314
FrameOrSeries,
1415
final,
1516
)
@@ -587,20 +588,23 @@ def indices(self):
587588

588589
@property
589590
def codes(self) -> np.ndarray:
590-
if self._passed_categorical:
591-
# we make a CategoricalIndex out of the cat grouper
592-
# preserving the categories / ordered attributes
593-
cat = self.grouper
594-
return cat.codes
591+
if self._codes is not None:
592+
# _codes is set in __init__ for MultiIndex cases
593+
return self._codes
595594

596-
if self._codes is None:
597-
self._make_codes()
598-
# error: Incompatible return value type (got "Optional[ndarray]",
599-
# expected "ndarray")
600-
return self._codes # type: ignore[return-value]
595+
return self._codes_and_uniques[0]
596+
597+
@cache_readonly
598+
def group_arraylike(self) -> ArrayLike:
599+
"""
600+
Analogous to result_index, but holding an ArrayLike to ensure
601+
we can retain ExtensionDtypes.
602+
"""
603+
return self._codes_and_uniques[1]
601604

602605
@cache_readonly
603606
def result_index(self) -> Index:
607+
# TODO: what's the difference between result_index vs group_index?
604608
if self.all_grouper is not None:
605609
group_idx = self.group_index
606610
assert isinstance(group_idx, CategoricalIndex)
@@ -609,40 +613,37 @@ def result_index(self) -> Index:
609613

610614
@cache_readonly
611615
def group_index(self) -> Index:
616+
if self._group_index is not None:
617+
# _group_index is set in __init__ for MultiIndex cases
618+
return self._group_index
619+
uniques = self.group_arraylike
620+
return Index(uniques, name=self.name)
621+
622+
@cache_readonly
623+
def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
612624
if self._passed_categorical:
613625
# we make a CategoricalIndex out of the cat grouper
614626
# preserving the categories / ordered attributes
615627
cat = self.grouper
616628
categories = cat.categories
617629

618630
if self.observed:
619-
codes = algorithms.unique1d(cat.codes)
620-
codes = codes[codes != -1]
631+
ucodes = algorithms.unique1d(cat.codes)
632+
ucodes = ucodes[ucodes != -1]
621633
if self.sort or cat.ordered:
622-
codes = np.sort(codes)
634+
ucodes = np.sort(ucodes)
623635
else:
624-
codes = np.arange(len(categories))
636+
ucodes = np.arange(len(categories))
625637

626-
return CategoricalIndex(
627-
Categorical.from_codes(
628-
codes=codes, categories=categories, ordered=cat.ordered
629-
),
630-
name=self.name,
638+
uniques = Categorical.from_codes(
639+
codes=ucodes, categories=categories, ordered=cat.ordered
631640
)
641+
return cat.codes, uniques
632642

633-
if self._group_index is None:
634-
self._make_codes()
635-
assert self._group_index is not None
636-
return self._group_index
637-
638-
def _make_codes(self) -> None:
639-
if self._codes is not None and self._group_index is not None:
640-
return
641-
642-
# we have a list of groupers
643-
if isinstance(self.grouper, ops.BaseGrouper):
643+
elif isinstance(self.grouper, ops.BaseGrouper):
644+
# we have a list of groupers
644645
codes = self.grouper.codes_info
645-
uniques = self.grouper.result_index
646+
uniques = self.grouper.result_arraylike
646647
else:
647648
# GH35667, replace dropna=False with na_sentinel=None
648649
if not self.dropna:
@@ -652,9 +653,7 @@ def _make_codes(self) -> None:
652653
codes, uniques = algorithms.factorize(
653654
self.grouper, sort=self.sort, na_sentinel=na_sentinel
654655
)
655-
uniques = Index(uniques, name=self.name)
656-
self._codes = codes
657-
self._group_index = uniques
656+
return codes, uniques
658657

659658
@cache_readonly
660659
def groups(self) -> dict[Hashable, np.ndarray]:

pandas/core/groupby/ops.py

+14-1
Original file line number | Diff line number | Diff line change
@@ -907,6 +907,19 @@ def reconstructed_codes(self) -> list[np.ndarray]:
907907
ids, obs_ids, _ = self.group_info
908908
return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)
909909

910+
@cache_readonly
911+
def result_arraylike(self) -> ArrayLike:
912+
"""
913+
Analogous to result_index, but returning an ndarray/ExtensionArray
914+
allowing us to retain ExtensionDtypes not supported by Index.
915+
"""
916+
# TODO: once Index supports arbitrary EAs, this can be removed in favor
917+
# of result_index
918+
if len(self.groupings) == 1:
919+
return self.groupings[0].group_arraylike
920+
921+
return self.result_index._values
922+
910923
@cache_readonly
911924
def result_index(self) -> Index:
912925
if len(self.groupings) == 1:
@@ -919,7 +932,7 @@ def result_index(self) -> Index:
919932
)
920933

921934
@final
922-
def get_group_levels(self) -> list[Index]:
935+
def get_group_levels(self) -> list[ArrayLike]:
923936
# Note: only called from _insert_inaxis_grouper_inplace, which
924937
# is only called for BaseGrouper, never for BinGrouper
925938
if len(self.groupings) == 1:

0 commit comments

Comments
 (0)