Skip to content

Commit 3bb6568

Browse files
rhshadrach and pmhatre1
authored and committed
REF: Compute complete result_index upfront in groupby (pandas-dev#55738)
* REF: Compute correct result_index upfront in groupby * Refinements * Refinements * Refinements * Restore inferring index dtype * Test fixups * Refinements * Refinements * fixup * fixup * fixup * Fix sorting and non-sorting * Cleanup * Call ensure_plantform_int last * fixup * fixup * REF: Compute correct result_index upfront in groupby * Add test * Remove test * Move unobserved to the end * cleanup * cleanup * cleanup * Merge fixup * fixup * fixup * Fixup and test * whatsnew * type ignore * Refactor & type annotations * Better bikeshed
1 parent fb148c5 commit 3bb6568

File tree

14 files changed

+283
-421
lines changed

14 files changed

+283
-421
lines changed

doc/source/whatsnew/v3.0.0.rst

+5
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,11 @@ Groupby/resample/rolling
212212
^^^^^^^^^^^^^^^^^^^^^^^^
213213
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
214214
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
215+
- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby arguments ``dropna`` and ``sort`` (:issue:`55919`, :issue:`56966`, :issue:`56851`)
216+
- Bug in :meth:`.DataFrameGroupBy.nunique` and :meth:`.SeriesGroupBy.nunique` would fail with multiple categorical groupings when ``as_index=False`` (:issue:`52848`)
217+
- Bug in :meth:`.DataFrameGroupBy.prod`, :meth:`.DataFrameGroupBy.any`, and :meth:`.DataFrameGroupBy.all` would result in NA values on unobserved groups; they now result in ``1``, ``False``, and ``True`` respectively (:issue:`55783`)
218+
- Bug in :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`)
219+
-
215220

216221
Reshaping
217222
^^^^^^^^^

pandas/core/groupby/generic.py

+17-9
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,6 @@ def _wrap_applied_output(
411411
# GH #823 #24880
412412
index = self._grouper.result_index
413413
res_df = self.obj._constructor_expanddim(values, index=index)
414-
res_df = self._reindex_output(res_df)
415414
# if self.observed is False,
416415
# keep all-NaN rows created while re-indexing
417416
res_ser = res_df.stack(future_stack=True)
@@ -437,7 +436,7 @@ def _wrap_applied_output(
437436
if not self.as_index:
438437
result = self._insert_inaxis_grouper(result)
439438
result.index = default_index(len(result))
440-
return self._reindex_output(result)
439+
return result
441440

442441
def _aggregate_named(self, func, *args, **kwargs):
443442
# Note: this is very similar to _aggregate_series_pure_python,
@@ -658,7 +657,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame:
658657
2023-02-01 1
659658
Freq: MS, dtype: int64
660659
"""
661-
ids, _, ngroups = self._grouper.group_info
660+
ids, ngroups = self._grouper.group_info
662661
val = self.obj._values
663662
codes, uniques = algorithms.factorize(val, use_na_sentinel=dropna, sort=False)
664663

@@ -691,7 +690,7 @@ def nunique(self, dropna: bool = True) -> Series | DataFrame:
691690
if not self.as_index:
692691
result = self._insert_inaxis_grouper(result)
693692
result.index = default_index(len(result))
694-
return self._reindex_output(result, fill_value=0)
693+
return result
695694

696695
@doc(Series.describe)
697696
def describe(self, percentiles=None, include=None, exclude=None) -> Series:
@@ -719,7 +718,7 @@ def value_counts(
719718
from pandas.core.reshape.merge import get_join_indexers
720719
from pandas.core.reshape.tile import cut
721720

722-
ids, _, _ = self._grouper.group_info
721+
ids, _ = self._grouper.group_info
723722
val = self.obj._values
724723

725724
index_names = self._grouper.names + [self.obj.name]
@@ -789,9 +788,18 @@ def value_counts(
789788
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
790789

791790
# multi-index components
792-
codes = self._grouper.reconstructed_codes
791+
if isinstance(self._grouper.result_index, MultiIndex):
792+
codes = list(self._grouper.result_index.codes)
793+
else:
794+
codes = [
795+
algorithms.factorize(
796+
self._grouper.result_index,
797+
sort=self._grouper._sort,
798+
use_na_sentinel=self._grouper.dropna,
799+
)[0]
800+
]
793801
codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
794-
levels = [ping._group_index for ping in self._grouper.groupings] + [lev]
802+
levels = self._grouper.levels + [lev]
795803

796804
if dropna:
797805
mask = codes[-1] != -1
@@ -834,7 +842,7 @@ def value_counts(
834842
# ndarray[Any, Any]], Index, Series]]
835843
_, idx = get_join_indexers(
836844
left, # type: ignore[arg-type]
837-
right, # type: ignore[arg-type]
845+
right,
838846
sort=False,
839847
how="left",
840848
)
@@ -1605,7 +1613,7 @@ def _wrap_applied_output_series(
16051613
if not self.as_index:
16061614
result = self._insert_inaxis_grouper(result)
16071615

1608-
return self._reindex_output(result)
1616+
return result
16091617

16101618
def _cython_transform(
16111619
self,

0 commit comments

Comments
 (0)