Skip to content

Commit 9e76f4a

Browse files
krsnik93 authored and jreback committed
Fix 'observed' kwarg not doing anything on SeriesGroupBy (pandas-dev#26463)
1 parent 072408e commit 9e76f4a

File tree

4 files changed

+268
-166
lines changed

4 files changed

+268
-166
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -553,6 +553,7 @@ Groupby/Resample/Rolling
553553
- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`)
554554
- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`)
555555
- Bug in :func:`pandas.core.groupby.GroupBy.size` when grouping only NA values (:issue:`23050`)
556+
- Bug in :func:`Series.groupby` where the ``observed`` kwarg was previously ignored (:issue:`24880`)
556557
- Bug in :func:`Series.groupby` where using ``groupby`` with a :class:`MultiIndex` Series with a list of labels equal to the length of the series caused incorrect grouping (:issue:`25704`)
557558
- Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`)
558559
- Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`, :issue:`25167`)

pandas/core/groupby/generic.py

+18-75
Original file line number | Diff line number | Diff line change
@@ -28,15 +28,14 @@
2828
from pandas.core.dtypes.missing import isna, notna
2929

3030
import pandas.core.algorithms as algorithms
31-
from pandas.core.arrays import Categorical
3231
from pandas.core.base import DataError, SpecificationError
3332
import pandas.core.common as com
3433
from pandas.core.frame import DataFrame
3534
from pandas.core.generic import NDFrame, _shared_docs
3635
from pandas.core.groupby import base
3736
from pandas.core.groupby.groupby import (
3837
GroupBy, _apply_docs, _transform_template)
39-
from pandas.core.index import CategoricalIndex, Index, MultiIndex
38+
from pandas.core.index import Index, MultiIndex
4039
import pandas.core.indexes.base as ibase
4140
from pandas.core.internals import BlockManager, make_block
4241
from pandas.core.series import Series
@@ -852,9 +851,10 @@ def _wrap_output(self, output, index, names=None):
852851
return Series(output, index=index, name=name)
853852

854853
def _wrap_aggregated_output(self, output, names=None):
855-
return self._wrap_output(output=output,
856-
index=self.grouper.result_index,
857-
names=names)
854+
result = self._wrap_output(output=output,
855+
index=self.grouper.result_index,
856+
names=names)
857+
return self._reindex_output(result)._convert(datetime=True)
858858

859859
def _wrap_transformed_output(self, output, names=None):
860860
return self._wrap_output(output=output,
@@ -874,23 +874,28 @@ def _get_index():
874874
return index
875875

876876
if isinstance(values[0], dict):
877-
# GH #823
877+
# GH #823 #24880
878878
index = _get_index()
879-
result = DataFrame(values, index=index).stack()
879+
result = self._reindex_output(DataFrame(values, index=index))
880+
# if self.observed is False,
881+
# keep all-NaN rows created while re-indexing
882+
result = result.stack(dropna=self.observed)
880883
result.name = self._selection_name
881884
return result
882885

883-
if isinstance(values[0], (Series, dict)):
886+
if isinstance(values[0], Series):
884887
return self._concat_objects(keys, values,
885888
not_indexed_same=not_indexed_same)
886889
elif isinstance(values[0], DataFrame):
887890
# possible that Series -> DataFrame by applied function
888891
return self._concat_objects(keys, values,
889892
not_indexed_same=not_indexed_same)
890893
else:
891-
# GH #6265
892-
return Series(values, index=_get_index(),
893-
name=self._selection_name)
894+
# GH #6265 #24880
895+
result = Series(data=values,
896+
index=_get_index(),
897+
name=self._selection_name)
898+
return self._reindex_output(result)
894899

895900
def _aggregate_named(self, func, *args, **kwargs):
896901
result = OrderedDict()
@@ -1373,7 +1378,8 @@ def _gotitem(self, key, ndim, subset=None):
13731378
if subset is None:
13741379
subset = self.obj[key]
13751380
return SeriesGroupBy(subset, selection=key,
1376-
grouper=self.grouper)
1381+
grouper=self.grouper,
1382+
observed=self.observed)
13771383

13781384
raise AssertionError("invalid ndim for _gotitem")
13791385

@@ -1445,69 +1451,6 @@ def _wrap_agged_blocks(self, items, blocks):
14451451

14461452
return self._reindex_output(result)._convert(datetime=True)
14471453

1448-
def _reindex_output(self, result):
1449-
"""
1450-
If we have categorical groupers, then we want to make sure that
1451-
we have a fully reindex-output to the levels. These may have not
1452-
participated in the groupings (e.g. may have all been
1453-
nan groups);
1454-
1455-
This can re-expand the output space
1456-
"""
1457-
1458-
# we need to re-expand the output space to accomodate all values
1459-
# whether observed or not in the cartesian product of our groupes
1460-
groupings = self.grouper.groupings
1461-
if groupings is None:
1462-
return result
1463-
elif len(groupings) == 1:
1464-
return result
1465-
1466-
# if we only care about the observed values
1467-
# we are done
1468-
elif self.observed:
1469-
return result
1470-
1471-
# reindexing only applies to a Categorical grouper
1472-
elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
1473-
for ping in groupings):
1474-
return result
1475-
1476-
levels_list = [ping.group_index for ping in groupings]
1477-
index, _ = MultiIndex.from_product(
1478-
levels_list, names=self.grouper.names).sortlevel()
1479-
1480-
if self.as_index:
1481-
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
1482-
return result.reindex(**d)
1483-
1484-
# GH 13204
1485-
# Here, the categorical in-axis groupers, which need to be fully
1486-
# expanded, are columns in `result`. An idea is to do:
1487-
# result = result.set_index(self.grouper.names)
1488-
# .reindex(index).reset_index()
1489-
# but special care has to be taken because of possible not-in-axis
1490-
# groupers.
1491-
# So, we manually select and drop the in-axis grouper columns,
1492-
# reindex `result`, and then reset the in-axis grouper columns.
1493-
1494-
# Select in-axis groupers
1495-
in_axis_grps = ((i, ping.name) for (i, ping)
1496-
in enumerate(groupings) if ping.in_axis)
1497-
g_nums, g_names = zip(*in_axis_grps)
1498-
1499-
result = result.drop(labels=list(g_names), axis=1)
1500-
1501-
# Set a temp index and reindex (possibly expanding)
1502-
result = result.set_index(self.grouper.result_index
1503-
).reindex(index, copy=False)
1504-
1505-
# Reset in-axis grouper columns
1506-
# (using level numbers `g_nums` because level names may not be unique)
1507-
result = result.reset_index(level=g_nums)
1508-
1509-
return result.reset_index(drop=True)
1510-
15111454
def _iterate_column_groupbys(self):
15121455
for i, colname in enumerate(self._selected_obj.columns):
15131456
yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i],

pandas/core/groupby/groupby.py

+75-1
Original file line number | Diff line number | Diff line change
@@ -36,13 +36,14 @@ class providing the base-class of operations.
3636
from pandas.api.types import (
3737
is_datetime64_dtype, is_integer_dtype, is_object_dtype)
3838
import pandas.core.algorithms as algorithms
39+
from pandas.core.arrays import Categorical
3940
from pandas.core.base import (
4041
DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError)
4142
import pandas.core.common as com
4243
from pandas.core.frame import DataFrame
4344
from pandas.core.generic import NDFrame
4445
from pandas.core.groupby import base
45-
from pandas.core.index import Index, MultiIndex
46+
from pandas.core.index import CategoricalIndex, Index, MultiIndex
4647
from pandas.core.series import Series
4748
from pandas.core.sorting import get_group_index_sorter
4849

@@ -2301,6 +2302,79 @@ def tail(self, n=5):
23012302
mask = self._cumcount_array(ascending=False) < n
23022303
return self._selected_obj[mask]
23032304

2305+
def _reindex_output(self, output):
2306+
"""
2307+
If we have categorical groupers, then we might want to make sure that
2308+
we have a fully re-indexed output to the levels. This means expanding
2309+
the output space to accommodate all values in the cartesian product of
2310+
our groups, regardless of whether they were observed in the data or
2311+
not. This will expand the output space if there are missing groups.
2312+
2313+
The method returns early without modifying the input if the number of
2314+
groupings is less than 2, self.observed == True or none of the groupers
2315+
are categorical.
2316+
2317+
Parameters
2318+
----------
2319+
output : Series or DataFrame
2320+
Object resulting from grouping and applying an operation.
2321+
2322+
Returns
2323+
-------
2324+
Series or DataFrame
2325+
Object (potentially) re-indexed to include all possible groups.
2326+
"""
2327+
groupings = self.grouper.groupings
2328+
if groupings is None:
2329+
return output
2330+
elif len(groupings) == 1:
2331+
return output
2332+
2333+
# if we only care about the observed values
2334+
# we are done
2335+
elif self.observed:
2336+
return output
2337+
2338+
# reindexing only applies to a Categorical grouper
2339+
elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
2340+
for ping in groupings):
2341+
return output
2342+
2343+
levels_list = [ping.group_index for ping in groupings]
2344+
index, _ = MultiIndex.from_product(
2345+
levels_list, names=self.grouper.names).sortlevel()
2346+
2347+
if self.as_index:
2348+
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
2349+
return output.reindex(**d)
2350+
2351+
# GH 13204
2352+
# Here, the categorical in-axis groupers, which need to be fully
2353+
# expanded, are columns in `output`. An idea is to do:
2354+
# output = output.set_index(self.grouper.names)
2355+
# .reindex(index).reset_index()
2356+
# but special care has to be taken because of possible not-in-axis
2357+
# groupers.
2358+
# So, we manually select and drop the in-axis grouper columns,
2359+
# reindex `output`, and then reset the in-axis grouper columns.
2360+
2361+
# Select in-axis groupers
2362+
in_axis_grps = ((i, ping.name) for (i, ping)
2363+
in enumerate(groupings) if ping.in_axis)
2364+
g_nums, g_names = zip(*in_axis_grps)
2365+
2366+
output = output.drop(labels=list(g_names), axis=1)
2367+
2368+
# Set a temp index and reindex (possibly expanding)
2369+
output = output.set_index(self.grouper.result_index
2370+
).reindex(index, copy=False)
2371+
2372+
# Reset in-axis grouper columns
2373+
# (using level numbers `g_nums` because level names may not be unique)
2374+
output = output.reset_index(level=g_nums)
2375+
2376+
return output.reset_index(drop=True)
2377+
23042378

23052379
GroupBy._add_numeric_operations()
23062380

0 commit comments

Comments
 (0)