Skip to content

Commit c94a68c

Browse files
authored
DOC: followup to #20583, observed kwarg for .groupby (#20941)
1 parent be6f11e commit c94a68c

File tree

4 files changed

+25
-28
lines changed

4 files changed

+25
-28
lines changed

doc/source/groupby.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -994,7 +994,7 @@ is only interesting over one column (here ``colname``), it may be filtered
994994
Handling of (un)observed Categorical values
995995
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
996996

997-
When using a ``Categorical`` grouper (as a single or as part of multipler groupers), the ``observed`` keyword
997+
When using a ``Categorical`` grouper (as a single grouper, or as part of multiple groupers), the ``observed`` keyword
998998
controls whether to return a cartesian product of all possible grouper values (``observed=False``) or only those
999999
that are observed groupers (``observed=True``).
10001000

doc/source/whatsnew/v0.23.0.txt

+5-3
Original file line numberDiff line numberDiff line change
@@ -419,9 +419,11 @@ documentation. If you build an extension array, publicize it on our
419419
Categorical Groupers has gained an observed keyword
420420
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
421421

422-
In previous versions, grouping by 1 or more categorical columns would result in an index that was the cartesian product of all of the categories for
423-
each grouper, not just the observed values.``.groupby()`` has gained the ``observed`` keyword to toggle this behavior. The default remains backward
424-
compatible (generate a cartesian product). (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`)
422+
Grouping by a categorical includes the unobserved categories in the output.
423+
When grouping with multiple groupers, this means you get the cartesian product of all the
424+
categories, including combinations where there are no observations, which can result in a large
425+
number of groupers. We have added a keyword ``observed`` to control this behavior; it defaults to
426+
``observed=False`` for backward-compatibility. (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`, :issue:`20902`)
425427

426428

427429
.. ipython:: python

pandas/core/generic.py

+5-6
Original file line numberDiff line numberDiff line change
@@ -6584,7 +6584,7 @@ def clip_lower(self, threshold, axis=None, inplace=False):
65846584
axis=axis, inplace=inplace)
65856585

65866586
def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
6587-
group_keys=True, squeeze=False, observed=None, **kwargs):
6587+
group_keys=True, squeeze=False, observed=False, **kwargs):
65886588
"""
65896589
Group series using mapper (dict or key function, apply given function
65906590
to group, return result as series) or by a series of columns.
@@ -6617,11 +6617,10 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
66176617
squeeze : boolean, default False
66186618
reduce the dimensionality of the return type if possible,
66196619
otherwise return a consistent type
6620-
observed : boolean, default None
6621-
if True: only show observed values for categorical groupers.
6622-
if False: show all values for categorical groupers.
6623-
if None: if any categorical groupers, show a FutureWarning,
6624-
default to False.
6620+
observed : boolean, default False
6621+
This only applies if any of the groupers are Categoricals.
6622+
If True: only show observed values for categorical groupers.
6623+
If False: show all values for categorical groupers.
66256624
66266625
.. versionadded:: 0.23.0
66276626

pandas/core/groupby/groupby.py

+14-18
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ class _GroupBy(PandasObject, SelectionMixin):
556556
def __init__(self, obj, keys=None, axis=0, level=None,
557557
grouper=None, exclusions=None, selection=None, as_index=True,
558558
sort=True, group_keys=True, squeeze=False,
559-
observed=None, **kwargs):
559+
observed=False, **kwargs):
560560

561561
self._selection = selection
562562

@@ -2907,7 +2907,7 @@ class Grouping(object):
29072907
"""
29082908

29092909
def __init__(self, index, grouper=None, obj=None, name=None, level=None,
2910-
sort=True, observed=None, in_axis=False):
2910+
sort=True, observed=False, in_axis=False):
29112911

29122912
self.name = name
29132913
self.level = level
@@ -2964,12 +2964,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
29642964
# a passed Categorical
29652965
elif is_categorical_dtype(self.grouper):
29662966

2967-
# observed can be True/False/None
2968-
# we treat None as False. If in the future
2969-
# we need to warn if observed is not passed
2970-
# then we have this option
2971-
# gh-20583
2972-
29732967
self.all_grouper = self.grouper
29742968
self.grouper = self.grouper._codes_for_groupby(
29752969
self.sort, observed)
@@ -3088,7 +3082,7 @@ def groups(self):
30883082

30893083

30903084
def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
3091-
observed=None, mutated=False, validate=True):
3085+
observed=False, mutated=False, validate=True):
30923086
"""
30933087
create and return a BaseGrouper, which is an internal
30943088
mapping of how to create the grouper indexers.
@@ -4734,26 +4728,28 @@ def _wrap_agged_blocks(self, items, blocks):
47344728

47354729
def _reindex_output(self, result):
47364730
"""
4737-
if we have categorical groupers, then we want to make sure that
4731+
If we have categorical groupers, then we want to make sure that
47384732
we have a fully reindex-output to the levels. These may have not
47394733
participated in the groupings (e.g. may have all been
4740-
nan groups)
4734+
nan groups);
47414735
47424736
This can re-expand the output space
47434737
"""
47444738

4745-
# TODO(jreback): remove completely
4746-
# when observed parameter is defaulted to True
4747-
# gh-20583
4748-
4749-
if self.observed:
4750-
return result
4751-
4739+
# we need to re-expand the output space to accommodate all values
4740+
# whether observed or not in the cartesian product of our groupers
47524741
groupings = self.grouper.groupings
47534742
if groupings is None:
47544743
return result
47554744
elif len(groupings) == 1:
47564745
return result
4746+
4747+
# if we only care about the observed values
4748+
# we are done
4749+
elif self.observed:
4750+
return result
4751+
4752+
# reindexing only applies to a Categorical grouper
47574753
elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
47584754
for ping in groupings):
47594755
return result

0 commit comments

Comments
 (0)