pandas-dev · jseabold · Aug 28, 2020 · Aug 28, 2020 · Aug 28, 2020 · Aug 28, 2020
diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst
@@ -702,11 +702,11 @@ Sorting is per order in the categories, not lexical order.
 
     df.sort_values(by="grade")
 
-Grouping by a categorical column also shows empty categories.
+Grouping by a categorical column with ``observed=False`` also shows empty categories.
 
 .. ipython:: python
 
-    df.groupby("grade").size()
+    df.groupby("grade", observed=False).size()
 
 
 Plotting

diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
@@ -809,8 +809,8 @@ Groupby operations on the index will preserve the index nature as well.
 
 .. ipython:: python
 
-   df2.groupby(level=0).sum()
-   df2.groupby(level=0).sum().index
+   df2.groupby(level=0, observed=False).sum()
+   df2.groupby(level=0, observed=False).sum().index
 
 Reindexing operations will return a resulting index based on the type of the passed
 indexer. Passing a list will return a plain-old ``Index``; indexing with

diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
@@ -622,7 +622,7 @@ even if some categories are not present in the data:
     s = pd.Series(pd.Categorical(["a", "b", "c", "c"], categories=["c", "a", "b", "d"]))
     s.value_counts()
 
-``DataFrame`` methods like :meth:`DataFrame.sum` also show "unused" categories.
+``DataFrame`` methods like :meth:`DataFrame.sum` also show "unused" categories:
 
 .. ipython:: python
 
@@ -635,15 +635,16 @@ even if some categories are not present in the data:
     )
     df.sum(axis=1, level=1)
 
-Groupby will also show "unused" categories:
+Groupby will also show "unused" categories by default, though this behavior
+is deprecated. In a future release, users must specify a value for ``observed``:
 
 .. ipython:: python
 
     cats = pd.Categorical(
         ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"]
     )
     df = pd.DataFrame({"cats": cats, "values": [1, 2, 2, 2, 3, 4, 5]})
-    df.groupby("cats").mean()
+    df.groupby("cats", observed=False).mean()
 
     cats2 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"])
     df2 = pd.DataFrame(
@@ -653,7 +654,7 @@ Groupby will also show "unused" categories:
             "values": [1, 2, 3, 4],
         }
     )
-    df2.groupby(["cats", "B"]).mean()
+    df2.groupby(["cats", "B"], observed=False).mean()
 
 
 Pivot tables:
@@ -662,7 +663,7 @@ Pivot tables:
 
     raw_cat = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"])
     df = pd.DataFrame({"A": raw_cat, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]})
-    pd.pivot_table(df, values="values", index=["A", "B"])
+    pd.pivot_table(df, values="values", index=["A", "B"], observed=False)
 
 Data munging
 ------------

diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -1269,7 +1269,7 @@ can be used as group keys. If so, the order of the levels will be preserved:
 
    factor = pd.qcut(data, [0, 0.25, 0.5, 0.75, 1.0])
 
-   data.groupby(factor).mean()
+   data.groupby(factor, observed=True).mean()
 
 .. _groupby.specify:
 

diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
@@ -1131,6 +1131,7 @@ An analogous change has been made to ``MultiIndex.from_product``.
 As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes in indexes
 
 .. ipython:: python
+   :okwarning:
 
    df = pd.DataFrame({"A": [0, 1], "B": [10, 11], "C": cat})
    df_grouped = df.groupby(by=["A", "C"]).first()

diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst
@@ -291,6 +291,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr
 **New behavior**:
 
 .. ipython:: python
+   :okwarning:
 
    df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum()
 

diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst
@@ -118,6 +118,7 @@ instead of ``NaN``.
 *pandas 0.22*
 
 .. ipython:: python
+   :okwarning:
 
    grouper = pd.Categorical(["a", "a"], categories=["a", "b"])
    pd.Series([1, 2]).groupby(grouper).sum()
@@ -126,6 +127,7 @@ To restore the 0.21 behavior of returning ``NaN`` for unobserved groups,
 use ``min_count>=1``.
 
 .. ipython:: python
+   :okwarning:
 
    pd.Series([1, 2]).groupby(grouper).sum(min_count=1)
 

diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst
@@ -288,6 +288,7 @@ For pivoting operations, this behavior is *already* controlled by the ``dropna``
    df
 
 .. ipython:: python
+   :okwarning:
 
    pd.pivot_table(df, values='values', index=['A', 'B'],
                   dropna=True)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -522,6 +522,7 @@ Deprecations
 - Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`)
 - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`)
 - The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`)
+- Deprecated the default value of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`DataFrame.pivot_table`; pass ``observed`` explicitly when grouping by a categorical (:issue:`17594`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5677,7 +5677,7 @@ def value_counts(
         if subset is None:
             subset = self.columns.tolist()
 
-        counts = self.groupby(subset).grouper.size()
+        counts = self.groupby(subset, observed=True).grouper.size()
 
         if sort:
             counts = counts.sort_values(ascending=ascending)
@@ -6698,7 +6698,7 @@ def groupby(
         sort: bool = True,
         group_keys: bool = True,
         squeeze: bool = no_default,
-        observed: bool = False,
+        observed: Optional[bool] = None,
         dropna: bool = True,
     ) -> DataFrameGroupBy:
         from pandas.core.groupby.generic import DataFrameGroupBy
@@ -7029,7 +7029,7 @@ def pivot_table(
         margins=False,
         dropna=True,
         margins_name="All",
-        observed=False,
+        observed=None,
     ) -> DataFrame:
         from pandas.core.reshape.pivot import pivot_table
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -87,10 +87,15 @@
 from pandas.core.dtypes.missing import isna, notna
 
 import pandas as pd
-from pandas.core import arraylike, indexing, missing, nanops
-import pandas.core.algorithms as algos
+from pandas.core import (
+    algorithms as algos,
+    arraylike,
+    common as com,
+    indexing,
+    missing,
+    nanops,
+)
 from pandas.core.base import PandasObject, SelectionMixin
-import pandas.core.common as com
 from pandas.core.construction import create_series_with_explicit_dtype
 from pandas.core.flags import Flags
 from pandas.core.indexes import base as ibase
@@ -10545,7 +10550,8 @@ def pct_change(
     def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs):
         if axis is None:
             raise ValueError("Must specify 'axis' when aggregating by level.")
-        grouped = self.groupby(level=level, axis=axis, sort=False)
+        # see pr-35967 for discussion about the observed keyword
+        grouped = self.groupby(level=level, axis=axis, sort=False, observed=False)
         if hasattr(grouped, name) and skipna:
             return getattr(grouped, name)(**kwargs)
         axis = self._get_axis_number(axis)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -526,7 +526,7 @@ def __init__(
         sort: bool = True,
         group_keys: bool = True,
         squeeze: bool = False,
-        observed: bool = False,
+        observed: Optional[bool] = None,
         mutated: bool = False,
         dropna: bool = True,
     ):
@@ -3016,7 +3016,7 @@ def get_groupby(
     sort: bool = True,
     group_keys: bool = True,
     squeeze: bool = False,
-    observed: bool = False,
+    observed: Optional[bool] = None,
     mutated: bool = False,
     dropna: bool = True,
 ) -> GroupBy:

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -2,6 +2,7 @@
 Provide user facing operators for doing the split part of the
 split-apply-combine paradigm.
 """
+import textwrap
 from typing import Dict, Hashable, List, Optional, Set, Tuple
 import warnings
 
@@ -31,6 +32,18 @@
 
 from pandas.io.formats.printing import pprint_thing
 
+_observed_msg = textwrap.dedent(
+    """\
+Grouping by a categorical but 'observed' was not specified.
+Using 'observed=False', but in a future version of pandas
+not specifying 'observed' will raise an error. Pass
+'observed=True' or 'observed=False' to silence this warning.
+
+See the `groupby` documentation for more information on the
+observed keyword.
+"""
+)
+
 
 class Grouper:
     """
@@ -432,7 +445,7 @@ def __init__(
         name=None,
         level=None,
         sort: bool = True,
-        observed: bool = False,
+        observed: Optional[bool] = None,
         in_axis: bool = False,
         dropna: bool = True,
     ):
@@ -495,6 +508,10 @@ def __init__(
             # a passed Categorical
             elif is_categorical_dtype(self.grouper):
 
+                if observed is None:
+                    warnings.warn(_observed_msg, FutureWarning)
+                    observed = False
+
                 self.grouper, self.all_grouper = recode_for_groupby(
                     self.grouper, self.sort, observed
                 )
@@ -631,7 +648,7 @@ def get_grouper(
     axis: int = 0,
     level=None,
     sort: bool = True,
-    observed: bool = False,
+    observed: Optional[bool] = None,
     mutated: bool = False,
     validate: bool = True,
     dropna: bool = True,

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -493,7 +493,12 @@ def _format_duplicate_message(self):
         duplicates = self[self.duplicated(keep="first")].unique()
         assert len(duplicates)
 
-        out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates]
+        # see pr-35967 about the observed keyword
+        out = (
+            Series(np.arange(len(self)))
+            .groupby(self, observed=False)
+            .agg(list)[duplicates]
+        )
         if self.nlevels == 1:
             out = out.rename_axis("label")
         return out.to_frame(name="positions")

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -109,13 +109,15 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec
     if not isinstance(by, (list, tuple)):
         by = [by]
 
-    lby = left.groupby(by, sort=False)
+    # see pr-35967 for discussion about observed=False
+    # this is the previous default behavior if the group is a categorical
+    lby = left.groupby(by, sort=False, observed=False)
     rby: Optional[groupby.DataFrameGroupBy] = None
 
     # if we can groupby the rhs
     # then we can get vastly better perf
     if all(item in right.columns for item in by):
-        rby = right.groupby(by, sort=False)
+        rby = right.groupby(by, sort=False, observed=False)
 
     for key, lhs in lby:
 

diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -46,7 +46,7 @@ def pivot_table(
     margins=False,
     dropna=True,
     margins_name="All",
-    observed=False,
+    observed=None,
 ) -> "DataFrame":
     index = _convert_by(index)
     columns = _convert_by(columns)
@@ -612,6 +612,8 @@ def crosstab(
         margins=margins,
         margins_name=margins_name,
         dropna=dropna,
+        # the below is only here to silence the FutureWarning
+        observed=False,
         **kwargs,
     )
 

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1674,7 +1674,7 @@ def groupby(
         sort: bool = True,
         group_keys: bool = True,
         squeeze: bool = no_default,
-        observed: bool = False,
+        observed: Optional[bool] = None,
         dropna: bool = True,
     ) -> "SeriesGroupBy":
         from pandas.core.groupby.generic import SeriesGroupBy

diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
@@ -119,6 +119,17 @@
     This only applies if any of the groupers are Categoricals.
     If True: only show observed values for categorical groupers.
     If False: show all values for categorical groupers.
+
+    The current default of ``observed=False`` is deprecated. In
+    the future this will be a required keyword in the presence
+    of a categorical grouper and a failure to specify a value will
+    result in an error.
+
+    Explicitly pass ``observed=True`` to silence the warning and
+    show only the observed values.
+    Explicitly pass ``observed=False`` to silence the warning and
+    show groups for all categories, whether observed or not.
+
 dropna : bool, default True
     If True, and if group keys contain NA values, NA values together
     with row/column will be dropped.

diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py
@@ -195,7 +195,7 @@ def _grouped_plot_by_column(
     return_type=None,
     **kwargs,
 ):
-    grouped = data.groupby(by)
+    grouped = data.groupby(by, observed=False)
     if columns is None:
         if not isinstance(by, (list, tuple)):
             by = [by]

diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -13,8 +13,7 @@
 from pandas.core.dtypes.common import is_integer_dtype
 
 import pandas as pd
-from pandas import DataFrame, Index, MultiIndex, Series, concat
-import pandas._testing as tm
+from pandas import DataFrame, Index, MultiIndex, Series, _testing as tm, concat
 from pandas.core.base import SpecificationError
 from pandas.core.groupby.grouper import Grouping
 
@@ -1074,7 +1073,7 @@ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data):
 
     input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
     input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
-    result_df = input_df.groupby("cat").agg(grp_col_dict)
+    result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict)
 
     # create expected dataframe
     cat_index = pd.CategoricalIndex(
@@ -1108,7 +1107,7 @@ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data):
 
     input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
     input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
-    result_df = input_df.groupby("cat").agg(grp_col_dict)
+    result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict)
 
     # create expected dataframe
     cat_index = pd.CategoricalIndex(