diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index cf548ba5d1133..81c33b53e21a8 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -702,11 +702,11 @@ Sorting is per order in the categories, not lexical order. df.sort_values(by="grade") -Grouping by a categorical column also shows empty categories. +Grouping by a categorical column can also show empty categories, using the observed keyword. .. ipython:: python - df.groupby("grade").size() + df.groupby("grade", observed=False).size() Plotting diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 2cd48ac7adb0e..f952bd9150ce5 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -809,8 +809,8 @@ Groupby operations on the index will preserve the index nature as well. .. ipython:: python - df2.groupby(level=0).sum() - df2.groupby(level=0).sum().index + df2.groupby(level=0, observed=False).sum() + df2.groupby(level=0, observed=False).sum().index Reindexing operations will return a resulting index based on the type of the passed indexer. Passing a list will return a plain-old ``Index``; indexing with diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 5c43de05fb5b9..0221bc4101b63 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -622,7 +622,7 @@ even if some categories are not present in the data: s = pd.Series(pd.Categorical(["a", "b", "c", "c"], categories=["c", "a", "b", "d"])) s.value_counts() -``DataFrame`` methods like :meth:`DataFrame.sum` also show "unused" categories. +``DataFrame`` methods like :meth:`DataFrame.sum` also show "unused" categories: .. 
ipython:: python @@ -635,7 +635,8 @@ even if some categories are not present in the data: ) df.sum(axis=1, level=1) -Groupby will also show "unused" categories: +Groupby will also show "unused" categories by default, though this behavior +is deprecated. In a future release, users must specify a value for ``observed``: .. ipython:: python @@ -643,7 +644,7 @@ Groupby will also show "unused" categories: ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"] ) df = pd.DataFrame({"cats": cats, "values": [1, 2, 2, 2, 3, 4, 5]}) - df.groupby("cats").mean() + df.groupby("cats", observed=False).mean() cats2 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) df2 = pd.DataFrame( @@ -653,7 +654,7 @@ Groupby will also show "unused" categories: "values": [1, 2, 3, 4], } ) - df2.groupby(["cats", "B"]).mean() + df2.groupby(["cats", "B"], observed=False).mean() Pivot tables: @@ -662,7 +663,7 @@ Pivot tables: raw_cat = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) df = pd.DataFrame({"A": raw_cat, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]}) - pd.pivot_table(df, values="values", index=["A", "B"]) + pd.pivot_table(df, values="values", index=["A", "B"], observed=False) Data munging ------------ diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index d6081155b58db..b6f30beae1dbb 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1269,7 +1269,7 @@ can be used as group keys. If so, the order of the levels will be preserved: factor = pd.qcut(data, [0, 0.25, 0.5, 0.75, 1.0]) - data.groupby(factor).mean() + data.groupby(factor, observed=True).mean() .. _groupby.specify: diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 340e1ce9ee1ef..cec8e44806250 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1131,6 +1131,7 @@ An analogous change has been made to ``MultiIndex.from_product``. 
As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes in indexes .. ipython:: python + :okwarning: df = pd.DataFrame({"A": [0, 1], "B": [10, 11], "C": cat}) df_grouped = df.groupby(by=["A", "C"]).first() diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 2cb8e13e9a18a..dbd77aab4ff3d 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -291,6 +291,7 @@ In previous versions, ``.groupby(..., sort=False)`` would fail with a ``ValueErr **New behavior**: .. ipython:: python + :okwarning: df[df.chromosomes != '1'].groupby('chromosomes', sort=False).sum() diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index ec9769c22e76b..d8672be0bc711 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -118,6 +118,7 @@ instead of ``NaN``. *pandas 0.22* .. ipython:: python + :okwarning: grouper = pd.Categorical(["a", "a"], categories=["a", "b"]) pd.Series([1, 2]).groupby(grouper).sum() @@ -126,6 +127,7 @@ To restore the 0.21 behavior of returning ``NaN`` for unobserved groups, use ``min_count>=1``. .. ipython:: python + :okwarning: pd.Series([1, 2]).groupby(grouper).sum(min_count=1) diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index f4caea9d363eb..a763803d6fa3b 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -288,6 +288,7 @@ For pivoting operations, this behavior is *already* controlled by the ``dropna`` df .. 
ipython:: python + :okwarning: pd.pivot_table(df, values='values', index=['A', 'B'], dropna=True) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8dbc6728dccfe..ce6e2a1395868 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -522,6 +522,7 @@ Deprecations - Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`) - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`) - The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`) +- Deprecated the default value of ``observed=False`` in :meth:`~DataFrame.groupby` and :meth:`~DataFrame.pivot_table` (:issue:`17594`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f149f10b05d3..53f72abd8d93f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5677,7 +5677,7 @@ def value_counts( if subset is None: subset = self.columns.tolist() - counts = self.groupby(subset).grouper.size() + counts = self.groupby(subset, observed=True).grouper.size() if sort: counts = counts.sort_values(ascending=ascending) @@ -6698,7 +6698,7 @@ def groupby( sort: bool = True, group_keys: bool = True, squeeze: bool = no_default, - observed: bool = False, + observed: Optional[bool] = None, dropna: bool = True, ) -> DataFrameGroupBy: from pandas.core.groupby.generic import DataFrameGroupBy @@ -7029,7 +7029,7 @@ def pivot_table( margins=False, dropna=True, margins_name="All", - observed=False, + observed=None, ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4a9e020a0fe46..61cdc6b98d919 100644 ---
a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -87,10 +87,15 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas.core import arraylike, indexing, missing, nanops -import pandas.core.algorithms as algos +from pandas.core import ( + algorithms as algos, + arraylike, + common as com, + indexing, + missing, + nanops, +) from pandas.core.base import PandasObject, SelectionMixin -import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.flags import Flags from pandas.core.indexes import base as ibase @@ -10545,7 +10550,8 @@ def pct_change( def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): if axis is None: raise ValueError("Must specify 'axis' when aggregating by level.") - grouped = self.groupby(level=level, axis=axis, sort=False) + # see pr-35967 for discussion about the observed keyword + grouped = self.groupby(level=level, axis=axis, sort=False, observed=False) if hasattr(grouped, name) and skipna: return getattr(grouped, name)(**kwargs) axis = self._get_axis_number(axis) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 798c0742f03e5..98d26ccb34a00 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -526,7 +526,7 @@ def __init__( sort: bool = True, group_keys: bool = True, squeeze: bool = False, - observed: bool = False, + observed: Optional[bool] = None, mutated: bool = False, dropna: bool = True, ): @@ -3016,7 +3016,7 @@ def get_groupby( sort: bool = True, group_keys: bool = True, squeeze: bool = False, - observed: bool = False, + observed: Optional[bool] = None, mutated: bool = False, dropna: bool = True, ) -> GroupBy: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e8af9da30a298..23b562301aeb1 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -2,6 +2,7 @@ Provide user facing operators for doing the split 
part of the split-apply-combine paradigm. """ +import textwrap from typing import Dict, Hashable, List, Optional, Set, Tuple import warnings @@ -31,6 +32,18 @@ from pandas.io.formats.printing import pprint_thing +_observed_msg = textwrap.dedent( + """\ +Grouping by a categorical but 'observed' was not specified. +Using 'observed=False', but in a future version of pandas +not specifying 'observed' will raise an error. Pass +'observed=True' or 'observed=False' to silence this warning. + +See the `groupby` documentation for more information on the +observed keyword. +""" +) + class Grouper: """ @@ -432,7 +445,7 @@ def __init__( name=None, level=None, sort: bool = True, - observed: bool = False, + observed: Optional[bool] = None, in_axis: bool = False, dropna: bool = True, ): @@ -495,6 +508,10 @@ def __init__( # a passed Categorical elif is_categorical_dtype(self.grouper): + if observed is None: + warnings.warn(_observed_msg, FutureWarning) + observed = False + self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed ) @@ -631,7 +648,7 @@ def get_grouper( axis: int = 0, level=None, sort: bool = True, - observed: bool = False, + observed: Optional[bool] = None, mutated: bool = False, validate: bool = True, dropna: bool = True, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 52ffb1567cb2d..c9ffc9a69281b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -493,7 +493,12 @@ def _format_duplicate_message(self): duplicates = self[self.duplicated(keep="first")].unique() assert len(duplicates) - out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates] + # see pr-35967 about the observed keyword + out = ( + Series(np.arange(len(self))) + .groupby(self, observed=False) + .agg(list)[duplicates] + ) if self.nlevels == 1: out = out.rename_axis("label") return out.to_frame(name="positions") diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 
2c6cdb846221f..94d8b50cf5597 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -109,13 +109,15 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec if not isinstance(by, (list, tuple)): by = [by] - lby = left.groupby(by, sort=False) + # see pr-35967 for discussion about observed=False + # this is the previous default behavior if the group is a categorical + lby = left.groupby(by, sort=False, observed=False) rby: Optional[groupby.DataFrameGroupBy] = None # if we can groupby the rhs # then we can get vastly better perf if all(item in right.columns for item in by): - rby = right.groupby(by, sort=False) + rby = right.groupby(by, sort=False, observed=False) for key, lhs in lby: diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 40496a5b8671b..19a56b1651197 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -46,7 +46,7 @@ def pivot_table( margins=False, dropna=True, margins_name="All", - observed=False, + observed=None, ) -> "DataFrame": index = _convert_by(index) columns = _convert_by(columns) @@ -612,6 +612,8 @@ def crosstab( margins=margins, margins_name=margins_name, dropna=dropna, + # the below is only here to silence the FutureWarning + observed=False, **kwargs, ) diff --git a/pandas/core/series.py b/pandas/core/series.py index b20cf8eed9a2e..b51e2a42293d0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1674,7 +1674,7 @@ def groupby( sort: bool = True, group_keys: bool = True, squeeze: bool = no_default, - observed: bool = False, + observed: Optional[bool] = None, dropna: bool = True, ) -> "SeriesGroupBy": from pandas.core.groupby.generic import SeriesGroupBy diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 3aeb3b664b27f..92e52a3d174dd 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -119,6 +119,17 @@ This only applies if any of the groupers are Categoricals. 
If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. + + The current default of ``observed=False`` is deprecated. In + the future this will be a required keyword in the presence + of a categorical grouper and a failure to specify a value will + result in an error. + + Explicitly pass ``observed=True`` to silence the warning and + show only observed values. + Explicitly pass ``observed=False`` to silence the warning and + show groups for all values, including unobserved ones. + dropna : bool, default True If True, and if group keys contain NA values, NA values together with row/column will be dropped. diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 7122a38db9d0a..82bf1af5da297 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -195,7 +195,7 @@ def _grouped_plot_by_column( return_type=None, **kwargs, ): - grouped = data.groupby(by) + grouped = data.groupby(by, observed=False) if columns is None: if not isinstance(by, (list, tuple)): by = [by] diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 073918eda3deb..cd3757f6a5ecf 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -13,8 +13,7 @@ from pandas.core.dtypes.common import is_integer_dtype import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, concat -import pandas._testing as tm +from pandas import DataFrame, Index, MultiIndex, Series, _testing as tm, concat from pandas.core.base import SpecificationError from pandas.core.groupby.grouper import Grouping @@ -1074,7 +1073,7 @@ def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df =
input_df.groupby("cat").agg(grp_col_dict) + result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict) # create expected dataframe cat_index = pd.CategoricalIndex( @@ -1108,7 +1107,7 @@ def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() - result_df = input_df.groupby("cat").agg(grp_col_dict) + result_df = input_df.groupby("cat", observed=False).agg(grp_col_dict) # create expected dataframe cat_index = pd.CategoricalIndex( diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index c907391917ca8..6e96605418731 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -1,13 +1,20 @@ """ test cython .agg behavior """ - import numpy as np import pytest import pandas as pd -from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range -import pandas._testing as tm +from pandas import ( + DataFrame, + Index, + NaT, + Series, + Timedelta, + Timestamp, + _testing as tm, + bdate_range, +) from pandas.core.groupby.groupby import DataError @@ -175,6 +182,7 @@ def test__cython_agg_general(op, targop): ("max", np.max), ], ) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_cython_agg_empty_buckets(op, targop, observed): df = DataFrame([11, 12, 13]) grps = range(0, 55, 5) @@ -189,6 +197,7 @@ def test_cython_agg_empty_buckets(op, targop, observed): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so hardcode expected # for these diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 5d0f6d6262899..5138f5de21a4c 100644 --- 
a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -1,7 +1,6 @@ """ test all other .agg behavior """ - import datetime as dt from functools import partial @@ -15,10 +14,10 @@ MultiIndex, PeriodIndex, Series, + _testing as tm, date_range, period_range, ) -import pandas._testing as tm from pandas.core.base import SpecificationError from pandas.io.formats.printing import pprint_thing @@ -555,6 +554,7 @@ def test_agg_structs_series(structure, expected): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_agg_category_nansum(observed): categories = ["a", "b", "c"] df = DataFrame( diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8cf77ca6335f4..a1b3f7fe2e463 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -11,9 +11,9 @@ Index, MultiIndex, Series, + _testing as tm, qcut, ) -import pandas._testing as tm def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN): @@ -212,6 +212,7 @@ def f(x): tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_level_get_group(observed): # GH15155 df = DataFrame( @@ -276,6 +277,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_observed(observed): # multiple groupers, don't re-expand the output space # of the grouper @@ -384,11 +386,13 @@ def test_observed(observed): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_observed_codes_remap(observed): d = {"C1": [3, 3, 4, 5], "C2": [1, 2, 3, 4], "C3": [10, 100, 200, 34]} df = DataFrame(d) values = pd.cut(df["C1"], [1, 2, 3, 6]) values.name = "cat" + 
groups_double_key = df.groupby([values, "C2"], observed=observed) idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"]) @@ -423,12 +427,14 @@ def test_observed_perf(): assert result.index.levels[2].nunique() == df.other_id.nunique() +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_observed_groups(observed): # gh-20583 # test that we have the appropriate groups cat = Categorical(["a", "c", "a"], categories=["a", "b", "c"]) df = DataFrame({"cat": cat, "vals": [1, 2, 3]}) + g = df.groupby("cat", observed=observed) result = g.groups @@ -444,6 +450,7 @@ def test_observed_groups(observed): tm.assert_dict_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_observed_groups_with_nan(observed): # GH 24740 df = DataFrame( @@ -480,6 +487,7 @@ def test_observed_nth(): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_dataframe_categorical_with_nan(observed): # GH 21151 s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"]) @@ -503,6 +511,7 @@ def test_dataframe_categorical_with_nan(observed): @pytest.mark.parametrize("ordered", [True, False]) @pytest.mark.parametrize("observed", [True, False]) @pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): # GH 25871: Fix groupby sorting on ordered Categoricals # GH 25167: Groupby with observed=True doesn't sort @@ -1062,7 +1071,7 @@ def test_groupby_multiindex_categorical_datetime(): "values": np.arange(9), } ) - result = df.groupby(["key1", "key2"]).mean() + result = df.groupby(["key1", "key2"], observed=False).mean() idx = MultiIndex.from_product( [ @@ -1167,6 +1176,7 @@ def test_seriesgroupby_observed_true(df_cat, operation, kwargs): @pytest.mark.parametrize("operation", 
["agg", "apply"]) @pytest.mark.parametrize("observed", [False, None]) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): # GH 24880 index, _ = MultiIndex.from_product( @@ -1231,6 +1241,7 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): ), ], ) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): # GH 24880 expected = Series(data=data, index=index, name="C") @@ -1242,12 +1253,13 @@ def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): def test_groupby_categorical_series_dataframe_consistent(df_cat): # GH 20416 - expected = df_cat.groupby(["A", "B"])["C"].mean() - result = df_cat.groupby(["A", "B"]).mean()["C"] + expected = df_cat.groupby(["A", "B"], observed=False)["C"].mean() + result = df_cat.groupby(["A", "B"], observed=False).mean()["C"] tm.assert_series_equal(result, expected) @pytest.mark.parametrize("code", [([1, 0, 0]), ([0, 0, 0])]) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_groupby_categorical_axis_1(code): # GH 13420 df = DataFrame({"a": [1, 2, 3, 4], "b": [-1, -2, -3, -4], "c": [5, 6, 7, 8]}) @@ -1257,6 +1269,7 @@ def test_groupby_categorical_axis_1(code): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_groupby_cat_preserves_structure(observed, ordered): # GH 28787 df = DataFrame( @@ -1285,6 +1298,7 @@ def test_get_nonexistent_category(): ) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed, request): # GH 17605 if reduction_func == "ngroup": @@ -1384,6 +1398,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_fun 
@pytest.mark.parametrize("observed", [False, None]) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( reduction_func, observed, request ): @@ -1417,6 +1432,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( assert (res.loc[unobserved_cats] == expected).all().all() +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_series_groupby_categorical_aggregation_getitem(): # GH 8870 d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} @@ -1472,6 +1488,7 @@ def test_groupy_first_returned_categorical_instead_of_dataframe(func): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_read_only_category_no_sort(): # GH33410 cats = np.array([1, 2]) @@ -1480,10 +1497,12 @@ def test_read_only_category_no_sort(): {"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))} ) expected = DataFrame(data={"a": [2, 6]}, index=CategoricalIndex([1, 2], name="b")) + result = df.groupby("b", sort=False).mean() tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_sorted_missing_category_values(): # GH 28597 df = DataFrame( @@ -1631,6 +1650,7 @@ def test_categorical_transform(): @pytest.mark.parametrize("func", ["first", "last"]) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals( func: str, observed: bool ): @@ -1656,6 +1676,7 @@ def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals( @pytest.mark.parametrize("func", ["first", "last"]) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals( func: str, observed: bool ): diff --git 
a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 12e570490487d..cc0c6c61e7e56 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -7,9 +7,17 @@ from pandas.errors import UnsupportedFunctionCall import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna -import pandas._testing as tm -import pandas.core.nanops as nanops +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + _testing as tm, + date_range, + isna, +) +from pandas.core import nanops as nanops from pandas.util import _test_decorators as td @@ -410,6 +418,7 @@ def test_cython_median(): tm.assert_frame_equal(rs, xp) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_median_empty_bins(observed): df = DataFrame(np.random.randint(0, 44, 500)) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7c179a79513fa..a96789a7c80ce 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -15,10 +15,10 @@ MultiIndex, Series, Timestamp, + _testing as tm, date_range, read_csv, ) -import pandas._testing as tm from pandas.core.base import SpecificationError import pandas.core.common as com @@ -2012,7 +2012,7 @@ def test_dup_labels_output_shape(groupby_func, idx): pytest.skip("Not applicable") df = DataFrame([[1, 1]], columns=idx) - grp_by = df.groupby([0]) + grp_by = df.groupby([0], observed=False) args = [] if groupby_func in {"fillna", "nth"}: diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index d268d87708552..574a42fb7224e 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -3,8 +3,7 @@ import numpy as np import pytest -from pandas import DataFrame, Series -import pandas._testing as tm +from pandas import DataFrame, Series, 
_testing as tm @pytest.mark.parametrize( @@ -21,7 +20,7 @@ def test_groupby_preserves_subclass(obj, groupby_func): if isinstance(obj, Series) and groupby_func in {"corrwith"}: pytest.skip("Not applicable") - grouped = obj.groupby(np.arange(0, 10)) + grouped = obj.groupby(np.arange(0, 10), observed=False) # Groups should preserve subclass type assert isinstance(grouped.get_group(0), type(obj)) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 1d2208592a06d..979b01371247f 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1,5 +1,4 @@ """ test where we are determining what we are grouping, or getting groups """ - import numpy as np import pytest @@ -11,9 +10,9 @@ MultiIndex, Series, Timestamp, + _testing as tm, date_range, ) -import pandas._testing as tm from pandas.core.groupby.grouper import Grouping # selection @@ -311,6 +310,7 @@ def test_groupby_levels_and_columns(self): by_columns.columns = by_columns.columns.astype(np.int64) tm.assert_frame_equal(by_levels, by_columns) + @pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_groupby_categorical_index_and_columns(self, observed): # GH18432, adapted for GH25871 columns = ["A", "B", "A", "B"] @@ -702,6 +702,29 @@ def test_groupby_multiindex_level_empty(self): ) tm.assert_frame_equal(result, expected) + def test_default_observed_deprecated(self): + # pr-35967 + df = DataFrame([["A", 1, 1], ["A", 2, 1], ["B", 1, 1]], columns=["x", "y", "z"]) + df.x = df.x.astype("category") + df.y = df.x.astype("category") + + with tm.assert_produces_warning( + expected_warning=FutureWarning, check_stacklevel=False + ): + df.groupby(["x", "y"]) + + with tm.assert_produces_warning(None) as any_warnings: + df.groupby(["x", "y"], observed=True) + df.groupby(["x", "y"], observed=False) + assert len(any_warnings) == 0 + + cat = pd.Categorical(["A", "B", "C"], categories=["A", "B", "C", "D"]) + s = Series(cat) + 
with tm.assert_produces_warning( + expected_warning=FutureWarning, check_stacklevel=False + ): + s.groupby(cat) + # get_group # -------------------------------- @@ -755,6 +778,7 @@ def test_get_group(self): with pytest.raises(ValueError, match=msg): g.get_group(("foo", "bar", "baz")) + @pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_get_group_empty_bins(self, observed): d = DataFrame([3, 1, 7, 6]) diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index ba27e5a24ba00..cb724d46bc0d1 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -1,8 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, Index, PeriodIndex, Series -import pandas._testing as tm +from pandas import DataFrame, Index, PeriodIndex, Series, _testing as tm @pytest.mark.parametrize("by", ["A", "B", ["A", "B"]]) @@ -50,7 +49,7 @@ def test_size_period_index(): def test_size_on_categorical(as_index): df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"]) df["A"] = df["A"].astype("category") - result = df.groupby(["A", "B"], as_index=as_index).size() + result = df.groupby(["A", "B"], as_index=as_index, observed=False).size() expected = DataFrame( [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"] diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 8acd051fbc643..71e182f34bb0a 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -13,10 +13,10 @@ MultiIndex, Series, Timestamp, + _testing as tm, concat, date_range, ) -import pandas._testing as tm from pandas.core.groupby.groupby import DataError @@ -994,7 +994,7 @@ def test_transform_absent_categories(func): x_cats = range(2) y = [1] df = DataFrame({"x": Categorical(x_vals, x_cats), "y": y}) - result = getattr(df.y.groupby(df.x), func)() + result = getattr(df.y.groupby(df.x, 
observed=False), func)() expected = df.y tm.assert_series_equal(result, expected) @@ -1153,6 +1153,7 @@ def test_transform_lambda_indexing(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_categorical_and_not_categorical_key(observed): # Checks that groupby-transform, when grouping by both a categorical # and a non-categorical key, doesn't try to expand the output to include diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f9b2a02920841..11fef6f271672 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -12,10 +12,10 @@ Index, MultiIndex, Series, + _testing as tm, concat, date_range, ) -import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.pivot import pivot_table @@ -108,6 +108,7 @@ def test_pivot_table(self, observed): expected = self.data.groupby(index + [columns])["D"].agg(np.mean).unstack() tm.assert_frame_equal(table, expected) + @pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_pivot_table_categorical_observed_equal(self, observed): # issue #24923 df = DataFrame( @@ -193,7 +194,9 @@ def test_pivot_table_categorical(self): ["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True ) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) - result = pd.pivot_table(df, values="values", index=["A", "B"], dropna=True) + result = pd.pivot_table( + df, values="values", index=["A", "B"], dropna=True, observed=False + ) exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) @@ -212,7 +215,9 @@ def test_pivot_table_dropna_categoricals(self, dropna): ) df["A"] = df["A"].astype(CDT(categories, ordered=False)) - result = df.pivot_table(index="B", columns="A", values="C", dropna=dropna) + result = df.pivot_table( + index="B", columns="A", 
values="C", dropna=dropna, observed=False + ) expected_columns = Series(["a", "b", "c"], name="A") expected_columns = expected_columns.astype(CDT(categories, ordered=False)) expected_index = Series([1, 2, 3], name="B") @@ -240,7 +245,7 @@ def test_pivot_with_non_observable_dropna(self, dropna): } ) - result = df.pivot_table(index="A", values="B", dropna=dropna) + result = df.pivot_table(index="A", values="B", dropna=dropna, observed=False) expected = DataFrame( {"B": [2, 3]}, index=Index( @@ -265,7 +270,7 @@ def test_pivot_with_non_observable_dropna(self, dropna): } ) - result = df.pivot_table(index="A", values="B", dropna=dropna) + result = df.pivot_table(index="A", values="B", dropna=dropna, observed=False) expected = DataFrame( {"B": [2, 3, 0]}, index=Index( @@ -281,7 +286,7 @@ def test_pivot_with_non_observable_dropna(self, dropna): def test_pivot_with_interval_index(self, interval_values, dropna): # GH 25814 df = DataFrame({"A": interval_values, "B": 1}) - result = df.pivot_table(index="A", values="B", dropna=dropna) + result = df.pivot_table(index="A", values="B", dropna=dropna, observed=False) expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A")) tm.assert_frame_equal(result, expected) @@ -299,7 +304,13 @@ def test_pivot_with_interval_index_margins(self): ) pivot_tab = pd.pivot_table( - df, index="C", columns="B", values="A", aggfunc="sum", margins=True + df, + index="C", + columns="B", + values="A", + aggfunc="sum", + margins=True, + observed=False, ) result = pivot_tab["All"] @@ -1752,6 +1763,7 @@ def test_margins_casted_to_float(self, observed): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_pivot_with_categorical(self, observed, ordered): # gh-21370 idx = [np.nan, "low", "high", "low", np.nan] @@ -1787,6 +1799,7 @@ def test_pivot_with_categorical(self, observed, ordered): tm.assert_frame_equal(result, expected) + 
@pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_categorical_aggfunc(self, observed): # GH 9534 df = DataFrame( @@ -1807,6 +1820,7 @@ def test_categorical_aggfunc(self, observed): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings("ignore:Grouping by a categorical:FutureWarning") def test_categorical_pivot_index_ordering(self, observed): # GH 8731 df = DataFrame( @@ -2058,6 +2072,13 @@ def agg(arr): with pytest.raises(KeyError, match="notpresent"): foo.pivot_table("notpresent", "X", "Y", aggfunc=agg) + def test_pivot_table_observed_deprecated_default(self): + # pr-35967 + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # make sure we actually have a category to warn on + self.data.A = self.data.A.astype("category") + self.data.pivot_table(values="D", index=["A", "B"], columns=["C"]) + class TestPivot: def test_pivot(self):