diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cb68bd0e762c4..75d47938f983a 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -183,6 +183,47 @@ New repr for :class:`pandas.core.arrays.IntervalArray` pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) + +All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`) + +- :meth:`SeriesGroupBy.count` +- :meth:`SeriesGroupBy.size` +- :meth:`SeriesGroupBy.nunique` +- :meth:`SeriesGroupBy.nth` + +.. ipython:: python + + df = pd.DataFrame({ + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), + "value": [0.1] * 4, + }) + df + + +*pandas 0.25.x* + +.. code-block:: ipython + + In [2]: df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + Out[2]: + cat_1 cat_2 + A A 1 + B 1 + B A 1 + B 1 + Name: value, dtype: int64 + + +*pandas 1.0.0* + +.. ipython:: python + + df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + + .. _whatsnew_1000.api.other: Other API changes diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6376dbefcf435..f5ec763db390f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -569,7 +569,8 @@ def nunique(self, dropna: bool = True) -> Series: res, out = np.zeros(len(ri), dtype=out.dtype), res res[ids[idx]] = out - return Series(res, index=ri, name=self._selection_name) + result = Series(res, index=ri, name=self._selection_name) + return self._reindex_output(result, fill_value=0) @Appender(Series.describe.__doc__) def describe(self, **kwargs): @@ -721,12 +722,13 @@ def count(self) -> Series: minlength = ngroups or 0 out = np.bincount(ids[mask], minlength=minlength) - return Series( + result = Series( out, index=self.grouper.result_index, name=self._selection_name, dtype="int64", ) + return self._reindex_output(result, fill_value=0) def _apply_to_column_groupbys(self, func): """ return a pass thru """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 236df4b3854a4..62bbb151f793e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -39,6 +39,7 @@ class providing the base-class of operations. 
) from pandas.core.dtypes.missing import isna, notna +from pandas._typing import FrameOrSeries, Scalar from pandas.core import nanops import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, try_cast_to_ea @@ -1296,7 +1297,7 @@ def size(self): if isinstance(self.obj, Series): result.name = self.obj.name - return result + return self._reindex_output(result, fill_value=0) @classmethod def _add_numeric_operations(cls): @@ -1743,6 +1744,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra if not self.observed and isinstance(result_index, CategoricalIndex): out = out.reindex(result_index) + out = self._reindex_output(out) return out.sort_index() if self.sort else out # dropna is truthy @@ -2383,7 +2385,9 @@ def tail(self, n=5): mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] - def _reindex_output(self, output): + def _reindex_output( + self, output: FrameOrSeries, fill_value: Scalar = np.NaN + ) -> FrameOrSeries: """ If we have categorical groupers, then we might want to make sure that we have a fully re-indexed output to the levels. This means expanding @@ -2397,8 +2401,10 @@ def _reindex_output(self, output): Parameters ---------- - output: Series or DataFrame + output : Series or DataFrame Object resulting from grouping and applying an operation. + fill_value : scalar, default np.NaN + Value to use for unobserved categories if self.observed is False. Returns ------- @@ -2429,7 +2435,11 @@ def _reindex_output(self, output): ).sortlevel() if self.as_index: - d = {self.obj._get_axis_name(self.axis): index, "copy": False} + d = { + self.obj._get_axis_name(self.axis): index, + "copy": False, + "fill_value": fill_value, + } return output.reindex(**d) # GH 13204 @@ -2451,7 +2461,9 @@ def _reindex_output(self, output): output = output.drop(labels=list(g_names), axis=1) # Set a temp index and reindex (possibly expanding) - output = output.set_index(self.grouper.result_index).reindex(index, copy=False) + output = output.set_index(self.grouper.result_index).reindex( + index, copy=False, fill_value=fill_value + ) # Reset in-axis grouper columns # (using level numbers `g_nums` because level names may not be unique) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 663e03aa1bc81..5f78e4860f1e9 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1252,3 +1252,82 @@ def test_get_nonexistent_category(): {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} ) ) + + +def test_series_groupby_on_2_categoricals_unobserved( + reduction_func: str, observed: bool +): + # GH 17605 + + if reduction_func == "ngroup": + pytest.skip("ngroup is not truly a reduction") + + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")), + "value": [0.1] * 4, + } + ) + args = {"nth": [0]}.get(reduction_func, []) + + expected_length = 4 if observed else 16 + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"] + agg = getattr(series_groupby, reduction_func) + result = agg(*args) + + assert len(result) == expected_length + + +@pytest.mark.parametrize( + "func, zero_or_nan", + [ + ("all", np.NaN), + ("any", np.NaN), + ("count", 0), + ("first", np.NaN), + ("idxmax", np.NaN), + ("idxmin", np.NaN), + ("last", np.NaN), + ("mad", np.NaN), + ("max", np.NaN), + ("mean", np.NaN), + ("median", np.NaN), + ("min", 
np.NaN), + ("nth", np.NaN), + ("nunique", 0), + ("prod", np.NaN), + ("quantile", np.NaN), + ("sem", np.NaN), + ("size", 0), + ("skew", np.NaN), + ("std", np.NaN), + ("sum", np.NaN), + ("var", np.NaN), + ], +) +def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan): + # GH 17605 + # Tests whether the unobserved categories in the result contain 0 or NaN + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), + "value": [0.1] * 4, + } + ) + unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")] + args = {"nth": [0]}.get(func, []) + + series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] + agg = getattr(series_groupby, func) + result = agg(*args) + + for idx in unobserved: + val = result.loc[idx] + assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) + + # If we expect unobserved values to be zero, we also expect the dtype to be int + if zero_or_nan == 0: + assert np.issubdtype(result.dtype, np.integer)
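
Illustrative example (not part of the patch): the snippet below is a minimal sketch of the user-facing behaviour this change introduces, mirroring the whatsnew entry and the assertions in the new tests in ``pandas/tests/groupby/test_categorical.py``. It assumes a pandas build with this patch applied. With ``observed=False``, the count-like reductions (``count``, ``size``, ``nunique``, ``nth``) are now routed through ``_reindex_output(..., fill_value=0)`` and so fill unobserved category combinations with 0, while value-like reductions such as ``mean`` keep ``NaN`` for those rows::

    import pandas as pd

    df = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
            "value": [0.1] * 4,
        }
    )
    grouped = df.groupby(["cat_1", "cat_2"], observed=False)["value"]

    # With observed=False, all 3 x 3 category combinations appear in the
    # result index, so each reduction below has length 9 instead of 4.
    assert len(grouped.count()) == 9

    # Category combinations that never occur in the data.
    unobserved = [("A", "C"), ("B", "C"), ("C", "A"), ("C", "B"), ("C", "C")]

    for idx in unobserved:
        assert grouped.count().loc[idx] == 0      # previously missing from the result
        assert grouped.size().loc[idx] == 0
        assert grouped.nunique().loc[idx] == 0
        assert pd.isna(grouped.mean().loc[idx])   # value-like reductions stay NaN

Using 0 rather than ``NaN`` as the fill value for these reductions also keeps the integer result dtype intact, which ``test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans`` verifies via ``np.issubdtype(result.dtype, np.integer)``.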