diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 986ee371566cd..eabb02d885762 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1085,6 +1085,8 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) +- Bug in :meth:`DataFrameGroupBy.count` was returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) +- Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` was reutrning ``NaN`` for missing categories when grouped on multiple ``Categorials``. Now returning ``0`` (:issue:`31422`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ebb9d82766c1b..5093bbe8a1711 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -31,7 +31,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Scalar from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -363,7 +363,9 @@ def _wrap_series_output( return result def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + fill_value: Scalar = np.NaN, ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -385,7 +387,7 @@ def _wrap_aggregated_output( result = self._wrap_series_output( output=output, index=self.grouper.result_index ) - return self._reindex_output(result) + return self._reindex_output(result, fill_value) def _wrap_transformed_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] @@ -415,7 +417,11 @@ def _wrap_transformed_output( return result def _wrap_applied_output( - self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False + self, + keys: Index, + values: Optional[List[Any]], + not_indexed_same: bool = False, + fill_value: Scalar = np.NaN, ) -> FrameOrSeriesUnion: """ Wrap the output of SeriesGroupBy.apply into the expected result. @@ -465,7 +471,7 @@ def _get_index() -> Index: result = self.obj._constructor( data=values, index=_get_index(), name=self._selection_name ) - return self._reindex_output(result) + return self._reindex_output(result, fill_value) def _aggregate_named(self, func, *args, **kwargs): result = {} @@ -1029,7 +1035,10 @@ def _cython_agg_general( agg_blocks, agg_items = self._cython_agg_blocks( how, alt=alt, numeric_only=numeric_only, min_count=min_count ) - return self._wrap_agged_blocks(agg_blocks, items=agg_items) + fill_value = self._cython_func_fill_values.get(alt, np.NaN) + return self._wrap_agged_blocks( + agg_blocks, items=agg_items, fill_value=fill_value + ) def _cython_agg_blocks( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 @@ -1219,7 +1228,9 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: return self.obj._constructor(result, columns=result_columns) - def _wrap_applied_output(self, keys, values, not_indexed_same=False): + def _wrap_applied_output( + self, keys, values, not_indexed_same=False, fill_value=np.NaN + ): if len(keys) == 0: return self.obj._constructor(index=keys) @@ -1380,7 +1391,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if not self.as_index: self._insert_inaxis_grouper_inplace(result) - return self._reindex_output(result) + return self._reindex_output(result, fill_value) # values are not series or array-like but scalars else: @@ -1731,7 +1742,9 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + fill_value: Scalar = np.NaN, ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. @@ -1762,7 +1775,7 @@ def _wrap_aggregated_output( if self.axis == 1: result = result.T - return self._reindex_output(result) + return self._reindex_output(result, fill_value) def _wrap_transformed_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] @@ -1788,7 +1801,9 @@ def _wrap_transformed_output( return result - def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame: + def _wrap_agged_blocks( + self, blocks: "Sequence[Block]", items: Index, fill_value: Scalar = np.NaN + ) -> DataFrame: if not self.as_index: index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, axes=[items, index]) @@ -1804,7 +1819,7 @@ def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFra if self.axis == 1: result = result.T - return self._reindex_output(result)._convert(datetime=True) + return self._reindex_output(result, fill_value)._convert(datetime=True) def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): @@ -1846,7 +1861,7 @@ def count(self): ) blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] - return self._wrap_agged_blocks(blocks, items=data.items) + return self._wrap_agged_blocks(blocks, items=data.items, fill_value=0) def nunique(self, dropna: bool = True): """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d039b715b3c08..afee28a76afc3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -888,8 +888,12 @@ def _python_apply_general( """ keys, values, mutated = self.grouper.apply(f, data, self.axis) + fill_value = self._cython_func_fill_values.get(f, np.NaN) return self._wrap_applied_output( - keys, values, not_indexed_same=mutated or self.mutated + keys, + values, + not_indexed_same=mutated or self.mutated, + fill_value=fill_value, ) def _iterate_slices(self) -> Iterable[Series]: @@ -970,13 +974,17 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): return self._wrap_transformed_output(output) - def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, np.ndarray], fill_value: Scalar = np.NaN + ): raise AbstractMethodError(self) def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) - def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): + def _wrap_applied_output( + self, keys, values, not_indexed_same: bool = False, fill_value: Scalar = np.NaN + ): raise AbstractMethodError(self) def _agg_general( @@ -1010,6 +1018,8 @@ def _agg_general( result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) return result + _cython_func_fill_values = {np.sum: 0} + def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): @@ -1045,7 +1055,9 @@ def _cython_agg_general( if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output) + fill_value = self._cython_func_fill_values.get(alt, np.NaN) + + return self._wrap_aggregated_output(output, fill_value) def _python_agg_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 118d928ac02f4..e64c95063f2cc 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -19,7 +19,7 @@ import pandas._testing as tm -def cartesian_product_for_groupers(result, args, names): +def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN): """ Reindex to a cartesian production for the groupers, preserving the nature (Categorical) of each grouper """ @@ -33,7 +33,7 @@ def f(a): return a index = MultiIndex.from_product(map(f, args), names=names) - return result.reindex(index).sort_index() + return result.reindex(index, fill_value=fill_value).sort_index() _results_for_groupbys_with_missing_categories = dict( @@ -309,7 +309,7 @@ def test_observed(observed): result = gb.sum() if not observed: expected = cartesian_product_for_groupers( - expected, [cat1, cat2, ["foo", "bar"]], list("ABC") + expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0 ) tm.assert_frame_equal(result, expected) @@ -319,7 +319,9 @@ def test_observed(observed): expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) result = gb.sum() if not observed: - expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB")) + expected = cartesian_product_for_groupers( + expected, [cat1, cat2], list("AB"), fill_value=0 + ) tm.assert_frame_equal(result, expected) @@ -1188,9 +1190,10 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): names=["A", "B"], ).sortlevel() - expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") + expected = Series(data=[2, 4, 0, 1, 0, 3], index=index, name="C") grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] result = getattr(grouped, operation)(sum) + tm.assert_series_equal(result, expected) @@ -1340,15 +1343,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( ) request.node.add_marker(mark) - if reduction_func == "sum": # GH 31422 - mark = pytest.mark.xfail( - reason=( - "sum should return 0 but currently returns NaN. " - "This is a known bug. See GH 31422." - ) - ) - request.node.add_marker(mark) - df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), @@ -1369,8 +1363,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( val = result.loc[idx] assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) - # If we expect unobserved values to be zero, we also expect the dtype to be int - if zero_or_nan == 0: + # If we expect unobserved values to be zero, we also expect the dtype to be int. + # Except for .sum(). If the observed categories sum to dtype=float (i.e. their + # sums have decimals), then the zeros for the missing categories should also be + # floats. + if zero_or_nan == 0 and reduction_func != "sum": assert np.issubdtype(result.dtype, np.integer) @@ -1412,24 +1409,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") - if reduction_func == "count": # GH 35028 - mark = pytest.mark.xfail( - reason=( - "DataFrameGroupBy.count returns np.NaN for missing " - "categories, when it should return 0. See GH 35028" - ) - ) - request.node.add_marker(mark) - - if reduction_func == "sum": # GH 31422 - mark = pytest.mark.xfail( - reason=( - "sum should return 0 but currently returns NaN. " - "This is a known bug. See GH 31422." - ) - ) - request.node.add_marker(mark) - df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index c07a5673fe503..67b3151b0ff9c 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed): ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1" ) expected_columns = pd.Index(["a", "b"], name="C2") - expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]]) + expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64) expected = pd.DataFrame( expected_data, index=expected_index, columns=expected_columns ) @@ -1851,18 +1851,19 @@ def test_categorical_pivot_index_ordering(self, observed): values="Sales", index="Month", columns="Year", - dropna=observed, + observed=observed, aggfunc="sum", ) expected_columns = pd.Int64Index([2013, 2014], name="Year") expected_index = pd.CategoricalIndex( - ["January"], categories=months, ordered=False, name="Month" + months, categories=months, ordered=False, name="Month" ) + expected_data = [[320, 120]] + [[0, 0]] * 11 expected = pd.DataFrame( - [[320, 120]], index=expected_index, columns=expected_columns + expected_data, index=expected_index, columns=expected_columns ) - if not observed: - result = result.dropna().astype(np.int64) + if observed: + expected = expected.loc[["January"]] tm.assert_frame_equal(result, expected)