From b65467a75a63dc8882c610c4a7f213efaf67d24e Mon Sep 17 00:00:00 2001 From: smithto1 Date: Wed, 8 Jul 2020 23:05:30 +0100 Subject: [PATCH 1/7] #35028 DFGroupBy.count() now returns zero for missing categories when groupby by multiple categories --- pandas/core/groupby/generic.py | 10 ++++++---- pandas/tests/groupby/test_categorical.py | 9 --------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ebb9d82766c1b..fc4b91ab2163e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -31,7 +31,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Scalar from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -1788,7 +1788,9 @@ def _wrap_transformed_output( return result - def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame: + def _wrap_agged_blocks( + self, blocks: "Sequence[Block]", items: Index, fill_value: Scalar = np.NaN + ) -> DataFrame: if not self.as_index: index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, axes=[items, index]) @@ -1804,7 +1806,7 @@ def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFra if self.axis == 1: result = result.T - return self._reindex_output(result)._convert(datetime=True) + return self._reindex_output(result, fill_value)._convert(datetime=True) def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): @@ -1846,7 +1848,7 @@ def count(self): ) blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] - return self._wrap_agged_blocks(blocks, items=data.items) + return self._wrap_agged_blocks(blocks, items=data.items, fill_value=0) def nunique(self, dropna: bool = True): """ diff --git a/pandas/tests/groupby/test_categorical.py 
b/pandas/tests/groupby/test_categorical.py index 118d928ac02f4..4ee77029e74ce 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1412,15 +1412,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") - if reduction_func == "count": # GH 35028 - mark = pytest.mark.xfail( - reason=( - "DataFrameGroupBy.count returns np.NaN for missing " - "categories, when it should return 0. See GH 35028" - ) - ) - request.node.add_marker(mark) - if reduction_func == "sum": # GH 31422 mark = pytest.mark.xfail( reason=( From 4de1d6ba23c3bae668e39535dd6d6fd893a3ac50 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Thu, 9 Jul 2020 00:19:24 +0100 Subject: [PATCH 2/7] #31422 GroupBy.sum() returns 0 for missing categories when grouping by multiple Categoricals. Updates to tests to reflect this expected output --- pandas/core/groupby/generic.py | 25 +++++++++++----- pandas/core/groupby/groupby.py | 12 ++++++-- pandas/tests/groupby/test_categorical.py | 38 ++++++++---------------- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index fc4b91ab2163e..d249dfa0140d9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -363,7 +363,9 @@ def _wrap_series_output( return result def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + fill_value: Scalar = np.NaN, ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. 
@@ -385,7 +387,7 @@ def _wrap_aggregated_output( result = self._wrap_series_output( output=output, index=self.grouper.result_index ) - return self._reindex_output(result) + return self._reindex_output(result, fill_value) def _wrap_transformed_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] @@ -415,7 +417,11 @@ def _wrap_transformed_output( return result def _wrap_applied_output( - self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False + self, + keys: Index, + values: Optional[List[Any]], + not_indexed_same: bool = False, + fill_value: Scalar = np.NaN, ) -> FrameOrSeriesUnion: """ Wrap the output of SeriesGroupBy.apply into the expected result. @@ -465,7 +471,7 @@ def _get_index() -> Index: result = self.obj._constructor( data=values, index=_get_index(), name=self._selection_name ) - return self._reindex_output(result) + return self._reindex_output(result, fill_value) def _aggregate_named(self, func, *args, **kwargs): result = {} @@ -1029,7 +1035,10 @@ def _cython_agg_general( agg_blocks, agg_items = self._cython_agg_blocks( how, alt=alt, numeric_only=numeric_only, min_count=min_count ) - return self._wrap_agged_blocks(agg_blocks, items=agg_items) + fill_value = self._cython_func_fill_values.get(alt, np.NaN) + return self._wrap_agged_blocks( + agg_blocks, items=agg_items, fill_value=fill_value + ) def _cython_agg_blocks( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 @@ -1219,7 +1228,9 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: return self.obj._constructor(result, columns=result_columns) - def _wrap_applied_output(self, keys, values, not_indexed_same=False): + def _wrap_applied_output( + self, keys, values, not_indexed_same=False, fill_value: Scalar = np.NaN + ): if len(keys) == 0: return self.obj._constructor(index=keys) @@ -1380,7 +1391,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if not self.as_index: 
self._insert_inaxis_grouper_inplace(result) - return self._reindex_output(result) + return self._reindex_output(result, fill_value) # values are not series or array-like but scalars else: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d039b715b3c08..412446d28535f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -888,8 +888,12 @@ def _python_apply_general( """ keys, values, mutated = self.grouper.apply(f, data, self.axis) + fill_value = self._cython_func_fill_values.get(f, np.NaN) return self._wrap_applied_output( - keys, values, not_indexed_same=mutated or self.mutated + keys, + values, + not_indexed_same=mutated or self.mutated, + fill_value=fill_value, ) def _iterate_slices(self) -> Iterable[Series]: @@ -1010,6 +1014,8 @@ def _agg_general( result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) return result + _cython_func_fill_values = {np.sum: 0} + def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): @@ -1045,7 +1051,9 @@ def _cython_agg_general( if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output) + fill_value = self._cython_func_fill_values.get(alt, np.NaN) + + return self._wrap_aggregated_output(output, fill_value) def _python_agg_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 4ee77029e74ce..e64c95063f2cc 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -19,7 +19,7 @@ import pandas._testing as tm -def cartesian_product_for_groupers(result, args, names): +def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN): """ Reindex to a cartesian production for the groupers, preserving the nature (Categorical) of each grouper """ @@ -33,7 +33,7 @@ def f(a): return a index = 
MultiIndex.from_product(map(f, args), names=names) - return result.reindex(index).sort_index() + return result.reindex(index, fill_value=fill_value).sort_index() _results_for_groupbys_with_missing_categories = dict( @@ -309,7 +309,7 @@ def test_observed(observed): result = gb.sum() if not observed: expected = cartesian_product_for_groupers( - expected, [cat1, cat2, ["foo", "bar"]], list("ABC") + expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0 ) tm.assert_frame_equal(result, expected) @@ -319,7 +319,9 @@ def test_observed(observed): expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) result = gb.sum() if not observed: - expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB")) + expected = cartesian_product_for_groupers( + expected, [cat1, cat2], list("AB"), fill_value=0 + ) tm.assert_frame_equal(result, expected) @@ -1188,9 +1190,10 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): names=["A", "B"], ).sortlevel() - expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") + expected = Series(data=[2, 4, 0, 1, 0, 3], index=index, name="C") grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] result = getattr(grouped, operation)(sum) + tm.assert_series_equal(result, expected) @@ -1340,15 +1343,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( ) request.node.add_marker(mark) - if reduction_func == "sum": # GH 31422 - mark = pytest.mark.xfail( - reason=( - "sum should return 0 but currently returns NaN. " - "This is a known bug. See GH 31422." 
- ) - ) - request.node.add_marker(mark) - df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), @@ -1369,8 +1363,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( val = result.loc[idx] assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) - # If we expect unobserved values to be zero, we also expect the dtype to be int - if zero_or_nan == 0: + # If we expect unobserved values to be zero, we also expect the dtype to be int. + # Except for .sum(). If the observed categories sum to dtype=float (i.e. their + # sums have decimals), then the zeros for the missing categories should also be + # floats. + if zero_or_nan == 0 and reduction_func != "sum": assert np.issubdtype(result.dtype, np.integer) @@ -1412,15 +1409,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") - if reduction_func == "sum": # GH 31422 - mark = pytest.mark.xfail( - reason=( - "sum should return 0 but currently returns NaN. " - "This is a known bug. See GH 31422." 
- ) - ) - request.node.add_marker(mark) - df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), From 78ae5c7bb32f78c63200ab62d7c852790803af18 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 10 Jul 2020 00:10:00 +0100 Subject: [PATCH 3/7] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 986ee371566cd..eabb02d885762 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1085,6 +1085,8 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) +- Bug in :meth:`DataFrameGroupBy.count` was returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) +- Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` was returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``.
Now returning ``0`` (:issue:`31422`) Reshaping ^^^^^^^^^ From 1ec3389b199b2dd7009c1177442f3b13acbdbff2 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 10 Jul 2020 07:28:00 +0100 Subject: [PATCH 4/7] addressing Type Validation check errors --- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d249dfa0140d9..b651113c10533 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1229,7 +1229,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: return self.obj._constructor(result, columns=result_columns) def _wrap_applied_output( - self, keys, values, not_indexed_same=False, fill_value: Scalar = np.NaN + self, keys, values, not_indexed_same=False, fill_value=np.NaN ): if len(keys) == 0: return self.obj._constructor(index=keys) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 412446d28535f..afee28a76afc3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -974,13 +974,17 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): return self._wrap_transformed_output(output) - def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, np.ndarray], fill_value: Scalar = np.NaN + ): raise AbstractMethodError(self) def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) - def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): + def _wrap_applied_output( + self, keys, values, not_indexed_same: bool = False, fill_value: Scalar = np.NaN + ): raise AbstractMethodError(self) def _agg_general( From 1efb6e1710a814ed9ecc70a0380d49b5de3d91c9 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 10 Jul 2020 08:01:09 +0100 
Subject: [PATCH 5/7] addressing second Type Validation error --- pandas/core/groupby/generic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b651113c10533..5093bbe8a1711 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1742,7 +1742,9 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + fill_value: Scalar = np.NaN, ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. @@ -1773,7 +1775,7 @@ def _wrap_aggregated_output( if self.axis == 1: result = result.T - return self._reindex_output(result) + return self._reindex_output(result, fill_value) def _wrap_transformed_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] From b53439456bf67e3341f4e1cf6c68ddc626a4f83f Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 10 Jul 2020 08:47:20 +0100 Subject: [PATCH 6/7] test_pivot updates to reflect that .count() and .sum() now return 0 instead of NaN --- pandas/tests/reshape/test_pivot.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index c07a5673fe503..e91cffc55ad03 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed): ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1" ) expected_columns = pd.Index(["a", "b"], name="C2") - expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]]) + expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64) expected = pd.DataFrame( expected_data, index=expected_index, columns=expected_columns ) @@ -1856,13 +1856,12 @@ 
def test_categorical_pivot_index_ordering(self, observed): ) expected_columns = pd.Int64Index([2013, 2014], name="Year") expected_index = pd.CategoricalIndex( - ["January"], categories=months, ordered=False, name="Month" + months, categories=months, ordered=False, name="Month" ) + expected_data = [[320, 120]] + [[0, 0]] * 11 expected = pd.DataFrame( - [[320, 120]], index=expected_index, columns=expected_columns + expected_data, index=expected_index, columns=expected_columns ) - if not observed: - result = result.dropna().astype(np.int64) tm.assert_frame_equal(result, expected) From 1e2fec89d61e606e00bf5d1208032d022364f982 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 10 Jul 2020 10:10:32 +0100 Subject: [PATCH 7/7] fixed pivot test to use observed so the output is different depending whether observed True/False --- pandas/tests/reshape/test_pivot.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index e91cffc55ad03..67b3151b0ff9c 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1851,7 +1851,7 @@ def test_categorical_pivot_index_ordering(self, observed): values="Sales", index="Month", columns="Year", - dropna=observed, + observed=observed, aggfunc="sum", ) expected_columns = pd.Int64Index([2013, 2014], name="Year") @@ -1862,6 +1862,8 @@ def test_categorical_pivot_index_ordering(self, observed): expected = pd.DataFrame( expected_data, index=expected_index, columns=expected_columns ) + if observed: + expected = expected.loc[["January"]] tm.assert_frame_equal(result, expected)