Skip to content

BUG: GroupBy.count() and GroupBy.sum() incorrectly return NaN instead of 0 for missing categories #35201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1085,6 +1085,8 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`)
- Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`)
- Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`)
- Bug in :meth:`DataFrameGroupBy.count` was returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`)
- Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` was returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`31422`)

Reshaping
^^^^^^^^^
Expand Down
41 changes: 28 additions & 13 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import numpy as np

from pandas._libs import lib
from pandas._typing import FrameOrSeries, FrameOrSeriesUnion
from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Scalar
from pandas.util._decorators import Appender, Substitution, doc

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -363,7 +363,9 @@ def _wrap_series_output(
return result

def _wrap_aggregated_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
self,
output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
fill_value: Scalar = np.NaN,
) -> Union[Series, DataFrame]:
"""
Wraps the output of a SeriesGroupBy aggregation into the expected result.
Expand All @@ -385,7 +387,7 @@ def _wrap_aggregated_output(
result = self._wrap_series_output(
output=output, index=self.grouper.result_index
)
return self._reindex_output(result)
return self._reindex_output(result, fill_value)

def _wrap_transformed_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
Expand Down Expand Up @@ -415,7 +417,11 @@ def _wrap_transformed_output(
return result

def _wrap_applied_output(
self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False
self,
keys: Index,
values: Optional[List[Any]],
not_indexed_same: bool = False,
fill_value: Scalar = np.NaN,
) -> FrameOrSeriesUnion:
"""
Wrap the output of SeriesGroupBy.apply into the expected result.
Expand Down Expand Up @@ -465,7 +471,7 @@ def _get_index() -> Index:
result = self.obj._constructor(
data=values, index=_get_index(), name=self._selection_name
)
return self._reindex_output(result)
return self._reindex_output(result, fill_value)

def _aggregate_named(self, func, *args, **kwargs):
result = {}
Expand Down Expand Up @@ -1029,7 +1035,10 @@ def _cython_agg_general(
agg_blocks, agg_items = self._cython_agg_blocks(
how, alt=alt, numeric_only=numeric_only, min_count=min_count
)
return self._wrap_agged_blocks(agg_blocks, items=agg_items)
fill_value = self._cython_func_fill_values.get(alt, np.NaN)
return self._wrap_agged_blocks(
agg_blocks, items=agg_items, fill_value=fill_value
)

def _cython_agg_blocks(
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
Expand Down Expand Up @@ -1219,7 +1228,9 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:

return self.obj._constructor(result, columns=result_columns)

def _wrap_applied_output(self, keys, values, not_indexed_same=False):
def _wrap_applied_output(
self, keys, values, not_indexed_same=False, fill_value=np.NaN
):
if len(keys) == 0:
return self.obj._constructor(index=keys)

Expand Down Expand Up @@ -1380,7 +1391,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if not self.as_index:
self._insert_inaxis_grouper_inplace(result)

return self._reindex_output(result)
return self._reindex_output(result, fill_value)

# values are not series or array-like but scalars
else:
Expand Down Expand Up @@ -1731,7 +1742,9 @@ def _insert_inaxis_grouper_inplace(self, result):
result.insert(0, name, lev)

def _wrap_aggregated_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
self,
output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
fill_value: Scalar = np.NaN,
) -> DataFrame:
"""
Wraps the output of DataFrameGroupBy aggregations into the expected result.
Expand Down Expand Up @@ -1762,7 +1775,7 @@ def _wrap_aggregated_output(
if self.axis == 1:
result = result.T

return self._reindex_output(result)
return self._reindex_output(result, fill_value)

def _wrap_transformed_output(
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
Expand All @@ -1788,7 +1801,9 @@ def _wrap_transformed_output(

return result

def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame:
def _wrap_agged_blocks(
self, blocks: "Sequence[Block]", items: Index, fill_value: Scalar = np.NaN
) -> DataFrame:
if not self.as_index:
index = np.arange(blocks[0].values.shape[-1])
mgr = BlockManager(blocks, axes=[items, index])
Expand All @@ -1804,7 +1819,7 @@ def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFra
if self.axis == 1:
result = result.T

return self._reindex_output(result)._convert(datetime=True)
return self._reindex_output(result, fill_value)._convert(datetime=True)

def _iterate_column_groupbys(self):
for i, colname in enumerate(self._selected_obj.columns):
Expand Down Expand Up @@ -1846,7 +1861,7 @@ def count(self):
)
blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)]

return self._wrap_agged_blocks(blocks, items=data.items)
return self._wrap_agged_blocks(blocks, items=data.items, fill_value=0)

def nunique(self, dropna: bool = True):
"""
Expand Down
20 changes: 16 additions & 4 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -888,8 +888,12 @@ def _python_apply_general(
"""
keys, values, mutated = self.grouper.apply(f, data, self.axis)

fill_value = self._cython_func_fill_values.get(f, np.NaN)
return self._wrap_applied_output(
keys, values, not_indexed_same=mutated or self.mutated
keys,
values,
not_indexed_same=mutated or self.mutated,
fill_value=fill_value,
)

def _iterate_slices(self) -> Iterable[Series]:
Expand Down Expand Up @@ -970,13 +974,17 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs):

return self._wrap_transformed_output(output)

def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]):
def _wrap_aggregated_output(
self, output: Mapping[base.OutputKey, np.ndarray], fill_value: Scalar = np.NaN
):
raise AbstractMethodError(self)

def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]):
raise AbstractMethodError(self)

def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False):
def _wrap_applied_output(
self, keys, values, not_indexed_same: bool = False, fill_value: Scalar = np.NaN
):
raise AbstractMethodError(self)

def _agg_general(
Expand Down Expand Up @@ -1010,6 +1018,8 @@ def _agg_general(
result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
return result

_cython_func_fill_values = {np.sum: 0}

def _cython_agg_general(
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
):
Expand Down Expand Up @@ -1045,7 +1055,9 @@ def _cython_agg_general(
if len(output) == 0:
raise DataError("No numeric types to aggregate")

return self._wrap_aggregated_output(output)
fill_value = self._cython_func_fill_values.get(alt, np.NaN)

return self._wrap_aggregated_output(output, fill_value)

def _python_agg_general(
self, func, *args, engine="cython", engine_kwargs=None, **kwargs
Expand Down
47 changes: 13 additions & 34 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import pandas._testing as tm


def cartesian_product_for_groupers(result, args, names):
def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN):
""" Reindex to a cartesian production for the groupers,
preserving the nature (Categorical) of each grouper
"""
Expand All @@ -33,7 +33,7 @@ def f(a):
return a

index = MultiIndex.from_product(map(f, args), names=names)
return result.reindex(index).sort_index()
return result.reindex(index, fill_value=fill_value).sort_index()


_results_for_groupbys_with_missing_categories = dict(
Expand Down Expand Up @@ -309,7 +309,7 @@ def test_observed(observed):
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(
expected, [cat1, cat2, ["foo", "bar"]], list("ABC")
expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
)

tm.assert_frame_equal(result, expected)
Expand All @@ -319,7 +319,9 @@ def test_observed(observed):
expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB"))
expected = cartesian_product_for_groupers(
expected, [cat1, cat2], list("AB"), fill_value=0
)

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -1188,9 +1190,10 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
names=["A", "B"],
).sortlevel()

expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
expected = Series(data=[2, 4, 0, 1, 0, 3], index=index, name="C")
grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
result = getattr(grouped, operation)(sum)

tm.assert_series_equal(result, expected)


Expand Down Expand Up @@ -1340,15 +1343,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
)
request.node.add_marker(mark)

if reduction_func == "sum": # GH 31422
mark = pytest.mark.xfail(
reason=(
"sum should return 0 but currently returns NaN. "
"This is a known bug. See GH 31422."
)
)
request.node.add_marker(mark)

df = pd.DataFrame(
{
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
Expand All @@ -1369,8 +1363,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
val = result.loc[idx]
assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)

# If we expect unobserved values to be zero, we also expect the dtype to be int
if zero_or_nan == 0:
# If we expect unobserved values to be zero, we also expect the dtype to be int.
# Except for .sum(). If the observed categories sum to dtype=float (i.e. their
# sums have decimals), then the zeros for the missing categories should also be
# floats.
if zero_or_nan == 0 and reduction_func != "sum":
assert np.issubdtype(result.dtype, np.integer)


Expand Down Expand Up @@ -1412,24 +1409,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
if reduction_func == "ngroup":
pytest.skip("ngroup does not return the Categories on the index")

if reduction_func == "count": # GH 35028
mark = pytest.mark.xfail(
reason=(
"DataFrameGroupBy.count returns np.NaN for missing "
"categories, when it should return 0. See GH 35028"
)
)
request.node.add_marker(mark)

if reduction_func == "sum": # GH 31422
mark = pytest.mark.xfail(
reason=(
"sum should return 0 but currently returns NaN. "
"This is a known bug. See GH 31422."
)
)
request.node.add_marker(mark)

df = pd.DataFrame(
{
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
Expand Down
13 changes: 7 additions & 6 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed):
["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"
)
expected_columns = pd.Index(["a", "b"], name="C2")
expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]])
expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64)
expected = pd.DataFrame(
expected_data, index=expected_index, columns=expected_columns
)
Expand Down Expand Up @@ -1851,18 +1851,19 @@ def test_categorical_pivot_index_ordering(self, observed):
values="Sales",
index="Month",
columns="Year",
dropna=observed,
observed=observed,
aggfunc="sum",
)
expected_columns = pd.Int64Index([2013, 2014], name="Year")
expected_index = pd.CategoricalIndex(
["January"], categories=months, ordered=False, name="Month"
months, categories=months, ordered=False, name="Month"
)
expected_data = [[320, 120]] + [[0, 0]] * 11
expected = pd.DataFrame(
[[320, 120]], index=expected_index, columns=expected_columns
expected_data, index=expected_index, columns=expected_columns
)
if not observed:
result = result.dropna().astype(np.int64)
if observed:
expected = expected.loc[["January"]]

tm.assert_frame_equal(result, expected)

Expand Down