Skip to content

Commit a21ce87

Browse files
authored
BUG: GroupBy.count() and GroupBy.sum() incorrectly return NaN instead of 0 for missing categories (#35280)
1 parent 3b88446 commit a21ce87

File tree

5 files changed

+42
-44
lines changed

5 files changed

+42
-44
lines changed

doc/source/whatsnew/v1.2.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ Plotting
152152
Groupby/resample/rolling
153153
^^^^^^^^^^^^^^^^^^^^^^^^
154154

155+
- Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`)
155156
- Bug in :meth:`DataFrameGroupBy.apply` that would sometimes throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`)
156157
-
157158
-
@@ -160,7 +161,7 @@ Groupby/resample/rolling
160161
Reshaping
161162
^^^^^^^^^
162163

163-
-
164+
- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`)
164165
-
165166

166167
Sparse

pandas/core/groupby/generic.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1829,7 +1829,13 @@ def count(self):
18291829
)
18301830
blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)]
18311831

1832-
return self._wrap_agged_blocks(blocks, items=data.items)
1832+
# If we are grouping on categoricals we want unobserved categories to
1833+
# return zero, rather than the default of NaN which the reindexing in
1834+
# _wrap_agged_blocks() returns. GH 35028
1835+
with com.temp_setattr(self, "observed", True):
1836+
result = self._wrap_agged_blocks(blocks, items=data.items)
1837+
1838+
return self._reindex_output(result, fill_value=0)
18331839

18341840
def nunique(self, dropna: bool = True):
18351841
"""

pandas/core/groupby/groupby.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -1536,9 +1536,19 @@ def size(self) -> FrameOrSeriesUnion:
15361536

15371537
@doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
15381538
def sum(self, numeric_only: bool = True, min_count: int = 0):
1539-
return self._agg_general(
1540-
numeric_only=numeric_only, min_count=min_count, alias="add", npfunc=np.sum
1541-
)
1539+
1540+
# If we are grouping on categoricals we want unobserved categories to
1541+
# return zero, rather than the default of NaN which the reindexing in
1542+
# _agg_general() returns. GH 31422
1543+
with com.temp_setattr(self, "observed", True):
1544+
result = self._agg_general(
1545+
numeric_only=numeric_only,
1546+
min_count=min_count,
1547+
alias="add",
1548+
npfunc=np.sum,
1549+
)
1550+
1551+
return self._reindex_output(result, fill_value=0)
15421552

15431553
@doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
15441554
def prod(self, numeric_only: bool = True, min_count: int = 0):

pandas/tests/groupby/test_categorical.py

+13-33
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import pandas._testing as tm
2020

2121

22-
def cartesian_product_for_groupers(result, args, names):
22+
def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN):
2323
""" Reindex to a cartesian product for the groupers,
2424
preserving the nature (Categorical) of each grouper
2525
"""
@@ -33,7 +33,7 @@ def f(a):
3333
return a
3434

3535
index = MultiIndex.from_product(map(f, args), names=names)
36-
return result.reindex(index).sort_index()
36+
return result.reindex(index, fill_value=fill_value).sort_index()
3737

3838

3939
_results_for_groupbys_with_missing_categories = dict(
@@ -309,7 +309,7 @@ def test_observed(observed):
309309
result = gb.sum()
310310
if not observed:
311311
expected = cartesian_product_for_groupers(
312-
expected, [cat1, cat2, ["foo", "bar"]], list("ABC")
312+
expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
313313
)
314314

315315
tm.assert_frame_equal(result, expected)
@@ -319,7 +319,9 @@ def test_observed(observed):
319319
expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
320320
result = gb.sum()
321321
if not observed:
322-
expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB"))
322+
expected = cartesian_product_for_groupers(
323+
expected, [cat1, cat2], list("AB"), fill_value=0
324+
)
323325

324326
tm.assert_frame_equal(result, expected)
325327

@@ -1189,6 +1191,8 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
11891191
).sortlevel()
11901192

11911193
expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
1194+
if operation == "agg":
1195+
expected = expected.fillna(0, downcast="infer")
11921196
grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
11931197
result = getattr(grouped, operation)(sum)
11941198
tm.assert_series_equal(result, expected)
@@ -1338,15 +1342,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
13381342
)
13391343
request.node.add_marker(mark)
13401344

1341-
if reduction_func == "sum": # GH 31422
1342-
mark = pytest.mark.xfail(
1343-
reason=(
1344-
"sum should return 0 but currently returns NaN. "
1345-
"This is a known bug. See GH 31422."
1346-
)
1347-
)
1348-
request.node.add_marker(mark)
1349-
13501345
df = pd.DataFrame(
13511346
{
13521347
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
@@ -1367,8 +1362,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
13671362
val = result.loc[idx]
13681363
assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)
13691364

1370-
# If we expect unobserved values to be zero, we also expect the dtype to be int
1371-
if zero_or_nan == 0:
1365+
# If we expect unobserved values to be zero, we also expect the dtype to be int.
1366+
# Except for .sum(). If the observed categories sum to dtype=float (i.e. their
1367+
# sums have decimals), then the zeros for the missing categories should also be
1368+
# floats.
1369+
if zero_or_nan == 0 and reduction_func != "sum":
13721370
assert np.issubdtype(result.dtype, np.integer)
13731371

13741372

@@ -1410,24 +1408,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
14101408
if reduction_func == "ngroup":
14111409
pytest.skip("ngroup does not return the Categories on the index")
14121410

1413-
if reduction_func == "count": # GH 35028
1414-
mark = pytest.mark.xfail(
1415-
reason=(
1416-
"DataFrameGroupBy.count returns np.NaN for missing "
1417-
"categories, when it should return 0. See GH 35028"
1418-
)
1419-
)
1420-
request.node.add_marker(mark)
1421-
1422-
if reduction_func == "sum": # GH 31422
1423-
mark = pytest.mark.xfail(
1424-
reason=(
1425-
"sum should return 0 but currently returns NaN. "
1426-
"This is a known bug. See GH 31422."
1427-
)
1428-
)
1429-
request.node.add_marker(mark)
1430-
14311411
df = pd.DataFrame(
14321412
{
14331413
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),

pandas/tests/reshape/test_pivot.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed):
18171817
["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"
18181818
)
18191819
expected_columns = pd.Index(["a", "b"], name="C2")
1820-
expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]])
1820+
expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64)
18211821
expected = pd.DataFrame(
18221822
expected_data, index=expected_index, columns=expected_columns
18231823
)
@@ -1851,18 +1851,19 @@ def test_categorical_pivot_index_ordering(self, observed):
18511851
values="Sales",
18521852
index="Month",
18531853
columns="Year",
1854-
dropna=observed,
1854+
observed=observed,
18551855
aggfunc="sum",
18561856
)
18571857
expected_columns = pd.Int64Index([2013, 2014], name="Year")
18581858
expected_index = pd.CategoricalIndex(
1859-
["January"], categories=months, ordered=False, name="Month"
1859+
months, categories=months, ordered=False, name="Month"
18601860
)
1861+
expected_data = [[320, 120]] + [[0, 0]] * 11
18611862
expected = pd.DataFrame(
1862-
[[320, 120]], index=expected_index, columns=expected_columns
1863+
expected_data, index=expected_index, columns=expected_columns
18631864
)
1864-
if not observed:
1865-
result = result.dropna().astype(np.int64)
1865+
if observed:
1866+
expected = expected.loc[["January"]]
18661867

18671868
tm.assert_frame_equal(result, expected)
18681869

0 commit comments

Comments
 (0)