Skip to content

Commit 0939732

Browse files
Oliver Hofkens authored and jacobaustin123 committed
BUG: Series groupby does not include nan counts for all categorical labels (pandas-dev#17605) (pandas-dev#29690)
1 parent 7d1946e commit 0939732

File tree

4 files changed

+141
-7
lines changed

4 files changed

+141
-7
lines changed

doc/source/whatsnew/v1.0.0.rst

+41
Original file line number | Diff line number | Diff line change
@@ -196,6 +196,47 @@ New repr for :class:`pandas.core.arrays.IntervalArray`
196196
197197
pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)])
198198
199+
200+
All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword
201+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
202+
The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`):
203+
204+
- :meth:`SeriesGroupBy.count`
205+
- :meth:`SeriesGroupBy.size`
206+
- :meth:`SeriesGroupBy.nunique`
207+
- :meth:`SeriesGroupBy.nth`
208+
209+
.. ipython:: python
210+
211+
df = pd.DataFrame({
212+
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
213+
"cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
214+
"value": [0.1] * 4,
215+
})
216+
df
217+
218+
219+
*pandas 0.25.x*
220+
221+
.. code-block:: ipython
222+
223+
In [2]: df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()
224+
Out[2]:
225+
cat_1 cat_2
226+
A A 1
227+
B 1
228+
B A 1
229+
B 1
230+
Name: value, dtype: int64
231+
232+
233+
*pandas 1.0.0*
234+
235+
.. ipython:: python
236+
237+
df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()
238+
239+
199240
.. _whatsnew_1000.api.other:
200241

201242
Other API changes

pandas/core/groupby/generic.py

+4 -2
Original file line number | Diff line number | Diff line change
@@ -557,7 +557,8 @@ def nunique(self, dropna: bool = True) -> Series:
557557
res, out = np.zeros(len(ri), dtype=out.dtype), res
558558
res[ids[idx]] = out
559559

560-
return Series(res, index=ri, name=self._selection_name)
560+
result = Series(res, index=ri, name=self._selection_name)
561+
return self._reindex_output(result, fill_value=0)
561562

562563
@Appender(Series.describe.__doc__)
563564
def describe(self, **kwargs):
@@ -709,12 +710,13 @@ def count(self) -> Series:
709710
minlength = ngroups or 0
710711
out = np.bincount(ids[mask], minlength=minlength)
711712

712-
return Series(
713+
result = Series(
713714
out,
714715
index=self.grouper.result_index,
715716
name=self._selection_name,
716717
dtype="int64",
717718
)
719+
return self._reindex_output(result, fill_value=0)
718720

719721
def _apply_to_column_groupbys(self, func):
720722
""" return a pass thru """

pandas/core/groupby/groupby.py

+17 -5
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ class providing the base-class of operations.
3939
)
4040
from pandas.core.dtypes.missing import isna, notna
4141

42+
from pandas._typing import FrameOrSeries, Scalar
4243
from pandas.core import nanops
4344
import pandas.core.algorithms as algorithms
4445
from pandas.core.arrays import Categorical, try_cast_to_ea
@@ -1296,7 +1297,7 @@ def size(self):
12961297

12971298
if isinstance(self.obj, Series):
12981299
result.name = self.obj.name
1299-
return result
1300+
return self._reindex_output(result, fill_value=0)
13001301

13011302
@classmethod
13021303
def _add_numeric_operations(cls):
@@ -1740,6 +1741,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra
17401741
if not self.observed and isinstance(result_index, CategoricalIndex):
17411742
out = out.reindex(result_index)
17421743

1744+
out = self._reindex_output(out)
17431745
return out.sort_index() if self.sort else out
17441746

17451747
# dropna is truthy
@@ -2380,7 +2382,9 @@ def tail(self, n=5):
23802382
mask = self._cumcount_array(ascending=False) < n
23812383
return self._selected_obj[mask]
23822384

2383-
def _reindex_output(self, output):
2385+
def _reindex_output(
2386+
self, output: FrameOrSeries, fill_value: Scalar = np.NaN
2387+
) -> FrameOrSeries:
23842388
"""
23852389
If we have categorical groupers, then we might want to make sure that
23862390
we have a fully re-indexed output to the levels. This means expanding
@@ -2394,8 +2398,10 @@ def _reindex_output(self, output):
23942398
23952399
Parameters
23962400
----------
2397-
output: Series or DataFrame
2401+
output : Series or DataFrame
23982402
Object resulting from grouping and applying an operation.
2403+
fill_value : scalar, default np.NaN
2404+
Value to use for unobserved categories if self.observed is False.
23992405
24002406
Returns
24012407
-------
@@ -2426,7 +2432,11 @@ def _reindex_output(self, output):
24262432
).sortlevel()
24272433

24282434
if self.as_index:
2429-
d = {self.obj._get_axis_name(self.axis): index, "copy": False}
2435+
d = {
2436+
self.obj._get_axis_name(self.axis): index,
2437+
"copy": False,
2438+
"fill_value": fill_value,
2439+
}
24302440
return output.reindex(**d)
24312441

24322442
# GH 13204
@@ -2448,7 +2458,9 @@ def _reindex_output(self, output):
24482458
output = output.drop(labels=list(g_names), axis=1)
24492459

24502460
# Set a temp index and reindex (possibly expanding)
2451-
output = output.set_index(self.grouper.result_index).reindex(index, copy=False)
2461+
output = output.set_index(self.grouper.result_index).reindex(
2462+
index, copy=False, fill_value=fill_value
2463+
)
24522464

24532465
# Reset in-axis grouper columns
24542466
# (using level numbers `g_nums` because level names may not be unique)

pandas/tests/groupby/test_categorical.py

+79
Original file line number | Diff line number | Diff line change
@@ -1252,3 +1252,82 @@ def test_get_nonexistent_category():
12521252
{"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}
12531253
)
12541254
)
1255+
1256+
1257+
def test_series_groupby_on_2_categoricals_unobserved(
1258+
reduction_func: str, observed: bool
1259+
):
1260+
# GH 17605
1261+
1262+
if reduction_func == "ngroup":
1263+
pytest.skip("ngroup is not truly a reduction")
1264+
1265+
df = pd.DataFrame(
1266+
{
1267+
"cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")),
1268+
"cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")),
1269+
"value": [0.1] * 4,
1270+
}
1271+
)
1272+
args = {"nth": [0]}.get(reduction_func, [])
1273+
1274+
expected_length = 4 if observed else 16
1275+
1276+
series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"]
1277+
agg = getattr(series_groupby, reduction_func)
1278+
result = agg(*args)
1279+
1280+
assert len(result) == expected_length
1281+
1282+
1283+
@pytest.mark.parametrize(
1284+
"func, zero_or_nan",
1285+
[
1286+
("all", np.NaN),
1287+
("any", np.NaN),
1288+
("count", 0),
1289+
("first", np.NaN),
1290+
("idxmax", np.NaN),
1291+
("idxmin", np.NaN),
1292+
("last", np.NaN),
1293+
("mad", np.NaN),
1294+
("max", np.NaN),
1295+
("mean", np.NaN),
1296+
("median", np.NaN),
1297+
("min", np.NaN),
1298+
("nth", np.NaN),
1299+
("nunique", 0),
1300+
("prod", np.NaN),
1301+
("quantile", np.NaN),
1302+
("sem", np.NaN),
1303+
("size", 0),
1304+
("skew", np.NaN),
1305+
("std", np.NaN),
1306+
("sum", np.NaN),
1307+
("var", np.NaN),
1308+
],
1309+
)
1310+
def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan):
1311+
# GH 17605
1312+
# Tests whether the unobserved categories in the result contain 0 or NaN
1313+
df = pd.DataFrame(
1314+
{
1315+
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
1316+
"cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
1317+
"value": [0.1] * 4,
1318+
}
1319+
)
1320+
unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")]
1321+
args = {"nth": [0]}.get(func, [])
1322+
1323+
series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
1324+
agg = getattr(series_groupby, func)
1325+
result = agg(*args)
1326+
1327+
for idx in unobserved:
1328+
val = result.loc[idx]
1329+
assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)
1330+
1331+
# If we expect unobserved values to be zero, we also expect the dtype to be int
1332+
if zero_or_nan == 0:
1333+
assert np.issubdtype(result.dtype, np.integer)

0 commit comments

Comments
 (0)