Skip to content

BUG: Series groupby does not include nan counts for all categorical labels (#17605) #29690

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Nov 20, 2019
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupBy.agg` not able to use lambda function with named aggregation (:issue:`27519`)
- Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`)
- Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`)
- Bug in :meth:`SeriesGroupBy.count`, :meth:`SeriesGroupBy.size`, :meth:`SeriesGroupBy.nunique` and :meth:`SeriesGroupBy.nth` missing unobserved categories when ``observed=False`` (:issue:`17605`)

Reshaping
^^^^^^^^^
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,8 @@ def nunique(self, dropna: bool = True) -> Series:
res, out = np.zeros(len(ri), dtype=out.dtype), res
res[ids[idx]] = out

return Series(res, index=ri, name=self._selection_name)
result = Series(res, index=ri, name=self._selection_name)
return self._reindex_output(result, fill_value=0)

@Appender(Series.describe.__doc__)
def describe(self, **kwargs):
Expand Down Expand Up @@ -721,12 +722,13 @@ def count(self) -> Series:
minlength = ngroups or 0
out = np.bincount(ids[mask], minlength=minlength)

return Series(
result = Series(
out,
index=self.grouper.result_index,
name=self._selection_name,
dtype="int64",
)
return self._reindex_output(result, fill_value=0)

def _apply_to_column_groupbys(self, func):
""" return a pass thru """
Expand Down
15 changes: 11 additions & 4 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1296,7 +1296,7 @@ def size(self):

if isinstance(self.obj, Series):
result.name = self.obj.name
return result
return self._reindex_output(result, fill_value=0)

@classmethod
def _add_numeric_operations(cls):
Expand Down Expand Up @@ -1743,6 +1743,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra
if not self.observed and isinstance(result_index, CategoricalIndex):
out = out.reindex(result_index)

out = self._reindex_output(out)
return out.sort_index() if self.sort else out

# dropna is truthy
Expand Down Expand Up @@ -2383,7 +2384,7 @@ def tail(self, n=5):
mask = self._cumcount_array(ascending=False) < n
return self._selected_obj[mask]

def _reindex_output(self, output):
def _reindex_output(self, output, fill_value=np.NaN):
"""
If we have categorical groupers, then we might want to make sure that
we have a fully re-indexed output to the levels. This means expanding
Expand Down Expand Up @@ -2429,7 +2430,11 @@ def _reindex_output(self, output):
).sortlevel()

if self.as_index:
d = {self.obj._get_axis_name(self.axis): index, "copy": False}
d = {
self.obj._get_axis_name(self.axis): index,
"copy": False,
"fill_value": fill_value,
}
return output.reindex(**d)

# GH 13204
Expand All @@ -2451,7 +2456,9 @@ def _reindex_output(self, output):
output = output.drop(labels=list(g_names), axis=1)

# Set a temp index and reindex (possibly expanding)
output = output.set_index(self.grouper.result_index).reindex(index, copy=False)
output = output.set_index(self.grouper.result_index).reindex(
index, copy=False, fill_value=fill_value
)

# Reset in-axis grouper columns
# (using level numbers `g_nums` because level names may not be unique)
Expand Down
80 changes: 80 additions & 0 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1252,3 +1252,83 @@ def test_get_nonexistent_category():
{"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}
)
)


@pytest.mark.parametrize("observed", [True, False])
def test_series_groupby_on_2_categoricals_unobserved(
    reduction_func: str, observed: bool
):
    # GH 17605
    # A SeriesGroupBy reduction over two categorical groupers must return one
    # row per observed combination when observed=True, and one row per
    # possible combination (4 categories x 4 categories) when observed=False.

    if reduction_func == "ngroup":
        pytest.skip("ngroup is not truly a reduction")

    frame = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")),
            "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")),
            "value": [0.1] * 4,
        }
    )
    # nth requires a positional argument; every other reduction takes none.
    call_args = [0] if reduction_func == "nth" else []

    grouped_series = frame.groupby(["cat_1", "cat_2"], observed=observed)["value"]
    result = getattr(grouped_series, reduction_func)(*call_args)

    assert len(result) == (4 if observed else 16)


@pytest.mark.parametrize(
    "func, zero_or_nan",
    [
        ("all", np.NaN),
        ("any", np.NaN),
        ("count", 0),
        ("first", np.NaN),
        ("idxmax", np.NaN),
        ("idxmin", np.NaN),
        ("last", np.NaN),
        ("mad", np.NaN),
        ("max", np.NaN),
        ("mean", np.NaN),
        ("median", np.NaN),
        ("min", np.NaN),
        ("nth", np.NaN),
        ("nunique", 0),
        ("prod", np.NaN),
        ("quantile", np.NaN),
        ("sem", np.NaN),
        ("size", 0),
        ("skew", np.NaN),
        ("std", np.NaN),
        ("sum", np.NaN),
        ("var", np.NaN),
    ],
)
def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan):
    # GH 17605
    # Unobserved category combinations must appear in the reduction output,
    # filled with 0 (counting-style reductions) or NaN (all others).
    frame = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
            "value": [0.1] * 4,
        }
    )
    missing_combos = [("A", "C"), ("B", "C"), ("C", "A"), ("C", "B"), ("C", "C")]
    # nth requires a positional argument; every other reduction takes none.
    call_args = [0] if func == "nth" else []

    grouped = frame.groupby(["cat_1", "cat_2"], observed=False)["value"]
    result = getattr(grouped, func)(*call_args)

    for combo in missing_combos:
        filled = result.loc[combo]
        if pd.isna(zero_or_nan):
            assert pd.isna(filled)
        else:
            assert filled == zero_or_nan

    # A zero fill value implies a counting-style reduction, whose output
    # must keep an integer dtype rather than being upcast to float.
    if zero_or_nan == 0:
        assert np.issubdtype(result.dtype, np.integer)