Skip to content

BUG: Series groupby does not include nan counts for all categorical labels (#17605) #29690

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Nov 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,47 @@ New repr for :class:`pandas.core.arrays.IntervalArray`

pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)])


All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`).

- :meth:`SeriesGroupBy.count`
- :meth:`SeriesGroupBy.size`
- :meth:`SeriesGroupBy.nunique`
- :meth:`SeriesGroupBy.nth`

.. ipython:: python

df = pd.DataFrame({
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
"cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
"value": [0.1] * 4,
})
df


*pandas 0.25.x*

.. code-block:: ipython

In [2]: df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()
Out[2]:
cat_1 cat_2
A A 1
B 1
B A 1
B 1
Name: value, dtype: int64


*pandas 1.0.0*

.. ipython:: python

df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()


.. _whatsnew_1000.api.other:

Other API changes
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,8 @@ def nunique(self, dropna: bool = True) -> Series:
res, out = np.zeros(len(ri), dtype=out.dtype), res
res[ids[idx]] = out

return Series(res, index=ri, name=self._selection_name)
result = Series(res, index=ri, name=self._selection_name)
return self._reindex_output(result, fill_value=0)

@Appender(Series.describe.__doc__)
def describe(self, **kwargs):
Expand Down Expand Up @@ -721,12 +722,13 @@ def count(self) -> Series:
minlength = ngroups or 0
out = np.bincount(ids[mask], minlength=minlength)

return Series(
result = Series(
out,
index=self.grouper.result_index,
name=self._selection_name,
dtype="int64",
)
return self._reindex_output(result, fill_value=0)

def _apply_to_column_groupbys(self, func):
""" return a pass thru """
Expand Down
22 changes: 17 additions & 5 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class providing the base-class of operations.
)
from pandas.core.dtypes.missing import isna, notna

from pandas._typing import FrameOrSeries, Scalar
from pandas.core import nanops
import pandas.core.algorithms as algorithms
from pandas.core.arrays import Categorical, try_cast_to_ea
Expand Down Expand Up @@ -1296,7 +1297,7 @@ def size(self):

if isinstance(self.obj, Series):
result.name = self.obj.name
return result
return self._reindex_output(result, fill_value=0)

@classmethod
def _add_numeric_operations(cls):
Expand Down Expand Up @@ -1743,6 +1744,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra
if not self.observed and isinstance(result_index, CategoricalIndex):
out = out.reindex(result_index)

out = self._reindex_output(out)
return out.sort_index() if self.sort else out

# dropna is truthy
Expand Down Expand Up @@ -2383,7 +2385,9 @@ def tail(self, n=5):
mask = self._cumcount_array(ascending=False) < n
return self._selected_obj[mask]

def _reindex_output(self, output):
def _reindex_output(
self, output: FrameOrSeries, fill_value: Scalar = np.NaN
) -> FrameOrSeries:
"""
If we have categorical groupers, then we might want to make sure that
we have a fully re-indexed output to the levels. This means expanding
Expand All @@ -2397,8 +2401,10 @@ def _reindex_output(self, output):

Parameters
----------
output: Series or DataFrame
output : Series or DataFrame
Object resulting from grouping and applying an operation.
fill_value : scalar, default np.NaN
Value to use for unobserved categories if self.observed is False.

Returns
-------
Expand Down Expand Up @@ -2429,7 +2435,11 @@ def _reindex_output(self, output):
).sortlevel()

if self.as_index:
d = {self.obj._get_axis_name(self.axis): index, "copy": False}
d = {
self.obj._get_axis_name(self.axis): index,
"copy": False,
"fill_value": fill_value,
}
return output.reindex(**d)

# GH 13204
Expand All @@ -2451,7 +2461,9 @@ def _reindex_output(self, output):
output = output.drop(labels=list(g_names), axis=1)

# Set a temp index and reindex (possibly expanding)
output = output.set_index(self.grouper.result_index).reindex(index, copy=False)
output = output.set_index(self.grouper.result_index).reindex(
index, copy=False, fill_value=fill_value
)

# Reset in-axis grouper columns
# (using level numbers `g_nums` because level names may not be unique)
Expand Down
79 changes: 79 additions & 0 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1252,3 +1252,82 @@ def test_get_nonexistent_category():
{"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}
)
)


def test_series_groupby_on_2_categoricals_unobserved(
reduction_func: str, observed: bool
):
# GH 17605

if reduction_func == "ngroup":
pytest.skip("ngroup is not truly a reduction")

df = pd.DataFrame(
{
"cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")),
"cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")),
"value": [0.1] * 4,
}
)
args = {"nth": [0]}.get(reduction_func, [])

expected_length = 4 if observed else 16

series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"]
agg = getattr(series_groupby, reduction_func)
result = agg(*args)

assert len(result) == expected_length


@pytest.mark.parametrize(
    "func, zero_or_nan",
    [
        ("all", np.NaN),
        ("any", np.NaN),
        ("count", 0),
        ("first", np.NaN),
        ("idxmax", np.NaN),
        ("idxmin", np.NaN),
        ("last", np.NaN),
        ("mad", np.NaN),
        ("max", np.NaN),
        ("mean", np.NaN),
        ("median", np.NaN),
        ("min", np.NaN),
        ("nth", np.NaN),
        ("nunique", 0),
        ("prod", np.NaN),
        ("quantile", np.NaN),
        ("sem", np.NaN),
        ("size", 0),
        ("skew", np.NaN),
        ("std", np.NaN),
        ("sum", np.NaN),
        ("var", np.NaN),
    ],
)
def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan):
    # GH 17605
    # Unobserved category combinations must be filled with 0 for the counting
    # reductions and with NaN for everything else.
    cats = list("ABC")
    df = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=cats),
            "cat_2": pd.Categorical(list("AB") * 2, categories=cats),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    # Every index pair that involves the never-seen category "C"
    unobserved = [("A", "C"), ("B", "C"), ("C", "A"), ("C", "B"), ("C", "C")]
    args = [0] if func == "nth" else []

    grouped = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
    result = getattr(grouped, func)(*args)

    for idx in unobserved:
        val = result.loc[idx]
        if pd.isna(zero_or_nan):
            assert pd.isna(val)
        else:
            assert val == zero_or_nan

    # Zero-filled results must also keep an integer dtype
    if zero_or_nan == 0:
        assert np.issubdtype(result.dtype, np.integer)