Skip to content

Commit 486fe15

Browse files
Backport PR #48702 on branch 1.5.x (REGR: dropna affects observed in groupby) (#48750)
Backport PR #48702: REGR: dropna affects observed in groupby Co-authored-by: Richard Shadrach <[email protected]>
1 parent b28b5c4 commit 486fe15

File tree

5 files changed

+85
-2
lines changed

5 files changed

+85
-2
lines changed

doc/source/whatsnew/v1.5.1.rst

+55
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,61 @@ including other versions of pandas.
1010

1111
.. ---------------------------------------------------------------------------
1212
13+
.. _whatsnew_151.groupby_categorical_regr:
14+
15+
Behavior of ``groupby`` with categorical groupers (:issue:`48645`)
16+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
17+
18+
In versions of pandas prior to 1.5, ``groupby`` with ``dropna=False`` would still drop
19+
NA values when the grouper was a categorical dtype. A fix for this was attempted in
20+
1.5; however, it introduced a regression where passing ``observed=False`` and
21+
``dropna=False`` to ``groupby`` would result in only observed categories. It was found
22+
that the patch fixing the ``dropna=False`` bug is incompatible with ``observed=False``,
23+
and it was decided that the best resolution is to restore the correct ``observed=False``
24+
behavior at the cost of reintroducing the ``dropna=False`` bug.
25+
26+
.. ipython:: python
27+
28+
df = pd.DataFrame(
29+
{
30+
"x": pd.Categorical([1, None], categories=[1, 2, 3]),
31+
"y": [3, 4],
32+
}
33+
)
34+
df
35+
36+
*1.5.0 behavior*:
37+
38+
.. code-block:: ipython
39+
40+
In [3]: # Correct behavior, NA values are not dropped
41+
df.groupby("x", observed=True, dropna=False).sum()
42+
Out[3]:
43+
y
44+
x
45+
1 3
46+
NaN 4
47+
48+
49+
In [4]: # Incorrect behavior, only observed categories present
50+
df.groupby("x", observed=False, dropna=False).sum()
51+
Out[4]:
52+
y
53+
x
54+
1 3
55+
NaN 4
56+
57+
58+
*1.5.1 behavior*:
59+
60+
.. ipython:: python
61+
62+
# Incorrect behavior, NA values are dropped
63+
df.groupby("x", observed=True, dropna=False).sum()
64+
65+
# Correct behavior, unobserved categories present (NA values still dropped)
66+
df.groupby("x", observed=False, dropna=False).sum()
67+
1368
.. _whatsnew_151.regressions:
1469

1570
Fixed regressions

pandas/core/groupby/grouper.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,7 @@ def group_index(self) -> Index:
658658

659659
@cache_readonly
660660
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
661-
if self._dropna and self._passed_categorical:
661+
if self._passed_categorical:
662662
# we make a CategoricalIndex out of the cat grouper
663663
# preserving the categories / ordered attributes;
664664
# doesn't (yet - GH#46909) handle dropna=False

pandas/tests/groupby/conftest.py

+5
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ def dropna(request):
2424
return request.param
2525

2626

27+
@pytest.fixture(params=[True, False])
28+
def observed(request):
29+
return request.param
30+
31+
2732
@pytest.fixture
2833
def mframe(multiindex_dataframe_random_data):
2934
return multiindex_dataframe_random_data

pandas/tests/groupby/test_categorical.py

+17
Original file line numberDiff line numberDiff line change
@@ -1828,3 +1828,20 @@ def test_groupby_categorical_aggregate_functions():
18281828
)
18291829

18301830
tm.assert_series_equal(result, expected)
1831+
1832+
1833+
def test_groupby_categorical_dropna(observed, dropna):
1834+
# GH#48645 - dropna should have no impact on the result when there are no NA values
1835+
cat = Categorical([1, 2], categories=[1, 2, 3])
1836+
df = DataFrame({"x": Categorical([1, 2], categories=[1, 2, 3]), "y": [3, 4]})
1837+
gb = df.groupby("x", observed=observed, dropna=dropna)
1838+
result = gb.sum()
1839+
1840+
if observed:
1841+
expected = DataFrame({"y": [3, 4]}, index=cat)
1842+
else:
1843+
index = CategoricalIndex([1, 2, 3], [1, 2, 3])
1844+
expected = DataFrame({"y": [3, 4, 0]}, index=index)
1845+
expected.index.name = "x"
1846+
1847+
tm.assert_frame_equal(result, expected)

pandas/tests/groupby/test_groupby_dropna.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,13 @@ def test_groupby_drop_nan_with_multi_index():
408408
([2, np.nan, 1, 2], "Float32"),
409409
([2, np.nan, 1, 2], "Int64"),
410410
([2, np.nan, 1, 2], "Float64"),
411-
(["y", None, "x", "y"], "category"),
411+
pytest.param(
412+
["y", None, "x", "y"],
413+
"category",
414+
marks=pytest.mark.xfail(
415+
reason="dropna=False not correct for categorical, GH#48645"
416+
),
417+
),
412418
(["y", pd.NA, "x", "y"], "string"),
413419
pytest.param(
414420
["y", pd.NA, "x", "y"],

0 commit comments

Comments
 (0)