Backport PR #48702 on branch 1.5.x (REGR: dropna affects observed in groupby) (#48750)

meeseeksmachine · rhshadrach · web-flow · commit 486fe153361c · 2022-09-23T16:55:59.000-07:00
Backport PR #48702: REGR: dropna affects observed in groupby Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst
@@ -10,6 +10,61 @@ including other versions of pandas.
 
 .. ---------------------------------------------------------------------------
 
+.. _whatsnew_151.groupby_categorical_regr:
+
+Behavior of ``groupby`` with categorical groupers (:issue:`48645`)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In versions of pandas prior to 1.5, ``groupby`` with ``dropna=False`` would still drop
+NA values when the grouper had a categorical dtype. A fix for this was attempted in
+1.5; however, it introduced a regression where passing ``observed=False`` and
+``dropna=False`` to ``groupby`` would result in only observed categories. It was found
+that the patch fixing the ``dropna=False`` bug is incompatible with ``observed=False``,
+and it was decided that the best resolution is to restore the correct ``observed=False``
+behavior at the cost of reintroducing the ``dropna=False`` bug.
+
+.. ipython:: python
+
+   df = pd.DataFrame(
+       {
+           "x": pd.Categorical([1, None], categories=[1, 2, 3]),
+           "y": [3, 4],
+       }
+   )
+   df
+
+*1.5.0 behavior*:
+
+.. code-block:: ipython
+
+   In [3]: # Correct behavior, NA values are not dropped
+           df.groupby("x", observed=True, dropna=False).sum()
+   Out[3]:
+        y
+   x
+   1    3
+   NaN  4
+
+
+   In [4]: # Incorrect behavior, only observed categories present
+           df.groupby("x", observed=False, dropna=False).sum()
+   Out[4]:
+        y
+   x
+   1    3
+   NaN  4
+
+
+*1.5.1 behavior*:
+
+.. ipython:: python
+
+   # Incorrect behavior, NA values are dropped
+   df.groupby("x", observed=True, dropna=False).sum()
+
+   # Correct behavior, unobserved categories present (NA values still dropped)
+   df.groupby("x", observed=False, dropna=False).sum()
+
 .. _whatsnew_151.regressions:
 
 Fixed regressions
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -658,7 +658,7 @@ def group_index(self) -> Index:
 
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
-        if self._dropna and self._passed_categorical:
+        if self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper
             # preserving the categories / ordered attributes;
             # doesn't (yet - GH#46909) handle dropna=False
diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py
@@ -24,6 +24,11 @@ def dropna(request):
     return request.param
 
 
+@pytest.fixture(params=[True, False])
+def observed(request):  # each requesting test runs once per value in params
+    return request.param
+
+
 @pytest.fixture
 def mframe(multiindex_dataframe_random_data):
     return multiindex_dataframe_random_data
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -1828,3 +1828,20 @@ def test_groupby_categorical_aggregate_functions():
     )
 
     tm.assert_series_equal(result, expected)
+
+
+def test_groupby_categorical_dropna(observed, dropna):
+    # GH#48645 - dropna should have no impact on the result when there are no NA values
+    cat = Categorical([1, 2], categories=[1, 2, 3])
+    df = DataFrame({"x": cat, "y": [3, 4]})
+    gb = df.groupby("x", observed=observed, dropna=dropna)
+    result = gb.sum()
+
+    if observed:  # only the categories present in the data (1 and 2) are expected
+        expected = DataFrame({"y": [3, 4]}, index=cat)
+    else:  # the unobserved category 3 is expected too, with a sum of 0
+        index = CategoricalIndex([1, 2, 3], categories=[1, 2, 3])
+        expected = DataFrame({"y": [3, 4, 0]}, index=index)
+    expected.index.name = "x"
+
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
@@ -408,7 +408,13 @@ def test_groupby_drop_nan_with_multi_index():
         ([2, np.nan, 1, 2], "Float32"),
         ([2, np.nan, 1, 2], "Int64"),
         ([2, np.nan, 1, 2], "Float64"),
-        (["y", None, "x", "y"], "category"),
+        pytest.param(
+            ["y", None, "x", "y"],
+            "category",
+            marks=pytest.mark.xfail(
+                reason="dropna=False not correct for categorical, GH#48645"
+            ),
+        ),
         (["y", pd.NA, "x", "y"], "string"),
         pytest.param(
             ["y", pd.NA, "x", "y"],