BUG: Groupy dropped nan groups from result when grouping over single column (#36842)

phofl · web-flow · commit f6f3dd3e7727 · 2020-11-03T21:59:02.000-05:00
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -527,6 +527,7 @@ Groupby/resample/rolling
 - Using :meth:`Rolling.var()` instead of :meth:`Rolling.std()` avoids numerical issues for :meth:`Rolling.corr()` when :meth:`Rolling.var()` is still within floating point precision while :meth:`Rolling.std()` is not (:issue:`31286`)
 - Bug in :meth:`df.groupby(..).quantile() <pandas.core.groupby.DataFrameGroupBy.quantile>` and :meth:`df.resample(..).quantile() <pandas.core.resample.Resampler.quantile>` raised ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`)
 - Bug in :meth:`Rolling.median` and :meth:`Rolling.quantile` returned wrong values for :class:`BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`)
+- Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -896,21 +896,28 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys,
 
         if lab != cur:
             if lab != -1:
-                tup = PyTuple_New(k)
-                for j in range(k):
-                    val = keys[j][sorted_labels[j][i - 1]]
-                    PyTuple_SET_ITEM(tup, j, val)
-                    Py_INCREF(val)
-
+                if k == 1:
+                    # When k = 1 we do not want to return a tuple as key
+                    tup = keys[0][sorted_labels[0][i - 1]]
+                else:
+                    tup = PyTuple_New(k)
+                    for j in range(k):
+                        val = keys[j][sorted_labels[j][i - 1]]
+                        PyTuple_SET_ITEM(tup, j, val)
+                        Py_INCREF(val)
                 result[tup] = index[start:i]
             start = i
         cur = lab
 
-    tup = PyTuple_New(k)
-    for j in range(k):
-        val = keys[j][sorted_labels[j][n - 1]]
-        PyTuple_SET_ITEM(tup, j, val)
-        Py_INCREF(val)
+    if k == 1:
+        # When k = 1 we do not want to return a tuple as key
+        tup = keys[0][sorted_labels[0][n - 1]]
+    else:
+        tup = PyTuple_New(k)
+        for j in range(k):
+            val = keys[j][sorted_labels[j][n - 1]]
+            PyTuple_SET_ITEM(tup, j, val)
+            Py_INCREF(val)
     result[tup] = index[start:]
 
     return result
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -229,12 +229,9 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
     @cache_readonly
     def indices(self):
         """ dict {group name -> group indices} """
-        if len(self.groupings) == 1:
-            return self.groupings[0].indices
-        else:
-            codes_list = [ping.codes for ping in self.groupings]
-            keys = [ping.group_index for ping in self.groupings]
-            return get_indexer_dict(codes_list, keys)
+        codes_list = [ping.codes for ping in self.groupings]
+        keys = [ping.group_index for ping in self.groupings]
+        return get_indexer_dict(codes_list, keys)
 
     @property
     def codes(self) -> List[np.ndarray]:
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
@@ -4,6 +4,7 @@
     TYPE_CHECKING,
     Callable,
     DefaultDict,
+    Dict,
     Iterable,
     List,
     Optional,
@@ -528,16 +529,22 @@ def get_flattened_list(
     return [tuple(array) for array in arrays.values()]
 
 
-def get_indexer_dict(label_list, keys):
+def get_indexer_dict(
+    label_list: List[np.ndarray], keys: List["Index"]
+) -> Dict[Union[str, Tuple], np.ndarray]:
     """
     Returns
     -------
-    dict
+    dict:
         Labels mapped to indexers.
     """
     shape = [len(x) for x in keys]
 
     group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    if np.all(group_index == -1):
+        # When all keys are nan and dropna=True, indices_fast can't handle this
+        # and the return is empty anyway
+        return {}
     ngroups = (
         ((group_index.size and group_index.max()) + 1)
         if is_int64_overflow_possible(shape)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1298,6 +1298,13 @@ def test_groupby_nat_exclude():
             grouped.get_group(pd.NaT)
 
 
+def test_groupby_two_group_keys_all_nan():
+    # GH #36842: Grouping over two group keys shouldn't raise an error
+    df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
+    result = df.groupby(["a", "b"]).indices
+    assert result == {}
+
+
 def test_groupby_2d_malformed():
     d = DataFrame(index=range(2))
     d["group"] = ["g1", "g2"]
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
@@ -2,7 +2,7 @@
 import pytest
 
 import pandas as pd
-import pandas.testing as tm
+import pandas._testing as tm
 
 
 @pytest.mark.parametrize(
@@ -335,3 +335,21 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data,
 
     expected = pd.DataFrame(selected_data, index=mi)
     tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_nan_included():
+    # GH 35646
+    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
+    df = pd.DataFrame(data)
+    grouped = df.groupby("group", dropna=False)
+    result = grouped.indices
+    dtype = "int64"
+    expected = {
+        "g1": np.array([0, 2], dtype=dtype),
+        "g2": np.array([3], dtype=dtype),
+        np.nan: np.array([1, 4], dtype=dtype),
+    }
+    for result_values, expected_values in zip(result.values(), expected.values()):
+        tm.assert_numpy_array_equal(result_values, expected_values)
+    assert np.isnan(list(result.keys())[2])
+    assert list(result.keys())[0:2] == ["g1", "g2"]
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -1087,3 +1087,18 @@ def test_rolling_corr_timedelta_index(index, window):
     result = x.rolling(window).corr(y)
     expected = Series([np.nan, np.nan, 1, 1, 1], index=index)
     tm.assert_almost_equal(result, expected)
+
+
+def test_groupby_rolling_nan_included():
+    # GH 35542
+    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
+    df = DataFrame(data)
+    result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean()
+    expected = DataFrame(
+        {"B": [0.0, 2.0, 3.0, 1.0, 4.0]},
+        index=pd.MultiIndex.from_tuples(
+            [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)],
+            names=["group", None],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)