BUG: GroupBy.apply with Grouper and NaT (#43500)

jbrockmendel · web-flow · commit 8d664c578190 · 2021-09-10T17:37:42.000-04:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -419,6 +419,7 @@ Groupby/resample/rolling
 - Bug in :meth:`Series.rolling` when the :class:`Series` ``dtype`` was ``Int64`` (:issue:`43016`)
 - Bug in :meth:`DataFrame.rolling.corr` when the :class:`DataFrame` columns was a :class:`MultiIndex` (:issue:`21157`)
 - Bug in :meth:`DataFrame.groupby.rolling` when specifying ``on`` and calling ``__getitem__`` would subsequently return incorrect results (:issue:`43355`)
+- Bug in :meth:`GroupBy.apply` with time-based :class:`Grouper` objects incorrectly raising ``ValueError`` in corner cases where the grouping vector contains a ``NaT`` (:issue:`43500`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -401,7 +401,7 @@ def _wrap_applied_output(
 
         if isinstance(values[0], dict):
             # GH #823 #24880
-            index = self._group_keys_index
+            index = self.grouper.result_index
             res_df = self.obj._constructor_expanddim(values, index=index)
             res_df = self._reindex_output(res_df)
             # if self.observed is False,
@@ -414,7 +414,7 @@ def _wrap_applied_output(
         else:
             # GH #6265 #24880
             result = self.obj._constructor(
-                data=values, index=self._group_keys_index, name=self.obj.name
+                data=values, index=self.grouper.result_index, name=self.obj.name
             )
             return self._reindex_output(result)
 
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -1183,18 +1183,14 @@ def df_cat(df):
     return df_cat
 
 
-@pytest.mark.parametrize(
-    "operation, kwargs", [("agg", {"dtype": "category"}), ("apply", {})]
-)
-def test_seriesgroupby_observed_true(df_cat, operation, kwargs):
+@pytest.mark.parametrize("operation", ["agg", "apply"])
+def test_seriesgroupby_observed_true(df_cat, operation):
     # GH 24880
-    index = MultiIndex.from_frame(
-        DataFrame(
-            {"A": ["foo", "foo", "bar", "bar"], "B": ["one", "two", "one", "three"]},
-            **kwargs,
-        )
-    )
+    lev_a = Index(["foo", "foo", "bar", "bar"], dtype=df_cat["A"].dtype, name="A")
+    lev_b = Index(["one", "two", "one", "three"], dtype=df_cat["B"].dtype, name="B")
+    index = MultiIndex.from_arrays([lev_a, lev_b])
     expected = Series(data=[1, 3, 2, 4], index=index, name="C")
+
     grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
     result = getattr(grouped, operation)(sum)
     tm.assert_series_equal(result, expected)
@@ -1225,18 +1221,16 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
     [
         (
             True,
-            MultiIndex.from_tuples(
+            MultiIndex.from_arrays(
                 [
-                    ("foo", "one", "min"),
-                    ("foo", "one", "max"),
-                    ("foo", "two", "min"),
-                    ("foo", "two", "max"),
-                    ("bar", "one", "min"),
-                    ("bar", "one", "max"),
-                    ("bar", "three", "min"),
-                    ("bar", "three", "max"),
-                ],
-                names=["A", "B", None],
+                    Index(["foo"] * 4 + ["bar"] * 4, dtype="category", name="A"),
+                    Index(
+                        ["one", "one", "two", "two", "one", "one", "three", "three"],
+                        dtype="category",
+                        name="B",
+                    ),
+                    Index(["min", "max"] * 4),
+                ]
             ),
             [1, 1, 3, 3, 2, 2, 4, 4],
         ),
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
@@ -23,6 +23,41 @@
 from pandas.core.groupby.ops import BinGrouper
 
 
+@pytest.fixture
+def groupby_with_truncated_bingrouper():
+    """
+    GroupBy object such that gb.grouper is a BinGrouper and
+    len(gb.grouper.result_index) < len(gb.grouper.group_keys_seq)
+
+    Aggregations on this groupby should have
+
+        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
+
+    as either the index or an index level.
+    """
+    df = DataFrame(
+        {
+            "Quantity": [18, 3, 5, 1, 9, 3],
+            "Date": [
+                Timestamp(2013, 9, 1, 13, 0),
+                Timestamp(2013, 9, 1, 13, 5),
+                Timestamp(2013, 10, 1, 20, 0),
+                Timestamp(2013, 10, 3, 10, 0),
+                pd.NaT,
+                Timestamp(2013, 9, 2, 14, 0),
+            ],
+        }
+    )
+
+    tdg = Grouper(key="Date", freq="5D")
+    gb = df.groupby(tdg)
+
+    # check we're testing the case we're interested in
+    assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
+
+    return gb
+
+
 class TestGroupBy:
     def test_groupby_with_timegrouper(self):
         # GH 4161
@@ -779,3 +814,36 @@ def test_grouper_period_index(self):
             range(0, periods), index=Index(range(1, periods + 1), name=index.name)
         )
         tm.assert_series_equal(result, expected)
+
+    def test_groupby_apply_timegrouper_with_nat_dict_returns(
+        self, groupby_with_truncated_bingrouper
+    ):
+        # GH#43500 case where gb.grouper.result_index and gb.grouper.group_keys_seq
+        #  have different lengths that goes through the `isinstance(values[0], dict)`
+        #  path
+        gb = groupby_with_truncated_bingrouper
+
+        res = gb["Quantity"].apply(lambda x: {"foo": len(x)})
+
+        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
+        mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)])
+        expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity")
+        tm.assert_series_equal(res, expected)
+
+    def test_groupby_apply_timegrouper_with_nat_scalar_returns(
+        self, groupby_with_truncated_bingrouper
+    ):
+        # GH#43500 Previously raised ValueError because an index with incorrect
+        #  length was used in _wrap_applied_output
+        gb = groupby_with_truncated_bingrouper
+
+        res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan)
+
+        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
+        expected = Series(
+            [18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
+            index=dti._with_freq(None),
+            name="Quantity",
+        )
+
+        tm.assert_series_equal(res, expected)