Skip to content

Commit 8d664c5

Browse files
authored
BUG: GroupBy.apply with Grouper and NaT (#43500)
1 parent 1ec2d1d commit 8d664c5

File tree

4 files changed

+86
-23
lines changed

4 files changed

+86
-23
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,7 @@ Groupby/resample/rolling
419419
- Bug in :meth:`Series.rolling` when the :class:`Series` ``dtype`` was ``Int64`` (:issue:`43016`)
420420
- Bug in :meth:`DataFrame.rolling.corr` when the :class:`DataFrame` columns were a :class:`MultiIndex` (:issue:`21157`)
421421
- Bug in :meth:`DataFrame.groupby.rolling` when specifying ``on`` and calling ``__getitem__`` would subsequently return incorrect results (:issue:`43355`)
422+
- Bug in :meth:`GroupBy.apply` with time-based :class:`Grouper` objects incorrectly raising ``ValueError`` in corner cases where the grouping vector contains a ``NaT`` (:issue:`43500`)
422423

423424
Reshaping
424425
^^^^^^^^^

pandas/core/groupby/generic.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ def _wrap_applied_output(
401401

402402
if isinstance(values[0], dict):
403403
# GH #823 #24880
404-
index = self._group_keys_index
404+
index = self.grouper.result_index
405405
res_df = self.obj._constructor_expanddim(values, index=index)
406406
res_df = self._reindex_output(res_df)
407407
# if self.observed is False,
@@ -414,7 +414,7 @@ def _wrap_applied_output(
414414
else:
415415
# GH #6265 #24880
416416
result = self.obj._constructor(
417-
data=values, index=self._group_keys_index, name=self.obj.name
417+
data=values, index=self.grouper.result_index, name=self.obj.name
418418
)
419419
return self._reindex_output(result)
420420

pandas/tests/groupby/test_categorical.py

+15-21
Original file line numberDiff line numberDiff line change
@@ -1183,18 +1183,14 @@ def df_cat(df):
11831183
return df_cat
11841184

11851185

1186-
@pytest.mark.parametrize(
1187-
"operation, kwargs", [("agg", {"dtype": "category"}), ("apply", {})]
1188-
)
1189-
def test_seriesgroupby_observed_true(df_cat, operation, kwargs):
1186+
@pytest.mark.parametrize("operation", ["agg", "apply"])
1187+
def test_seriesgroupby_observed_true(df_cat, operation):
11901188
# GH 24880
1191-
index = MultiIndex.from_frame(
1192-
DataFrame(
1193-
{"A": ["foo", "foo", "bar", "bar"], "B": ["one", "two", "one", "three"]},
1194-
**kwargs,
1195-
)
1196-
)
1189+
lev_a = Index(["foo", "foo", "bar", "bar"], dtype=df_cat["A"].dtype, name="A")
1190+
lev_b = Index(["one", "two", "one", "three"], dtype=df_cat["B"].dtype, name="B")
1191+
index = MultiIndex.from_arrays([lev_a, lev_b])
11971192
expected = Series(data=[1, 3, 2, 4], index=index, name="C")
1193+
11981194
grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
11991195
result = getattr(grouped, operation)(sum)
12001196
tm.assert_series_equal(result, expected)
@@ -1225,18 +1221,16 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
12251221
[
12261222
(
12271223
True,
1228-
MultiIndex.from_tuples(
1224+
MultiIndex.from_arrays(
12291225
[
1230-
("foo", "one", "min"),
1231-
("foo", "one", "max"),
1232-
("foo", "two", "min"),
1233-
("foo", "two", "max"),
1234-
("bar", "one", "min"),
1235-
("bar", "one", "max"),
1236-
("bar", "three", "min"),
1237-
("bar", "three", "max"),
1238-
],
1239-
names=["A", "B", None],
1226+
Index(["foo"] * 4 + ["bar"] * 4, dtype="category", name="A"),
1227+
Index(
1228+
["one", "one", "two", "two", "one", "one", "three", "three"],
1229+
dtype="category",
1230+
name="B",
1231+
),
1232+
Index(["min", "max"] * 4),
1233+
]
12401234
),
12411235
[1, 1, 3, 3, 2, 2, 4, 4],
12421236
),

pandas/tests/groupby/test_timegrouper.py

+68
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,41 @@
2323
from pandas.core.groupby.ops import BinGrouper
2424

2525

26+
@pytest.fixture
27+
def groupby_with_truncated_bingrouper():
28+
"""
29+
GroupBy object such that gb.grouper is a BinGrouper and
30+
len(gb.grouper.result_index) < len(gb.grouper.group_keys_seq)
31+
32+
Aggregations on this groupby should have
33+
34+
dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
35+
36+
As either the index or an index level.
37+
"""
38+
df = DataFrame(
39+
{
40+
"Quantity": [18, 3, 5, 1, 9, 3],
41+
"Date": [
42+
Timestamp(2013, 9, 1, 13, 0),
43+
Timestamp(2013, 9, 1, 13, 5),
44+
Timestamp(2013, 10, 1, 20, 0),
45+
Timestamp(2013, 10, 3, 10, 0),
46+
pd.NaT,
47+
Timestamp(2013, 9, 2, 14, 0),
48+
],
49+
}
50+
)
51+
52+
tdg = Grouper(key="Date", freq="5D")
53+
gb = df.groupby(tdg)
54+
55+
# check we're testing the case we're interested in
56+
assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)
57+
58+
return gb
59+
60+
2661
class TestGroupBy:
2762
def test_groupby_with_timegrouper(self):
2863
# GH 4161
@@ -779,3 +814,36 @@ def test_grouper_period_index(self):
779814
range(0, periods), index=Index(range(1, periods + 1), name=index.name)
780815
)
781816
tm.assert_series_equal(result, expected)
817+
818+
def test_groupby_apply_timegrouper_with_nat_dict_returns(
819+
self, groupby_with_truncated_bingrouper
820+
):
821+
# GH#43500 case where gb.grouper.result_index and gb.grouper.group_keys_seq
822+
# have different lengths that goes through the `isinstance(values[0], dict)`
823+
# path
824+
gb = groupby_with_truncated_bingrouper
825+
826+
res = gb["Quantity"].apply(lambda x: {"foo": len(x)})
827+
828+
dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
829+
mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)])
830+
expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity")
831+
tm.assert_series_equal(res, expected)
832+
833+
def test_groupby_apply_timegrouper_with_nat_scalar_returns(
834+
self, groupby_with_truncated_bingrouper
835+
):
836+
# GH#43500 Previously raised ValueError because it used an index with incorrect
837+
# length in _wrap_applied_output
838+
gb = groupby_with_truncated_bingrouper
839+
840+
res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan)
841+
842+
dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
843+
expected = Series(
844+
[18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
845+
index=dti._with_freq(None),
846+
name="Quantity",
847+
)
848+
849+
tm.assert_series_equal(res, expected)

0 commit comments

Comments
 (0)