Skip to content

Commit 2a6dfc3

Browse files
jorisvandenbossche authored and simonjayhawkins committed
Backport PR pandas-dev#39655: REGR: fix case all-NaN/numeric object column in groupby
1 parent fbe9511 commit 2a6dfc3

File tree

4 files changed

+67
-4
lines changed

4 files changed

+67
-4
lines changed

doc/source/whatsnew/v1.2.2.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ Fixed regressions
2323
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
2424
- Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
2525
- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
26-
- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
26+
- Fixed regression in :meth:`~DataFrame.groupby` or :meth:`~DataFrame.resample` when aggregating an all-NaN or numeric object dtype column (:issue:`39329`)
27+
- Fixed regression in :meth:`.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
2728
- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
2829
-
2930

pandas/core/groupby/generic.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -1078,11 +1078,18 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike:
10781078
# in the operation. We un-split here.
10791079
result = result._consolidate()
10801080
assert isinstance(result, (Series, DataFrame)) # for mypy
1081-
assert len(result._mgr.blocks) == 1
1081+
mgr = result._mgr
1082+
assert isinstance(mgr, BlockManager)
10821083

10831084
# unwrap DataFrame to get array
1084-
result = result._mgr.blocks[0].values
1085-
return result
1085+
if len(mgr.blocks) != 1:
1086+
# We've split an object block! Everything we've assumed
1087+
# about a single block input returning a single block output
1088+
# is a lie. See eg GH-39329
1089+
return mgr.as_array()
1090+
else:
1091+
result = mgr.blocks[0].values
1092+
return result
10861093

10871094
def blk_func(bvalues: ArrayLike) -> ArrayLike:
10881095

pandas/tests/groupby/aggregate/test_aggregate.py

+24
Original file line numberDiff line numberDiff line change
@@ -1175,3 +1175,27 @@ def test_aggregate_datetime_objects():
11751175
result = df.groupby("A").B.max()
11761176
expected = df.set_index("A")["B"]
11771177
tm.assert_series_equal(result, expected)
1178+
1179+
1180+
def test_aggregate_numeric_object_dtype():
1181+
# https://github.com/pandas-dev/pandas/issues/39329
1182+
# simplified case: multiple object columns where one is all-NaN
1183+
# -> gets split as the all-NaN is inferred as float
1184+
df = DataFrame(
1185+
{"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
1186+
).astype(object)
1187+
result = df.groupby("key").min()
1188+
expected = DataFrame(
1189+
{"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}
1190+
).set_index("key")
1191+
tm.assert_frame_equal(result, expected)
1192+
1193+
# same but with numbers
1194+
df = DataFrame(
1195+
{"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)},
1196+
).astype(object)
1197+
result = df.groupby("key").min()
1198+
expected = DataFrame(
1199+
{"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}
1200+
).set_index("key")
1201+
tm.assert_frame_equal(result, expected)

pandas/tests/resample/test_resampler_grouper.py

+31
Original file line numberDiff line numberDiff line change
@@ -392,3 +392,34 @@ def test_resample_groupby_agg():
392392
result = resampled.agg({"num": "sum"})
393393

394394
tm.assert_frame_equal(result, expected)
395+
396+
397+
@pytest.mark.parametrize("consolidate", [True, False])
398+
def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
399+
# https://github.com/pandas-dev/pandas/issues/39329
400+
401+
dates = pd.date_range("2020-01-01", periods=15, freq="D")
402+
df1 = DataFrame({"key": "A", "date": dates, "col1": range(15), "col_object": "val"})
403+
df2 = DataFrame({"key": "B", "date": dates, "col1": range(15)})
404+
df = pd.concat([df1, df2], ignore_index=True)
405+
if consolidate:
406+
df = df._consolidate()
407+
408+
result = df.groupby(["key"]).resample("W", on="date").min()
409+
idx = pd.MultiIndex.from_arrays(
410+
[
411+
["A"] * 3 + ["B"] * 3,
412+
pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2),
413+
],
414+
names=["key", "date"],
415+
)
416+
expected = DataFrame(
417+
{
418+
"key": ["A"] * 3 + ["B"] * 3,
419+
"date": pd.to_datetime(["2020-01-01", "2020-01-06", "2020-01-13"] * 2),
420+
"col1": [0, 5, 12] * 2,
421+
"col_object": ["val"] * 3 + [np.nan] * 3,
422+
},
423+
index=idx,
424+
)
425+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments (0)