Skip to content

Commit 2a6dfc3

Browse files
jorisvandenbossche authored and simonjayhawkins committed
Backport PR pandas-dev#39655: REGR: fix case all-NaN/numeric object column in groupby
1 parent fbe9511 commit 2a6dfc3

File tree

4 files changed

+67
-4
lines changed

4 files changed

+67
-4
lines changed

doc/source/whatsnew/v1.2.2.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ Fixed regressions
2323
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
2424
- Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
2525
- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
26-
- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
26+
- Fixed regression in :meth:`~DataFrame.groupby` or :meth:`~DataFrame.resample` when aggregating an all-NaN or numeric object dtype column (:issue:`39329`)
27+
- Fixed regression in :meth:`.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
2728
- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
2829
-
2930

pandas/core/groupby/generic.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -1078,11 +1078,18 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike:
10781078
# in the operation. We un-split here.
10791079
result = result._consolidate()
10801080
assert isinstance(result, (Series, DataFrame)) # for mypy
1081-
assert len(result._mgr.blocks) == 1
1081+
mgr = result._mgr
1082+
assert isinstance(mgr, BlockManager)
10821083

10831084
# unwrap DataFrame to get array
1084-
result = result._mgr.blocks[0].values
1085-
return result
1085+
if len(mgr.blocks) != 1:
1086+
# We've split an object block! Everything we've assumed
1087+
# about a single block input returning a single block output
1088+
# is a lie. See eg GH-39329
1089+
return mgr.as_array()
1090+
else:
1091+
result = mgr.blocks[0].values
1092+
return result
10861093

10871094
def blk_func(bvalues: ArrayLike) -> ArrayLike:
10881095

pandas/tests/groupby/aggregate/test_aggregate.py

+24
Original file line numberDiff line numberDiff line change
@@ -1175,3 +1175,27 @@ def test_aggregate_datetime_objects():
11751175
result = df.groupby("A").B.max()
11761176
expected = df.set_index("A")["B"]
11771177
tm.assert_series_equal(result, expected)
1178+
1179+
1180+
def test_aggregate_numeric_object_dtype():
1181+
# https://github.com/pandas-dev/pandas/issues/39329
1182+
# simplified case: multiple object columns where one is all-NaN
1183+
# -> gets split as the all-NaN is inferred as float
1184+
df = DataFrame(
1185+
{"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
1186+
).astype(object)
1187+
result = df.groupby("key").min()
1188+
expected = DataFrame(
1189+
{"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}
1190+
).set_index("key")
1191+
tm.assert_frame_equal(result, expected)
1192+
1193+
# same but with numbers
1194+
df = DataFrame(
1195+
{"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)},
1196+
).astype(object)
1197+
result = df.groupby("key").min()
1198+
expected = DataFrame(
1199+
{"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}
1200+
).set_index("key")
1201+
tm.assert_frame_equal(result, expected)

pandas/tests/resample/test_resampler_grouper.py

+31
Original file line numberDiff line numberDiff line change
@@ -392,3 +392,34 @@ def test_resample_groupby_agg():
392392
result = resampled.agg({"num": "sum"})
393393

394394
tm.assert_frame_equal(result, expected)
395+
396+
397+
@pytest.mark.parametrize("consolidate", [True, False])
398+
def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
399+
# https://github.com/pandas-dev/pandas/issues/39329
400+
401+
dates = pd.date_range("2020-01-01", periods=15, freq="D")
402+
df1 = DataFrame({"key": "A", "date": dates, "col1": range(15), "col_object": "val"})
403+
df2 = DataFrame({"key": "B", "date": dates, "col1": range(15)})
404+
df = pd.concat([df1, df2], ignore_index=True)
405+
if consolidate:
406+
df = df._consolidate()
407+
408+
result = df.groupby(["key"]).resample("W", on="date").min()
409+
idx = pd.MultiIndex.from_arrays(
410+
[
411+
["A"] * 3 + ["B"] * 3,
412+
pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2),
413+
],
414+
names=["key", "date"],
415+
)
416+
expected = DataFrame(
417+
{
418+
"key": ["A"] * 3 + ["B"] * 3,
419+
"date": pd.to_datetime(["2020-01-01", "2020-01-06", "2020-01-13"] * 2),
420+
"col1": [0, 5, 12] * 2,
421+
"col_object": ["val"] * 3 + [np.nan] * 3,
422+
},
423+
index=idx,
424+
)
425+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments (0)