From 6e1514af147fa78fe39e5d08bcf0eb6d32b1c636 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 7 Feb 2021 20:29:59 +0100
Subject: [PATCH 1/9] Revert "REF: simplify _cython_agg_blocks (#35841)"

This reverts commit 0e199f3d490045d592715b3e95a0203729f17a0f.
---
 pandas/core/groupby/generic.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 7b6eb4c8fe2f9..02723bc99dc6d 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1100,13 +1100,20 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike:
             # in the operation. We un-split here.
             result = result._consolidate()
             assert isinstance(result, (Series, DataFrame))  # for mypy
-            mgr = result._mgr
-            assert isinstance(mgr, BlockManager)
-            assert len(mgr.blocks) == 1
 
             # unwrap DataFrame to get array
-            result = mgr.blocks[0].values
-            return result
+            mgr = result._mgr
+            assert isinstance(mgr, BlockManager)
+            if len(mgr.blocks) != 1:
+                # We've split an object block! Everything we've assumed
+                # about a single block input returning a single block output
+                # is a lie. To keep the code-path for the typical non-split case
+                # clean, we choose to clean up this mess later on.
+                return mgr.as_array()
+            else:
+                assert len(mgr.blocks) == 1
+                result = mgr.blocks[0].values
+                return result
 
         def blk_func(bvalues: ArrayLike) -> ArrayLike:
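Context for the branch restored above: the regression in GH-39329 surfaces exactly at the removed `assert len(mgr.blocks) == 1`. The per-column fallback aggregation re-infers an all-NaN object column as float, so the consolidated result no longer fits in a single block and the assert fails. The sketch below is not part of the patch series; it reuses the simplified input from the tests added later in the series and assumes pandas 1.2.x internals (`DataFrame._mgr`, `BlockManager.blocks`, `BlockManager.as_array`).

    import numpy as np
    import pandas as pd

    # Simplified input from the tests added later in this series: two object
    # columns, one of them all-NaN, stored in a single object block.
    df = pd.DataFrame(
        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
        dtype=object,
    )

    # A frame shaped like the per-column aggregation result: the all-NaN column
    # is re-inferred as float64 and can no longer share a block with the
    # remaining object column -- the "split" the comment in the diff refers to.
    res = pd.DataFrame({"col1": np.array(["a", "c"], dtype=object), "col2": [np.nan, np.nan]})
    print(res.dtypes)            # col1 object, col2 float64
    print(len(res._mgr.blocks))  # 2 -> mgr.blocks[0].values alone would drop a column
    print(res._mgr.as_array())   # a single 2-D array, which the new branch returns instead

    # With the guarded branch in place, the aggregation itself runs through:
    print(df.groupby("key").min())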
From 62cc348ee3aa1ca2027adb202d706e591d483e75 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 7 Feb 2021 20:34:57 +0100
Subject: [PATCH 2/9] add test case from issue

---
 .../tests/groupby/aggregate/test_aggregate.py | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index de4ef0996ad49..e55a63c0a52a3 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1187,3 +1187,31 @@ def test_aggregate_datetime_objects():
     result = df.groupby("A").B.max()
     expected = df.set_index("A")["B"]
     tm.assert_series_equal(result, expected)
+
+
+def test_aggregate_object_dtype_all_nan():
+    # https://github.com/pandas-dev/pandas/issues/39329
+
+    dates = pd.date_range("2020-01-01", periods=15, freq="D")
+    df1 = DataFrame({"key": "A", "date": dates, "col1": range(15), "col_object": "val"})
+    df2 = DataFrame({"key": "B", "date": dates, "col1": range(15)})
+    df = pd.concat([df1, df2], ignore_index=True)
+
+    result = df.groupby(["key"]).resample("W", on="date").min()
+    idx = MultiIndex.from_arrays(
+        [
+            ["A"] * 3 + ["B"] * 3,
+            pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2),
+        ],
+        names=["key", "date"],
+    )
+    expected = DataFrame(
+        {
+            "key": ["A"] * 3 + ["B"] * 3,
+            "date": pd.to_datetime(["2020-01-01", "2020-01-06", "2020-01-13"] * 2),
+            "col1": [0, 5, 12] * 2,
+            "col_object": ["val"] * 3 + [np.nan] * 3,
+        },
+        index=idx,
+    )
+    tm.assert_frame_equal(result, expected)

From 9d0ea88fbd274173e86e466124337e83928d70bc Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 7 Feb 2021 20:52:09 +0100
Subject: [PATCH 3/9] add simpler groupby-only test case

---
 .../tests/groupby/aggregate/test_aggregate.py | 29 +++++--------------
 .../tests/resample/test_resampler_grouper.py  | 28 ++++++++++++++++++
 2 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index e55a63c0a52a3..349267b741956 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1191,27 +1191,14 @@ def test_aggregate_datetime_objects():
 
 def test_aggregate_object_dtype_all_nan():
     # https://github.com/pandas-dev/pandas/issues/39329
-
-    dates = pd.date_range("2020-01-01", periods=15, freq="D")
-    df1 = DataFrame({"key": "A", "date": dates, "col1": range(15), "col_object": "val"})
-    df2 = DataFrame({"key": "B", "date": dates, "col1": range(15)})
-    df = pd.concat([df1, df2], ignore_index=True)
-
-    result = df.groupby(["key"]).resample("W", on="date").min()
-    idx = MultiIndex.from_arrays(
-        [
-            ["A"] * 3 + ["B"] * 3,
-            pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2),
-        ],
-        names=["key", "date"],
+    # simplified case: multiple object columns where one is all-NaN
+    # -> gets split as the all-NaN is inferred as float
+    df = DataFrame(
+        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
+        dtype=object,
     )
+    result = df.groupby("key").min()
     expected = DataFrame(
-        {
-            "key": ["A"] * 3 + ["B"] * 3,
-            "date": pd.to_datetime(["2020-01-01", "2020-01-06", "2020-01-13"] * 2),
-            "col1": [0, 5, 12] * 2,
-            "col_object": ["val"] * 3 + [np.nan] * 3,
-        },
-        index=idx,
-    )
+        {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}
+    ).set_index("key")
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index 1a10255a81a8c..30e00ac8afc6a 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -392,3 +392,31 @@ def test_resample_groupby_agg():
 
     result = resampled.agg({"num": "sum"})
     tm.assert_frame_equal(result, expected)
+
+
+def test_resample_groupby_agg_object_dtype_all_nan():
+    # https://github.com/pandas-dev/pandas/issues/39329
+
+    dates = pd.date_range("2020-01-01", periods=15, freq="D")
+    df1 = DataFrame({"key": "A", "date": dates, "col1": range(15), "col_object": "val"})
+    df2 = DataFrame({"key": "B", "date": dates, "col1": range(15)})
+    df = pd.concat([df1, df2], ignore_index=True)
+
+    result = df.groupby(["key"]).resample("W", on="date").min()
+    idx = pd.MultiIndex.from_arrays(
+        [
+            ["A"] * 3 + ["B"] * 3,
+            pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2),
+        ],
+        names=["key", "date"],
+    )
+    expected = DataFrame(
+        {
+            "key": ["A"] * 3 + ["B"] * 3,
+            "date": pd.to_datetime(["2020-01-01", "2020-01-06", "2020-01-13"] * 2),
+            "col1": [0, 5, 12] * 2,
+            "col_object": ["val"] * 3 + [np.nan] * 3,
+        },
+        index=idx,
+    )
+    tm.assert_frame_equal(result, expected)
From 383bf50b3a0edfbe62b383491314c772cf779c66 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 7 Feb 2021 21:05:21 +0100
Subject: [PATCH 4/9] add whatsnew

---
 doc/source/whatsnew/v1.2.2.rst | 1 +
 pandas/core/groupby/generic.py | 3 +--
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst
index b6e81e44e6f08..ed77cbd411546 100644
--- a/doc/source/whatsnew/v1.2.2.rst
+++ b/doc/source/whatsnew/v1.2.2.rst
@@ -23,6 +23,7 @@ Fixed regressions
 - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
 - Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
 - Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
+- Fixed regression in :meth:`~DataFrame.groupby` or :meth:`~DataFrame.resample` when aggregating an all-NaN object dtype column (:issue:`39329`)
 - Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
 - Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
 -
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 02723bc99dc6d..12703a979b0b8 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1107,8 +1107,7 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike:
             if len(mgr.blocks) != 1:
                 # We've split an object block! Everything we've assumed
                 # about a single block input returning a single block output
-                # is a lie. To keep the code-path for the typical non-split case
-                # clean, we choose to clean up this mess later on.
+                # is a lie. See eg GH-39329
                 return mgr.as_array()
             else:
                 assert len(mgr.blocks) == 1

From a21c4eda75ff018caed09f072404503cd91da7cf Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 7 Feb 2021 21:08:57 +0100
Subject: [PATCH 5/9] add more generic numeric test case

---
 pandas/tests/groupby/aggregate/test_aggregate.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 349267b741956..48527de6b2047 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1189,16 +1189,25 @@ def test_aggregate_datetime_objects():
     tm.assert_series_equal(result, expected)
 
 
-def test_aggregate_object_dtype_all_nan():
+def test_aggregate_numeric_object_dtype():
     # https://github.com/pandas-dev/pandas/issues/39329
     # simplified case: multiple object columns where one is all-NaN
     # -> gets split as the all-NaN is inferred as float
     df = DataFrame(
         {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
-        dtype=object,
-    )
+    ).astype(object)
     result = df.groupby("key").min()
     expected = DataFrame(
         {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]}
     ).set_index("key")
     tm.assert_frame_equal(result, expected)
+
+    # same but with numbers
+    df = DataFrame(
+        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)},
+    ).astype(object)
+    result = df.groupby("key").min()
+    expected = DataFrame(
+        {"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]}
+    ).set_index("key")
+    tm.assert_frame_equal(result, expected)
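The second half of this test motivates the whatsnew wording change in the next patch: an object column holding plain numbers is also re-inferred to a numeric dtype by the aggregation and splits off from its object neighbours, just like the all-NaN case. A short end-to-end illustration, not part of the patch series and assuming a pandas build with this fix applied:

    import pandas as pd

    df = pd.DataFrame(
        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)},
    ).astype(object)

    # col1 stays object while col2 comes back numeric; before this fix the
    # mixed-dtype result hit the same single-block assumption in py_fallback.
    result = df.groupby("key").min()
    print(result.dtypes)
    print(result)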
From 9ae380ad376dfe0612c021c620167b2abbbcfa7e Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 7 Feb 2021 21:10:12 +0100
Subject: [PATCH 6/9] update whatsnew message

---
 doc/source/whatsnew/v1.2.2.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst
index ed77cbd411546..3cabe59744788 100644
--- a/doc/source/whatsnew/v1.2.2.rst
+++ b/doc/source/whatsnew/v1.2.2.rst
@@ -23,7 +23,7 @@ Fixed regressions
 - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
 - Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
 - Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
-- Fixed regression in :meth:`~DataFrame.groupby` or :meth:`~DataFrame.resample` when aggregating an all-NaN object dtype column (:issue:`39329`)
+- Fixed regression in :meth:`~DataFrame.groupby` or :meth:`~DataFrame.resample` when aggregating an all-NaN or numeric object dtype column (:issue:`39329`)
 - Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
 - Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
 -

From 7fb7108cbb04aafa225878428a1ed834eebbe03c Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 7 Feb 2021 21:20:33 +0100
Subject: [PATCH 7/9] other whatsnew fix

---
 doc/source/whatsnew/v1.2.2.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst
index 3cabe59744788..c9cbf33ff2ad9 100644
--- a/doc/source/whatsnew/v1.2.2.rst
+++ b/doc/source/whatsnew/v1.2.2.rst
@@ -24,7 +24,7 @@ Fixed regressions
 - Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
 - Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
 - Fixed regression in :meth:`~DataFrame.groupby` or :meth:`~DataFrame.resample` when aggregating an all-NaN or numeric object dtype column (:issue:`39329`)
-- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
+- Fixed regression in :meth:`.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
 - Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
 -

From baae4a38019d786f444aa72e00f9de3b12cf20d0 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 8 Feb 2021 09:08:18 +0100
Subject: [PATCH 8/9] parametrize with consolidation

---
 pandas/tests/resample/test_resampler_grouper.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index 30e00ac8afc6a..39d4533ca08dc 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -394,13 +394,16 @@ def test_resample_groupby_agg():
     tm.assert_frame_equal(result, expected)
 
 
-def test_resample_groupby_agg_object_dtype_all_nan():
+@pytest.mark.parametrize("consolidate", [True, False])
+def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
     # https://github.com/pandas-dev/pandas/issues/39329
 
     dates = pd.date_range("2020-01-01", periods=15, freq="D")
     df1 = DataFrame({"key": "A", "date": dates, "col1": range(15), "col_object": "val"})
     df2 = DataFrame({"key": "B", "date": dates, "col1": range(15)})
     df = pd.concat([df1, df2], ignore_index=True)
+    if consolidate:
+        df = df._consolidate()
 
     result = df.groupby(["key"]).resample("W", on="date").min()
     idx = pd.MultiIndex.from_arrays(
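A note on the parametrization added above: `pd.concat` can leave the fixture frame with several blocks of the same dtype, while `DataFrame._consolidate()` returns a copy with those blocks merged, so the two parametrizations exercise different internal block layouts. A small illustration, not part of the patch series and assuming pandas 1.2.x internals (`DataFrame._mgr`):

    import pandas as pd

    dates = pd.date_range("2020-01-01", periods=15, freq="D")
    df1 = pd.DataFrame({"key": "A", "date": dates, "col1": range(15), "col_object": "val"})
    df2 = pd.DataFrame({"key": "B", "date": dates, "col1": range(15)})
    df = pd.concat([df1, df2], ignore_index=True)

    # concat does not necessarily merge same-dtype blocks; _consolidate()
    # returns a copy in which they are merged.
    print(len(df._mgr.blocks), len(df._consolidate()._mgr.blocks))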
From 277c0fede334876c96cd81004a536f5054c1815e Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Mon, 8 Feb 2021 09:09:46 +0100
Subject: [PATCH 9/9] remove unnecessary assert

---
 pandas/core/groupby/generic.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 12703a979b0b8..04e9eb039c249 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1100,17 +1100,16 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike:
             # in the operation. We un-split here.
             result = result._consolidate()
             assert isinstance(result, (Series, DataFrame))  # for mypy
-
-            # unwrap DataFrame to get array
             mgr = result._mgr
             assert isinstance(mgr, BlockManager)
+
+            # unwrap DataFrame to get array
             if len(mgr.blocks) != 1:
                 # We've split an object block! Everything we've assumed
                 # about a single block input returning a single block output
                 # is a lie. See eg GH-39329
                 return mgr.as_array()
             else:
-                assert len(mgr.blocks) == 1
                 result = mgr.blocks[0].values
                 return result