From 4cd824d31ea438149d162f001c221fefb6c67604 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 27 Oct 2020 13:38:13 +0100 Subject: [PATCH 1/2] Backport PR #37433: REGR: fix groupby std() with nullable dtypes --- doc/source/whatsnew/v1.1.4.rst | 1 + pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/aggregate/test_cython.py | 35 +++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index c0aa1afc34c8f..a717e46692a19 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :meth:`Series.astype` converting ``None`` to ``"nan"`` when casting to string (:issue:`36904`) - Fixed regression in :class:`RollingGroupby` causing a segmentation fault with Index of dtype object (:issue:`36727`) - Fixed regression in :meth:`DataFrame.resample(...).apply(...)` raised ``AttributeError`` when input was a :class:`DataFrame` and only a :class:`Series` was evaluated (:issue:`36951`) +- Fixed regression in ``DataFrame.groupby(..).std()`` with nullable integer dtype (:issue:`37415`) - Fixed regression in :class:`PeriodDtype` comparing both equal and unequal to its string representation (:issue:`37265`) - Fixed regression where slicing :class:`DatetimeIndex` raised :exc:`AssertionError` on irregular time series with ``pd.NaT`` or on unsorted indices (:issue:`36953` and :issue:`35509`) - Fixed regression in certain offsets (:meth:`pd.offsets.Day() <pandas.tseries.offsets.Day>` and below) no longer being hashable (:issue:`37267`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b3ec9cf71786a..9415ee1b7e969 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2489,9 +2489,9 @@ def _get_cythonized_result( except TypeError as e: error_msg = str(e) continue + vals = vals.astype(cython_dtype, copy=False) if needs_2d: vals = vals.reshape((-1, 1)) - vals = 
vals.astype(cython_dtype, copy=False) func = partial(func, vals) func = partial(func, labels) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 87ebd8b5a27fb..02dbf00ee6421 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -277,3 +277,38 @@ def test_read_only_buffer_source_agg(agg): expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "op_name", + [ + "count", + "sum", + "std", + "var", + "sem", + "mean", + "median", + "prod", + "min", + "max", + ], +) +def test_cython_agg_nullable_int(op_name): + # ensure that the cython-based aggregations don't fail for nullable dtype + # (eg https://github.com/pandas-dev/pandas/issues/37415) + df = DataFrame( + { + "A": ["A", "B"] * 5, + "B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"), + } + ) + result = getattr(df.groupby("A")["B"], op_name)() + df2 = df.assign(B=df["B"].astype("float64")) + expected = getattr(df2.groupby("A")["B"], op_name)() + + if op_name != "count": + # the result is not yet consistently using Int64/Float64 dtype, + # so for now just checking the values by casting to float + result = result.astype("float64") + tm.assert_series_equal(result, expected) From bd0b4dda9c4d6c8265e751173bada728df677848 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 27 Oct 2020 13:42:54 +0000 Subject: [PATCH 2/2] reformat with black 19.10b0 --- pandas/tests/groupby/aggregate/test_cython.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 02dbf00ee6421..7bacb62ce62f4 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -281,18 +281,7 @@ def test_read_only_buffer_source_agg(agg): @pytest.mark.parametrize( 
"op_name", - [ - "count", - "sum", - "std", - "var", - "sem", - "mean", - "median", - "prod", - "min", - "max", - ], + ["count", "sum", "std", "var", "sem", "mean", "median", "prod", "min", "max"], ) def test_cython_agg_nullable_int(op_name): # ensure that the cython-based aggregations don't fail for nullable dtype