From 08cf43f390723d5cdf24d5dd853af4f9df77c105 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Date: Sun, 6 Dec 2020 13:46:40 -0500
Subject: [PATCH] Backport PR #38330: REGR: Groupby first/last/nth treats None as an observation

---
 doc/source/whatsnew/v1.1.5.rst   |  1 +
 pandas/_libs/groupby.pyx         | 12 ++++--------
 pandas/tests/groupby/test_nth.py | 20 ++++++++++++++++++++
 3 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst
index fbb12cb38448a..7164830392f35 100644
--- a/doc/source/whatsnew/v1.1.5.rst
+++ b/doc/source/whatsnew/v1.1.5.rst
@@ -27,6 +27,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`).
 - Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`)
 - Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`)
+- Fixed regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` where ``None`` was considered a non-NA value (:issue:`38286`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index a83634aad3ce2..5215bde281652 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -926,9 +926,7 @@ def group_last(rank_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # None should not be treated like other NA-like
-                # so that it won't be converted to nan
-                if not checknull(val) or val is None:
+                if not checknull(val):
                     # NB: use _treat_as_na here once
                     # conditional-nogil is available.
                     nobs[lab, j] += 1
@@ -937,7 +935,7 @@ def group_last(rank_t[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = NAN
+                    out[i, j] = None
                 else:
                     out[i, j] = resx[i, j]
     else:
@@ -1021,9 +1019,7 @@ def group_nth(rank_t[:, :] out,
             for j in range(K):
                 val = values[i, j]
 
-                # None should not be treated like other NA-like
-                # so that it won't be converted to nan
-                if not checknull(val) or val is None:
+                if not checknull(val):
                     # NB: use _treat_as_na here once
                     # conditional-nogil is available.
                     nobs[lab, j] += 1
@@ -1033,7 +1029,7 @@ def group_nth(rank_t[:, :] out,
         for i in range(ncounts):
             for j in range(K):
                 if nobs[i, j] == 0:
-                    out[i, j] = NAN
+                    out[i, j] = None
                 else:
                     out[i, j] = resx[i, j]
 
diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py
index 0cbfbad85a8b6..559b5116e4240 100644
--- a/pandas/tests/groupby/test_nth.py
+++ b/pandas/tests/groupby/test_nth.py
@@ -105,6 +105,26 @@ def test_first_last_with_None(method):
     tm.assert_frame_equal(result, df)
 
 
+@pytest.mark.parametrize("method", ["first", "last"])
+@pytest.mark.parametrize(
+    "df, expected",
+    [
+        (
+            DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
+            DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
+        ),
+        (
+            DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
+            DataFrame({"value": [None]}, index=Index(["a"], name="id")),
+        ),
+    ],
+)
+def test_first_last_with_None_expanded(method, df, expected):
+    # GH 32800, 38286
+    result = getattr(df.groupby("id"), method)()
+    tm.assert_frame_equal(result, expected)
+
+
 def test_first_last_nth_dtypes(df_mixed_floats):
 
     df = df_mixed_floats.copy()