From 627410d99aa0fd31b9bc2cddbff278fcd891789f Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 19 Feb 2020 19:28:21 -0600 Subject: [PATCH 1/6] Add test --- pandas/tests/groupby/test_nth.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 0f850f2e94581..0b13d2c298f1d 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -530,3 +530,20 @@ def test_nth_nan_in_grouper(dropna): ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("method", ["first", "last"]) +def test_first_last_with_na_object(method): + # https://github.com/pandas-dev/pandas/issues/32123 + groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, pd.NA]}).groupby("a") + result = getattr(groups, method)() + + if method == "first": + values = {"b": [1, 3]} + else: + values = {"b": [2, 3]} + + idx = pd.Index([1, 2], name="a") + expected = pd.DataFrame(values, index=idx) + + tm.assert_frame_equal(result, expected) From bb4ad11efd2d5d5e0f781b5e64ef6eeea55b49fe Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 19 Feb 2020 19:29:16 -0600 Subject: [PATCH 2/6] Check for NA --- pandas/_libs/groupby.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index edc44f1c94589..2dfd6b32b636a 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -22,6 +22,8 @@ from pandas._libs.algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, from pandas._libs.algos import (take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers) +from pandas._libs.missing import NA + cdef int64_t NPY_NAT = get_nat() _int64_max = np.iinfo(np.int64).max @@ -887,7 +889,7 @@ def group_last(rank_t[:, :] out, for j in range(K): val = values[i, j] - if val == val: + if (val is not NA) and (val == val): # NB: use _treat_as_na here once # conditional-nogil is available. nobs[lab, j] += 1 @@ -976,7 +978,7 @@ def group_nth(rank_t[:, :] out, for j in range(K): val = values[i, j] - if val == val: + if (val is not NA) and (val == val): # NB: use _treat_as_na here once # conditional-nogil is available. nobs[lab, j] += 1 From 32137189ec4874d710bcba9426984d706a30e178 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 19 Feb 2020 19:29:41 -0600 Subject: [PATCH 3/6] Update whatsnew --- doc/source/whatsnew/v1.0.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index c9031ac1ae9fe..f09107ea3a08c 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -45,6 +45,7 @@ Bug fixes - Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`). - Fixed bug in setting values using a slice indexer with string dtype (:issue:`31772`) +- Fixed bug where :meth:`GroupBy.first` and :meth:`GroupBy.last` would raise a ``TypeError`` when groups contained ``pd.NA`` in a column of object dtype (:issue:`32123`) .. --------------------------------------------------------------------------- From 58bbb57fc2e8f7bc731edef9bd5857f6705dbd4e Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 19 Feb 2020 20:45:10 -0600 Subject: [PATCH 4/6] Use nulls_fixture --- pandas/tests/groupby/test_nth.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 0b13d2c298f1d..db5401b7e947f 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -533,17 +533,20 @@ def test_nth_nan_in_grouper(dropna): @pytest.mark.parametrize("method", ["first", "last"]) -def test_first_last_with_na_object(method): +def test_first_last_with_na_object(method, nulls_fixture): # https://github.com/pandas-dev/pandas/issues/32123 - groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, pd.NA]}).groupby("a") + groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby( + "a" + ) result = getattr(groups, method)() if method == "first": - values = {"b": [1, 3]} + values = [1, 3] else: - values = {"b": [2, 3]} + values = [2, 3] + values = np.array(values, dtype=result["b"].dtype) idx = pd.Index([1, 2], name="a") - expected = pd.DataFrame(values, index=idx) + expected = pd.DataFrame({"b": values}, index=idx) tm.assert_frame_equal(result, expected) From 4f2d126d320cd48aaed80535d8f89c161b200b9e Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 19 Feb 2020 20:45:26 -0600 Subject: [PATCH 5/6] Use checknull --- pandas/_libs/groupby.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 2dfd6b32b636a..27b3095d8cb4f 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -22,7 +22,7 @@ from pandas._libs.algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, from pandas._libs.algos import (take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers) -from pandas._libs.missing import NA +from pandas._libs.missing cimport checknull cdef int64_t NPY_NAT = get_nat() _int64_max = np.iinfo(np.int64).max @@ -889,7 +889,7 @@ def group_last(rank_t[:, :] out, for j in range(K): val = values[i, j] - if (val is not NA) and (val == val): + if not checknull(val): # NB: use _treat_as_na here once # conditional-nogil is available. nobs[lab, j] += 1 @@ -978,7 +978,7 @@ def group_nth(rank_t[:, :] out, for j in range(K): val = values[i, j] - if (val is not NA) and (val == val): + if not checknull(val): # NB: use _treat_as_na here once # conditional-nogil is available. nobs[lab, j] += 1 From 87456af557c9eff70c931e4f3b628193cb8bc277 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 22 Feb 2020 21:52:06 -0600 Subject: [PATCH 6/6] Move and add test --- pandas/tests/groupby/test_nth.py | 60 +++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index db5401b7e947f..b1476f1059d84 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -54,6 +54,46 @@ def test_first_last_nth(df): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("method", ["first", "last"]) +def test_first_last_with_na_object(method, nulls_fixture): + # https://github.com/pandas-dev/pandas/issues/32123 + groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby( + "a" + ) + result = getattr(groups, method)() + + if method == "first": + values = [1, 3] + else: + values = [2, 3] + + values = np.array(values, dtype=result["b"].dtype) + idx = pd.Index([1, 2], name="a") + expected = pd.DataFrame({"b": values}, index=idx) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index", [0, -1]) +def test_nth_with_na_object(index, nulls_fixture): + # https://github.com/pandas-dev/pandas/issues/32123 + groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby( + "a" + ) + result = groups.nth(index) + + if index == 0: + values = [1, 3] + else: + values = [2, nulls_fixture] + + values = np.array(values, dtype=result["b"].dtype) + idx = pd.Index([1, 2], name="a") + expected = pd.DataFrame({"b": values}, index=idx) + + tm.assert_frame_equal(result, expected) + + def test_first_last_nth_dtypes(df_mixed_floats): df = df_mixed_floats.copy() @@ -530,23 +570,3 @@ def test_nth_nan_in_grouper(dropna): ) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("method", ["first", "last"]) -def test_first_last_with_na_object(method, nulls_fixture): - # https://github.com/pandas-dev/pandas/issues/32123 - groups = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby( - "a" - ) - result = getattr(groups, method)() - - if method == "first": - values = [1, 3] - else: - values = [2, 3] - - values = np.array(values, dtype=result["b"].dtype) - idx = pd.Index([1, 2], name="a") - expected = pd.DataFrame({"b": values}, index=idx) - - tm.assert_frame_equal(result, expected)