From 9b09add82efed76c33891e15f43bc9a0b467ed4b Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Thu, 5 Sep 2019 22:14:33 +0100 Subject: [PATCH 1/3] Temporary fix --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/groupby/generic.py | 4 +++ pandas/tests/groupby/test_function.py | 48 ++++++++++++++++++++++++++- 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 58892b316c940..2f72de25c579b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -97,7 +97,7 @@ Datetimelike - Bug in :meth:`Series.__setitem__` incorrectly casting ``np.timedelta64("NaT")`` to ``np.datetime64("NaT")`` when inserting into a :class:`Series` with datetime64 dtype (:issue:`27311`) - Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`) - Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`) -- +- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) Timedelta diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c0436e9389078..e514162f84c37 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1147,6 +1147,10 @@ def nunique(self, dropna=True): val = self.obj._internal_get_values() + # GH 27951 + # temporary fix while we wait for NumPy bug 12629 to be fixed + val[isna(val)] = np.datetime64("NaT") + try: sorter = np.lexsort((val, ids)) except TypeError: # catches object dtypes diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index d89233f2fd603..b46f9c3b824a5 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -2,6 +2,7 @@ from io import StringIO from itertools import product from string import ascii_lowercase +import datetime as dt import numpy as np import 
pytest @@ -9,7 +10,16 @@ from pandas.errors import UnsupportedFunctionCall import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + isna, + NaT, +) import pandas.core.nanops as nanops from pandas.util import _test_decorators as td, testing as tm @@ -1015,6 +1025,42 @@ def test_nunique_with_timegrouper(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "key, data, dropna, expected", + [ + ( + ["x", "x", "x"], + [Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "y", "y"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + False, + Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "x", "y"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + False, + Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ], +) +def test_nunique_with_NaT(key, data, dropna, expected): + # GH 27951 + df = pd.DataFrame({"key": key, "data": data}) + result = df.groupby(["key"])["data"].nunique(dropna=dropna) + tm.assert_series_equal(result, expected) + + def test_nunique_preserves_column_level_names(): # GH 23222 test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) From ac9aa7e8ae3877dad47e4f8ae4aa3fc0eca7e282 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Thu, 5 Sep 2019 22:51:52 +0100 Subject: [PATCH 2/3] Correct order of imports --- pandas/tests/groupby/test_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 
b46f9c3b824a5..cebf41a6bbb6e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1,8 +1,8 @@ import builtins +import datetime as dt from io import StringIO from itertools import product from string import ascii_lowercase -import datetime as dt import numpy as np import pytest From 20ec544519c2a45d8b658c7bf089bcf745c91e0a Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 6 Sep 2019 16:51:19 +0100 Subject: [PATCH 3/3] Correct order of imports --- pandas/tests/groupby/test_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index cebf41a6bbb6e..afb22a732691c 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -14,11 +14,11 @@ DataFrame, Index, MultiIndex, + NaT, Series, Timestamp, date_range, isna, - NaT, ) import pandas.core.nanops as nanops from pandas.util import _test_decorators as td, testing as tm