merge with upstream

MarcoGorelli · MarcoGorelli · commit d7f64de1c53d · 2019-09-05T21:38:01.000+01:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -178,6 +178,7 @@ Groupby/resample/rolling
 
 -
 - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue:`28192`)
+- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`)
 - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
 - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`)
 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1147,6 +1147,9 @@ def nunique(self, dropna=True):
 
         val = self.obj._internal_get_values()
 
+        # GH 27951: replace every missing value with NaT before sorting
+        val[isna(val)] = np.datetime64("NaT")
+
         try:
             sorter = np.lexsort((val, ids))
         except TypeError:  # catches object dtypes
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -1,4 +1,5 @@
 import builtins
+import datetime as dt
 from io import StringIO
 from itertools import product
 from string import ascii_lowercase
@@ -9,7 +10,16 @@
 from pandas.errors import UnsupportedFunctionCall
 
 import pandas as pd
-from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    NaT,
+    Series,
+    Timestamp,
+    date_range,
+    isna,
+)
 import pandas.core.nanops as nanops
 from pandas.util import _test_decorators as td, testing as tm
 
@@ -1015,6 +1025,81 @@ def test_nunique_with_timegrouper():
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "data, dropna, expected",
+    [
+        (
+            DataFrame(
+                {
+                    "key": ["x", "x", "x", "x", "x"],
+                    "data": [
+                        Timestamp("2019-01-01 00:00:00"),
+                        NaT,
+                        Timestamp("2019-01-01 00:00:00"),
+                        NaT,
+                        Timestamp("2019-01-01 00:00:00"),
+                    ],
+                }
+            ),
+            True,
+            Series([1], index=pd.Index(["x"], name="key"), name="data"),
+        ),
+        (
+            DataFrame(
+                {
+                    "key": ["x", "x", "x", "x", "x"],
+                    "data": [
+                        dt.date(2019, 1, 1),
+                        NaT,
+                        dt.date(2019, 1, 1),
+                        NaT,
+                        dt.date(2019, 1, 1),
+                    ],
+                }
+            ),
+            True,
+            Series([1], index=pd.Index(["x"], name="key"), name="data"),
+        ),
+        (
+            DataFrame(
+                {
+                    "key": ["x", "x", "x", "y", "y"],
+                    "data": [
+                        dt.date(2019, 1, 1),
+                        NaT,
+                        dt.date(2019, 1, 1),
+                        NaT,
+                        dt.date(2019, 1, 1),
+                    ],
+                }
+            ),
+            False,
+            Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"),
+        ),
+        (
+            DataFrame(
+                {
+                    "key": ["x", "x", "x", "x", "y"],
+                    "data": [
+                        dt.date(2019, 1, 1),
+                        NaT,
+                        dt.date(2019, 1, 1),
+                        NaT,
+                        dt.date(2019, 1, 1),
+                    ],
+                }
+            ),
+            False,
+            Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"),
+        ),
+    ],
+)
+def test_nunique_with_NaT(data, dropna, expected):
+    # GH 27951
+    result = data.groupby(["key"])["data"].nunique(dropna=dropna)
+    tm.assert_series_equal(result, expected)
+
+
 def test_nunique_preserves_column_level_names():
     # GH 23222
     test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0"))

Original file line number	Diff line number	Diff line change
`@@ -178,6 +178,7 @@ Groupby/resample/rolling`
`178`	`178`
`179`	`179`	`-`
`180`	`180`	- Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue:`28192`)
	`181`	+- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`)
`181`	`182`	- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
`182`	`183`	- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`)
`183`	`184`