Skip to content

Commit 192d681

Browse files
MarcoGorelli authored and proost committed
BUG: Remove null values before sorting during groupby nunique calculation (pandas-dev#27951)
Closes pandas-dev#27904
1 parent d3b1c7e commit 192d681

File tree

3 files changed

+52
-2
lines changed

3 files changed

+52
-2
lines changed

doc/source/whatsnew/v1.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ Datetimelike
9797
- Bug in :meth:`Series.__setitem__` incorrectly casting ``np.timedelta64("NaT")`` to ``np.datetime64("NaT")`` when inserting into a :class:`Series` with datetime64 dtype (:issue:`27311`)
9898
- Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`)
9999
- Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`)
100-
-
100+
- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`)
101101

102102

103103
Timedelta

pandas/core/groupby/generic.py

+4
Original file line numberDiff line numberDiff line change
@@ -1147,6 +1147,10 @@ def nunique(self, dropna=True):
11471147

11481148
val = self.obj._internal_get_values()
11491149

1150+
# GH 27951
1151+
# temporary fix while we wait for NumPy bug 12629 to be fixed
1152+
val[isna(val)] = np.datetime64("NaT")
1153+
11501154
try:
11511155
sorter = np.lexsort((val, ids))
11521156
except TypeError: # catches object dtypes

pandas/tests/groupby/test_function.py

+47-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import builtins
2+
import datetime as dt
23
from io import StringIO
34
from itertools import product
45
from string import ascii_lowercase
@@ -9,7 +10,16 @@
910
from pandas.errors import UnsupportedFunctionCall
1011

1112
import pandas as pd
12-
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna
13+
from pandas import (
14+
DataFrame,
15+
Index,
16+
MultiIndex,
17+
NaT,
18+
Series,
19+
Timestamp,
20+
date_range,
21+
isna,
22+
)
1323
import pandas.core.nanops as nanops
1424
from pandas.util import _test_decorators as td, testing as tm
1525

@@ -1015,6 +1025,42 @@ def test_nunique_with_timegrouper():
10151025
tm.assert_series_equal(result, expected)
10161026

10171027

1028+
@pytest.mark.parametrize(
1029+
"key, data, dropna, expected",
1030+
[
1031+
(
1032+
["x", "x", "x"],
1033+
[Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")],
1034+
True,
1035+
Series([1], index=pd.Index(["x"], name="key"), name="data"),
1036+
),
1037+
(
1038+
["x", "x", "x"],
1039+
[dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
1040+
True,
1041+
Series([1], index=pd.Index(["x"], name="key"), name="data"),
1042+
),
1043+
(
1044+
["x", "x", "x", "y", "y"],
1045+
[dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
1046+
False,
1047+
Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"),
1048+
),
1049+
(
1050+
["x", "x", "x", "x", "y"],
1051+
[dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
1052+
False,
1053+
Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"),
1054+
),
1055+
],
1056+
)
1057+
def test_nunique_with_NaT(key, data, dropna, expected):
1058+
# GH 27951
1059+
df = pd.DataFrame({"key": key, "data": data})
1060+
result = df.groupby(["key"])["data"].nunique(dropna=dropna)
1061+
tm.assert_series_equal(result, expected)
1062+
1063+
10181064
def test_nunique_preserves_column_level_names():
10191065
# GH 23222
10201066
test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0"))

0 commit comments

Comments
 (0)