From 691b3841144e10dc8570b801ecc03bfb72150c16 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Fri, 22 Sep 2017 21:17:30 +0900 Subject: [PATCH 1/5] BUG: Fix groupby nunique with NaT --- pandas/core/groupby.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f14ed08a27fae..5b4456589c4ea 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3177,7 +3177,11 @@ def nunique(self, dropna=True): out = np.add.reduceat(inc, idx).astype('int64', copy=False) if len(ids): - res = out if ids[0] != -1 else out[1:] + if ids[0] == -1: + res = out[1:] + idx = idx[np.flatnonzero(idx)] + else: + res = out else: res = out[1:] ri = self.grouper.result_index From 800d1ad38931dbaa4e1a87479f75a71555c0bc02 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Fri, 22 Sep 2017 21:19:08 +0900 Subject: [PATCH 2/5] TST: Add the nunique with NaT test --- pandas/tests/groupby/test_groupby.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8957beacab376..0364c67f46808 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3688,6 +3688,18 @@ def test_nunique_with_timegrouper(self): )['data'].apply(pd.Series.nunique) tm.assert_series_equal(result, expected) + def test_nunique_with_timegrouper_and_nat(self): + test = pd.DataFrame({ + 'time': [Timestamp('2016-06-28 09:35:35'), + pd.NaT, + Timestamp('2016-06-28 16:46:28')], + 'data': ['1', '2', '3']}) + + grouper = pd.TimeGrouper(key='time', freq='h') + result = test.groupby(grouper)['data'].nunique() + expected = test[test.time.notnull()].groupby(grouper)['data'].nunique() + tm.assert_series_equal(result, expected) + def test_numpy_compat(self): # see gh-12811 df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) From 5e0d924ce3bb88984fe0990237618c7f4f5a2515 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 23 Sep 2017 01:18:12 +0900 
Subject: [PATCH 3/5] TST: Move the groupby nunique with NaT test to test_timegrouper and add the GitHub Issue number comment --- pandas/tests/groupby/test_groupby.py | 12 ------------ pandas/tests/groupby/test_timegrouper.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0364c67f46808..8957beacab376 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3688,18 +3688,6 @@ def test_nunique_with_timegrouper(self): )['data'].apply(pd.Series.nunique) tm.assert_series_equal(result, expected) - def test_nunique_with_timegrouper_and_nat(self): - test = pd.DataFrame({ - 'time': [Timestamp('2016-06-28 09:35:35'), - pd.NaT, - Timestamp('2016-06-28 16:46:28')], - 'data': ['1', '2', '3']}) - - grouper = pd.TimeGrouper(key='time', freq='h') - result = test.groupby(grouper)['data'].nunique() - expected = test[test.time.notnull()].groupby(grouper)['data'].nunique() - tm.assert_series_equal(result, expected) - def test_numpy_compat(self): # see gh-12811 df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index df0a93d783375..f83a3fcd0668d 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -608,3 +608,16 @@ def test_first_last_max_min_on_time_data(self): assert_frame_equal(grouped_ref.min(), grouped_test.min()) assert_frame_equal(grouped_ref.first(), grouped_test.first()) assert_frame_equal(grouped_ref.last(), grouped_test.last()) + + def test_nunique_with_timegrouper_and_nat(self): + # GH 17575 + test = pd.DataFrame({ + 'time': [Timestamp('2016-06-28 09:35:35'), + pd.NaT, + Timestamp('2016-06-28 16:46:28')], + 'data': ['1', '2', '3']}) + + grouper = pd.TimeGrouper(key='time', freq='h') + result = test.groupby(grouper)['data'].nunique() + expected = 
test[test.time.notnull()].groupby(grouper)['data'].nunique() + tm.assert_series_equal(result, expected) From fabc3714af43029b99087f3d20614961f97de0eb Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 23 Sep 2017 01:50:51 +0900 Subject: [PATCH 4/5] DOC: Add the description for the fix groupby nunique with NaT in whatsnew note --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 1cd65bb530f73..e4a3bd796e3ba 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -543,6 +543,7 @@ Groupby/Resample/Rolling - Bug in ``Series.resample(...).apply()`` where an empty ``Series`` modified the source index and did not return the name of a ``Series`` (:issue:`14313`) - Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1` (:issue:`15305`) - Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`) +- Bug in ``groupby.nunique()`` with ``TimeGrouper`` where ``NaT`` values were not handled correctly (:issue:`17575`) Sparse ^^^^^^ From 29d4cf6874435cf9771633214e43188c10cf6552 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sat, 23 Sep 2017 02:13:01 +0900 Subject: [PATCH 5/5] DOC: Add the comment for the fix groupby nunique with NaT --- pandas/core/groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5b4456589c4ea..a62ae40a85941 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3177,6 +3177,8 @@ def nunique(self, dropna=True): out = np.add.reduceat(inc, idx).astype('int64', copy=False) if len(ids): + # NaN/NaT group exists if the head of ids is -1, + # so remove it from res and exclude its index from idx if ids[0] == -1: res = out[1:] idx = idx[np.flatnonzero(idx)]