diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
index c749c4540013b..abb8ef8d2970c 100644
--- a/doc/source/whatsnew/v0.18.0.txt
+++ b/doc/source/whatsnew/v0.18.0.txt
@@ -1035,7 +1035,7 @@ Bug Fixes
 - Bug in ``pd.concat`` while concatenating tz-aware NaT series. (:issue:`11693`, :issue:`11755`, :issue:`12217`)
 - Bug in ``pd.read_stata`` with version <= 108 files (:issue:`12232`)
 - Bug in ``Series.resample`` using a frequency of ``Nano`` when the index is a ``DatetimeIndex`` and contains non-zero nanosecond parts (:issue:`12037`)
-
+- Bug in resampling with ``.nunique`` and a sparse index (:issue:`12352`)
 
 - Bug in ``NaT`` subtraction from ``Timestamp`` or ``DatetimeIndex`` with timezones (:issue:`11718`)
 - Bug in subtraction of ``Series`` of a single tz-aware ``Timestamp`` (:issue:`12290`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 963c6223730f3..c081671c6d8db 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -2817,8 +2817,16 @@ def nunique(self, dropna=True):
         inc[idx] = 1
 
         out = np.add.reduceat(inc, idx).astype('int64', copy=False)
-        return Series(out if ids[0] != -1 else out[1:],
-                      index=self.grouper.result_index,
+        res = out if ids[0] != -1 else out[1:]
+        ri = self.grouper.result_index
+
+        # we might have duplications among the bins
+        if len(res) != len(ri):
+            res, out = np.zeros(len(ri), dtype=out.dtype), res
+            res[ids] = out
+
+        return Series(res,
+                      index=ri,
                       name=self.name)
 
     @deprecate_kwarg('take_last', 'keep',
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index 68999ac143ea8..b0e315ead2acb 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -1526,6 +1526,34 @@ def test_resample_timegrouper(self):
         result = df.groupby(pd.Grouper(freq='M', key='A')).count()
         assert_frame_equal(result, expected)
 
+    def test_resample_nunique(self):
+
+        # GH 12352
+        df = DataFrame({
+            'ID': {pd.Timestamp('2015-06-05 00:00:00'): '0010100903',
+                   pd.Timestamp('2015-06-08 00:00:00'): '0010150847'},
+            'DATE': {pd.Timestamp('2015-06-05 00:00:00'): '2015-06-05',
+                     pd.Timestamp('2015-06-08 00:00:00'): '2015-06-08'}})
+        r = df.resample('D')
+        g = df.groupby(pd.Grouper(freq='D'))
+        expected = df.groupby(pd.TimeGrouper('D')).ID.apply(lambda x:
+                                                            x.nunique())
+        self.assertEqual(expected.name, 'ID')
+
+        for t in [r, g]:
+            result = t.ID.nunique()
+            assert_series_equal(result, expected)
+
+        # TODO
+        # this should have name
+        # https://github.com/pydata/pandas/issues/12363
+        expected.name = None
+        result = df.ID.resample('D').nunique()
+        assert_series_equal(result, expected)
+
+        result = df.ID.groupby(pd.Grouper(freq='D')).nunique()
+        assert_series_equal(result, expected)
+
     def test_resample_group_info(self):  # GH10914
         for n, k in product((10000, 100000), (10, 100, 1000)):
             dr = date_range(start='2015-08-27', periods=n // 10, freq='T')
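
Below is a minimal usage sketch of the behaviour this patch addresses (GH 12352): calling ``.nunique()`` after resampling over a sparse ``DatetimeIndex``, where the daily bins include days with no observations. The frame mirrors the test above; the values shown in the comments are the result the fixed code is expected to produce (zero counts for empty bins), not output captured from a particular pandas build.

import pandas as pd

# Two observations three days apart: resampling by day yields four bins,
# two of which are empty, so there are more bins than observed groups.
df = pd.DataFrame({'ID': ['0010100903', '0010150847']},
                  index=[pd.Timestamp('2015-06-05'),
                         pd.Timestamp('2015-06-08')])

# Previously the nunique counts only covered the observed groups and did not
# line up with the full result_index of daily bins; with this patch they are
# scattered onto all bins, leaving 0 where a day has no rows:
#   2015-06-05    1
#   2015-06-06    0
#   2015-06-07    0
#   2015-06-08    1
print(df.resample('D').ID.nunique())

# The groupby spelling exercised by the test goes through the same code path.
print(df.groupby(pd.Grouper(freq='D')).ID.nunique())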