diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt index 97e6ee51511bc..8a3ab4c83ef4b 100644 --- a/doc/source/v0.14.1.txt +++ b/doc/source/v0.14.1.txt @@ -260,6 +260,7 @@ Bug Fixes ``Index`` (:issue:`7464`). - Bug in ``DataFrame.reset_index`` loses ``tz`` (:issue:`3950`) - Bug in ``DatetimeIndex.freqstr`` raises ``AttributeError`` when ``freq`` is ``None`` (:issue:`7606`) +- Bug in ``GroupBy.size`` created by ``TimeGrouper`` raises ``AttributeError`` (:issue:`7453`) - Bug in non-monotonic ``Index.union`` may preserve ``name`` incorrectly (:issue:`7458`) - Bug in ``DatetimeIndex.intersection`` doesn't preserve timezone (:issue:`4690`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c7611d9829308..7e32fc75be8fb 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1643,8 +1643,9 @@ def indices(self): i = 0 for label, bin in zip(self.binlabels, self.bins): - if label is not tslib.NaT and i < bin: - indices[label] = list(range(i, bin)) + if i < bin: + if label is not tslib.NaT: + indices[label] = list(range(i, bin)) i = bin return indices @@ -1665,6 +1666,22 @@ def levels(self): def names(self): return [self.binlabels.name] + def size(self): + """ + Compute group sizes + + """ + base = Series(np.zeros(len(self.result_index), dtype=np.int64), + index=self.result_index) + indices = self.indices + for k, v in compat.iteritems(indices): + indices[k] = len(v) + bin_counts = Series(indices, dtype=np.int64) + result = base.add(bin_counts, fill_value=0) + # addition with fill_value changes dtype to float64 + result = result.astype(np.int64) + return result + #---------------------------------------------------------------------- # cython aggregation diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 7c73933d9b001..ff8b6945a23be 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -1326,6 +1326,13 @@ def test_aggregate_normal(self): dt_result = 
getattr(dt_grouped, func)() assert_frame_equal(expected, dt_result) + # GH 7453 + for func in ['size']: + expected = getattr(normal_grouped, func)() + expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') + dt_result = getattr(dt_grouped, func)() + assert_series_equal(expected, dt_result) + """ for func in ['first', 'last']: expected = getattr(normal_grouped, func)() @@ -1339,7 +1346,7 @@ def test_aggregate_normal(self): dt_result = getattr(dt_grouped, func)(3) assert_frame_equal(expected, dt_result) """ - # if TimeGrouper is used included, 'size' 'first','last' and 'nth' doesn't work yet + # if TimeGrouper is used, 'first', 'last' and 'nth' don't work yet def test_aggregate_with_nat(self): # check TimeGrouper's aggregation is identical as normal groupby @@ -1375,7 +1382,16 @@ def test_aggregate_with_nat(self): dt_result = getattr(dt_grouped, func)() assert_frame_equal(expected, dt_result) - # if NaT is included, 'var', 'std', 'mean', 'size', 'first','last' and 'nth' doesn't work yet + for func in ['size']: + normal_result = getattr(normal_grouped, func)() + pad = Series([0], index=[3]) + expected = normal_result.append(pad) + expected = expected.sort_index() + expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key') + dt_result = getattr(dt_grouped, func)() + assert_series_equal(expected, dt_result) + + # if NaT is included, 'var', 'std', 'mean', 'first', 'last' and 'nth' don't work yet if __name__ == '__main__': diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index 6a444d0a09af7..d1bea79a639cc 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -108,16 +108,26 @@ def f(): # size() speed setup = common_setup + """ -df = DataFrame({'key1': np.random.randint(0, 500, size=100000), - 'key2': np.random.randint(0, 100, size=100000), - 'value1' : np.random.randn(100000), - 'value2' : np.random.randn(100000), - 'value3' : np.random.randn(100000)}) +n = 100000 +offsets = np.random.randint(n, 
size=n).astype('timedelta64[ns]') +dates = np.datetime64('now') + offsets +df = DataFrame({'key1': np.random.randint(0, 500, size=n), + 'key2': np.random.randint(0, 100, size=n), + 'value1' : np.random.randn(n), + 'value2' : np.random.randn(n), + 'value3' : np.random.randn(n), + 'dates' : dates}) """ groupby_multi_size = Benchmark("df.groupby(['key1', 'key2']).size()", setup, start_date=datetime(2011, 10, 1)) +groupby_dt_size = Benchmark("df.groupby(['dates']).size()", + setup, start_date=datetime(2011, 10, 1)) + +groupby_dt_timegrouper_size = Benchmark("df.groupby(TimeGrouper(key='dates', freq='M')).size()", + setup, start_date=datetime(2011, 10, 1)) + #---------------------------------------------------------------------- # count() speed