BUG: GroupBy.size created by TimeGrouper raises AttributeError

sinhrks · sinhrks · commit eb2cd0459a34 · 2014-06-30T19:44:19.000+09:00
diff --git a/doc/source/v0.14.1.txt b/doc/source/v0.14.1.txt
@@ -260,6 +260,7 @@ Bug Fixes
   ``Index`` (:issue:`7464`).
 - Bug in ``DataFrame.reset_index`` loses ``tz`` (:issue:`3950`)
 - Bug in ``DatetimeIndex.freqstr`` raises ``AttributeError`` when ``freq`` is ``None`` (:issue:`7606`)
+- Bug in ``GroupBy.size`` raises ``AttributeError`` when the groupby is created by ``TimeGrouper`` (:issue:`7453`)
 
 - Bug in non-monotonic ``Index.union`` may preserve ``name`` incorrectly (:issue:`7458`)
 - Bug in ``DatetimeIndex.intersection`` doesn't preserve timezone (:issue:`4690`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1643,8 +1643,9 @@ def indices(self):
 
         i = 0
         for label, bin in zip(self.binlabels, self.bins):
-            if label is not tslib.NaT and i < bin:
-                indices[label] = list(range(i, bin))
+            if i < bin:
+                if label is not tslib.NaT:
+                    indices[label] = list(range(i, bin))
                 i = bin
         return indices
 
@@ -1665,6 +1666,22 @@ def levels(self):
     def names(self):
         return [self.binlabels.name]
 
+    def size(self):
+        """
+        Compute group sizes; bins with no rows are reported as 0.
+        """
+        base = Series(np.zeros(len(self.result_index), dtype=np.int64),
+                      index=self.result_index)
+        # Build a fresh mapping: ``self.indices`` may be cached, so its
+        # lists must not be overwritten in place with their lengths.
+        counts = dict((k, len(v))
+                      for k, v in compat.iteritems(self.indices))
+        bin_counts = Series(counts, dtype=np.int64)
+        result = base.add(bin_counts, fill_value=0)
+        # addition with fill_value changes dtype to float64
+        result = result.astype(np.int64)
+        return result
+
     #----------------------------------------------------------------------
     # cython aggregation
 
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
@@ -1326,6 +1326,13 @@ def test_aggregate_normal(self):
             dt_result = getattr(dt_grouped, func)()
             assert_frame_equal(expected, dt_result)
 
+        # GH 7453
+        for func in ['size']:
+            expected = getattr(normal_grouped, func)()
+            expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
+            dt_result = getattr(dt_grouped, func)()
+            assert_series_equal(expected, dt_result)
+
         """
         for func in ['first', 'last']:
             expected = getattr(normal_grouped, func)()
@@ -1339,7 +1346,7 @@ def test_aggregate_normal(self):
             dt_result = getattr(dt_grouped, func)(3)
             assert_frame_equal(expected, dt_result)
         """
-        # if TimeGrouper is used included, 'size' 'first','last' and 'nth' doesn't work yet
+        # if TimeGrouper is used, 'first', 'last' and 'nth' don't work yet
 
     def test_aggregate_with_nat(self):
         # check TimeGrouper's aggregation is identical as normal groupby
@@ -1375,7 +1382,16 @@ def test_aggregate_with_nat(self):
             dt_result = getattr(dt_grouped, func)()
             assert_frame_equal(expected, dt_result)
 
-        # if NaT is included, 'var', 'std', 'mean', 'size', 'first','last' and 'nth' doesn't work yet
+        for func in ['size']:
+            normal_result = getattr(normal_grouped, func)()
+            pad = Series([0], index=[3])
+            expected = normal_result.append(pad)
+            expected = expected.sort_index()
+            expected.index = date_range(start='2013-01-01', freq='D', periods=5, name='key')
+            dt_result = getattr(dt_grouped, func)()
+            assert_series_equal(expected, dt_result)
+
+        # if NaT is included, 'var', 'std', 'mean', 'first', 'last' and 'nth' don't work yet
 
 
 if __name__ == '__main__':
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
@@ -108,16 +108,26 @@ def f():
 # size() speed
 
 setup = common_setup + """
-df = DataFrame({'key1': np.random.randint(0, 500, size=100000),
-                'key2': np.random.randint(0, 100, size=100000),
-                'value1' : np.random.randn(100000),
-                'value2' : np.random.randn(100000),
-                'value3' : np.random.randn(100000)})
+n = 100000
+offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
+dates = np.datetime64('now') + offsets
+df = DataFrame({'key1': np.random.randint(0, 500, size=n),
+                'key2': np.random.randint(0, 100, size=n),
+                'value1' : np.random.randn(n),
+                'value2' : np.random.randn(n),
+                'value3' : np.random.randn(n),
+                'dates' : dates})
 """
 
 groupby_multi_size = Benchmark("df.groupby(['key1', 'key2']).size()",
                                setup, start_date=datetime(2011, 10, 1))
 
+groupby_dt_size = Benchmark("df.groupby(['dates']).size()",
+                            setup, start_date=datetime(2011, 10, 1))
+
+groupby_dt_timegrouper_size = Benchmark("df.groupby(TimeGrouper(key='dates', freq='M')).size()",
+                                        setup, start_date=datetime(2011, 10, 1))
+
 #----------------------------------------------------------------------
 # count() speed