Merge pull request #11152 from behzadnouri/grby-size

jreback · jreback · commit 6d048d9ef47e · 2015-09-20T11:54:14.000-07:00
PERF: improves performance in groupby.size
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1378,12 +1378,9 @@ def size(self):
         Compute group sizes
 
         """
-        # TODO: better impl
-        labels, _, ngroups = self.group_info
-        bin_counts = algos.value_counts(labels, sort=False)
-        bin_counts = bin_counts.reindex(np.arange(ngroups))
-        bin_counts.index = self.result_index
-        return bin_counts
+        ids, _, ngroup = self.group_info
+        out = np.bincount(ids[ids != -1], minlength=ngroup)
+        return Series(out, index=self.result_index)
 
     @cache_readonly
     def _max_groupsize(self):
@@ -1845,24 +1842,6 @@ def groupings(self):
         # for compat
         return None
 
-    def size(self):
-        """
-        Compute group sizes
-
-        """
-        index = self.result_index
-        base = Series(np.zeros(len(index), dtype=np.int64), index=index)
-        indices = self.indices
-        for k, v in compat.iteritems(indices):
-            indices[k] = len(v)
-        bin_counts = Series(indices, dtype=np.int64)
-        # make bin_counts.index to have same name to preserve it
-        bin_counts.index.name = index.name
-        result = base.add(bin_counts, fill_value=0)
-        # addition with fill_value changes dtype to float64
-        result = result.astype(np.int64)
-        return result
-
     #----------------------------------------------------------------------
     # cython aggregation
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -2485,6 +2485,12 @@ def test_size(self):
         for key, group in grouped:
             self.assertEqual(result[key], len(group))
 
+        df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
+        for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
+            left = df.groupby(key, sort=sort).size()
+            right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
+            assert_series_equal(left, right, check_names=False)
+
     def test_count(self):
         from string import ascii_lowercase
         n = 1 << 15
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
@@ -941,6 +941,20 @@ def test_resample_group_info(self):  # GH10914
 
             assert_series_equal(left, right)
 
+    def test_resample_size(self):  # how='size' must equal per-bin counts
+        n = 10000
+        dr = date_range('2015-09-19', periods=n, freq='T')
+        ts = Series(np.random.randn(n), index=np.random.choice(dr, n))
+        # result under test, then the expected 7-minute bin labels
+        left = ts.resample('7T', how='size')
+        ix = date_range(start=left.index.min(), end=ts.index.max(), freq='7T')
+        # independent count: i from searchsorted means ix[i-1] <= t < ix[i]
+        bins = np.searchsorted(ix.values, ts.index.values, side='right')
+        val = np.bincount(bins, minlength=len(ix) + 1)[1:]
+        # slot 0 (t < ix[0]) was dropped above, so val lines up with ix
+        right = Series(val, index=ix)
+        assert_series_equal(left, right)
+
     def test_resmaple_dst_anchor(self):
         # 5172
         dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern')