diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 8e44480c0c09b..20c1140bbd80e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1378,12 +1378,9 @@ def size(self): Compute group sizes """ - # TODO: better impl - labels, _, ngroups = self.group_info - bin_counts = algos.value_counts(labels, sort=False) - bin_counts = bin_counts.reindex(np.arange(ngroups)) - bin_counts.index = self.result_index - return bin_counts + ids, _, ngroup = self.group_info + out = np.bincount(ids[ids != -1], minlength=ngroup) + return Series(out, index=self.result_index) @cache_readonly def _max_groupsize(self): @@ -1845,24 +1842,6 @@ def groupings(self): # for compat return None - def size(self): - """ - Compute group sizes - - """ - index = self.result_index - base = Series(np.zeros(len(index), dtype=np.int64), index=index) - indices = self.indices - for k, v in compat.iteritems(indices): - indices[k] = len(v) - bin_counts = Series(indices, dtype=np.int64) - # make bin_counts.index to have same name to preserve it - bin_counts.index.name = index.name - result = base.add(bin_counts, fill_value=0) - # addition with fill_value changes dtype to float64 - result = result.astype(np.int64) - return result - #---------------------------------------------------------------------- # cython aggregation diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 0336ee2e9b50e..42d5af587a859 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2485,6 +2485,12 @@ def test_size(self): for key, group in grouped: self.assertEqual(result[key], len(group)) + df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) + for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): + left = df.groupby(key, sort=sort).size() + right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) + assert_series_equal(left, right, check_names=False) + def test_count(self): from string import ascii_lowercase n = 1 << 15 diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index ec03d558e45b8..7052348e2e6a4 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -941,6 +941,20 @@ def test_resample_group_info(self): # GH10914 assert_series_equal(left, right) + def test_resample_size(self): + n = 10000 + dr = date_range('2015-09-19', periods=n, freq='T') + ts = Series(np.random.randn(n), index=np.random.choice(dr, n)) + + left = ts.resample('7T', how='size') + ix = date_range(start=left.index.min(), end=ts.index.max(), freq='7T') + + bins = np.searchsorted(ix.values, ts.index.values, side='right') + val = np.bincount(bins, minlength=len(ix) + 1)[1:] + + right = Series(val, index=ix) + assert_series_equal(left, right) + def test_resmaple_dst_anchor(self): # 5172 dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern')