Skip to content

Commit 6d048d9

Browse files
committed
Merge pull request #11152 from behzadnouri/grby-size
PERF: improves performance in groupby.size
2 parents 007bebc + 9608c87 commit 6d048d9

File tree

3 files changed

+23
-24
lines changed

3 files changed

+23
-24
lines changed

pandas/core/groupby.py

+3 -24
Original file line number | Diff line number | Diff line change
@@ -1378,12 +1378,9 @@ def size(self):
13781378
Compute group sizes
13791379
13801380
"""
1381-
# TODO: better impl
1382-
labels, _, ngroups = self.group_info
1383-
bin_counts = algos.value_counts(labels, sort=False)
1384-
bin_counts = bin_counts.reindex(np.arange(ngroups))
1385-
bin_counts.index = self.result_index
1386-
return bin_counts
1381+
ids, _, ngroup = self.group_info
1382+
out = np.bincount(ids[ids != -1], minlength=ngroup)
1383+
return Series(out, index=self.result_index)
13871384

13881385
@cache_readonly
13891386
def _max_groupsize(self):
@@ -1845,24 +1842,6 @@ def groupings(self):
18451842
# for compat
18461843
return None
18471844

1848-
def size(self):
1849-
"""
1850-
Compute group sizes
1851-
1852-
"""
1853-
index = self.result_index
1854-
base = Series(np.zeros(len(index), dtype=np.int64), index=index)
1855-
indices = self.indices
1856-
for k, v in compat.iteritems(indices):
1857-
indices[k] = len(v)
1858-
bin_counts = Series(indices, dtype=np.int64)
1859-
# make bin_counts.index to have same name to preserve it
1860-
bin_counts.index.name = index.name
1861-
result = base.add(bin_counts, fill_value=0)
1862-
# addition with fill_value changes dtype to float64
1863-
result = result.astype(np.int64)
1864-
return result
1865-
18661845
#----------------------------------------------------------------------
18671846
# cython aggregation
18681847

pandas/tests/test_groupby.py

+6
Original file line number | Diff line number | Diff line change
@@ -2485,6 +2485,12 @@ def test_size(self):
24852485
for key, group in grouped:
24862486
self.assertEqual(result[key], len(group))
24872487

2488+
df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc'))
2489+
for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])):
2490+
left = df.groupby(key, sort=sort).size()
2491+
right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0])
2492+
assert_series_equal(left, right, check_names=False)
2493+
24882494
def test_count(self):
24892495
from string import ascii_lowercase
24902496
n = 1 << 15

pandas/tseries/tests/test_resample.py

+14
Original file line number | Diff line number | Diff line change
@@ -941,6 +941,20 @@ def test_resample_group_info(self): # GH10914
941941

942942
assert_series_equal(left, right)
943943

944+
def test_resample_size(self):
945+
n = 10000
946+
dr = date_range('2015-09-19', periods=n, freq='T')
947+
ts = Series(np.random.randn(n), index=np.random.choice(dr, n))
948+
949+
left = ts.resample('7T', how='size')
950+
ix = date_range(start=left.index.min(), end=ts.index.max(), freq='7T')
951+
952+
bins = np.searchsorted(ix.values, ts.index.values, side='right')
953+
val = np.bincount(bins, minlength=len(ix) + 1)[1:]
954+
955+
right = Series(val, index=ix)
956+
assert_series_equal(left, right)
957+
944958
def test_resmaple_dst_anchor(self):
945959
# 5172
946960
dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern')

0 commit comments

Comments
 (0)