Skip to content

Commit 11d2243

Browse files
committed
ENH: groupby speed enhancement due to less aggressive integrity checking in BlockManager. Implemented fast GroupBy.size function
1 parent d585ce5 commit 11d2243

File tree

2 files changed

+36
-12
lines changed

2 files changed

+36
-12
lines changed

pandas/core/groupby.py

+14
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,20 @@ def mean(self):
295295
"""
296296
return self._cython_agg_general('mean')
297297

298+
def size(self):
    """
    Compute group sizes

    Counts the members of every entry in ``self.groups`` and returns the
    counts ordered by group key.

    Returns
    -------
    Series
        One entry per group holding the number of members of that group.
        Indexed by a MultiIndex built from the key tuples when there is
        more than one grouping, otherwise by a plain Index of the keys.
        Empty (with an empty Index) when there are no groups.
    """
    # Keys are unique, so sorting the (key, count) pairs orders by key.
    counts = sorted((key, len(members))
                    for key, members in self.groups.iteritems())

    # zip(*[]) yields nothing to unpack, so handle "no groups" explicitly
    # rather than failing with a ValueError on the unpacking below.
    if not counts:
        return Series([], index=Index([]))

    keys, values = zip(*counts)

    if len(self.groupings) > 1:
        # With several groupings each key is a tuple, one item per level.
        index = MultiIndex.from_tuples(keys)
    else:
        index = Index(keys)

    return Series(values, index=index)
311+
298312
def sum(self):
299313
"""
300314
Compute sum of values, excluding missing values

pandas/core/internals.py

+22-12
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ class Block(object):
1616
"""
1717
__slots__ = ['items', 'ref_items', '_ref_locs', 'values', 'ndim']
1818

19-
def __init__(self, values, items, ref_items, ndim=2):
19+
def __init__(self, values, items, ref_items, ndim=2,
20+
do_integrity_check=False):
2021
if issubclass(values.dtype.type, basestring):
2122
values = np.array(values, dtype=object)
2223

@@ -27,7 +28,9 @@ def __init__(self, values, items, ref_items, ndim=2):
2728
self.ndim = ndim
2829
self.items = _ensure_index(items)
2930
self.ref_items = _ensure_index(ref_items)
30-
self._check_integrity()
31+
32+
if do_integrity_check:
33+
self._check_integrity()
3134

3235
def _check_integrity(self):
3336
if len(self.items) < 2:
@@ -186,7 +189,7 @@ def can_store(self, value):
186189
return not issubclass(value.dtype.type,
187190
(np.integer, np.floating, np.bool_))
188191

189-
def make_block(values, items, ref_items):
192+
def make_block(values, items, ref_items, do_integrity_check=False):
190193
dtype = values.dtype
191194
vtype = dtype.type
192195

@@ -199,7 +202,8 @@ def make_block(values, items, ref_items):
199202
else:
200203
klass = ObjectBlock
201204

202-
return klass(values, items, ref_items, ndim=values.ndim)
205+
return klass(values, items, ref_items, ndim=values.ndim,
206+
do_integrity_check=do_integrity_check)
203207

204208
# TODO: flexible with index=None and/or items=None
205209

@@ -221,15 +225,15 @@ class BlockManager(object):
221225
"""
222226
__slots__ = ['axes', 'blocks', 'ndim']
223227

224-
def __init__(self, blocks, axes, skip_integrity_check=False):
228+
def __init__(self, blocks, axes, do_integrity_check=True):
225229
self.axes = [_ensure_index(ax) for ax in axes]
226230
self.blocks = blocks
227231

228232
ndim = len(axes)
229233
for block in blocks:
230234
assert(ndim == block.values.ndim)
231235

232-
if not skip_integrity_check:
236+
if do_integrity_check:
233237
self._verify_integrity()
234238

235239
@property
@@ -281,7 +285,8 @@ def __setstate__(self, state):
281285
self.axes = [_ensure_index(ax) for ax in ax_arrays]
282286
blocks = []
283287
for values, items in zip(bvalues, bitems):
284-
blk = make_block(values, items, self.axes[0])
288+
blk = make_block(values, items, self.axes[0],
289+
do_integrity_check=True)
285290
blocks.append(blk)
286291
self.blocks = blocks
287292

@@ -346,7 +351,7 @@ def get_slice(self, slobj, axis=0):
346351
else:
347352
new_blocks = self._slice_blocks(slobj, axis)
348353

349-
return BlockManager(new_blocks, new_axes)
354+
return BlockManager(new_blocks, new_axes, do_integrity_check=False)
350355

351356
def _slice_blocks(self, slobj, axis):
352357
new_blocks = []
@@ -589,7 +594,8 @@ def reindex_items(self, new_items):
589594
block_shape[0] = len(extra_items)
590595
block_values = np.empty(block_shape, dtype=np.float64)
591596
block_values.fill(nan)
592-
na_block = make_block(block_values, extra_items, new_items)
597+
na_block = make_block(block_values, extra_items, new_items,
598+
do_integrity_check=True)
593599
new_blocks.append(na_block)
594600
new_blocks = _consolidate(new_blocks, new_items)
595601

@@ -780,7 +786,8 @@ def form_blocks(data, axes):
780786
block_values = np.empty(shape, dtype=float)
781787
block_values.fill(nan)
782788

783-
na_block = make_block(block_values, extra_items, items)
789+
na_block = make_block(block_values, extra_items, items,
790+
do_integrity_check=True)
784791
blocks.append(na_block)
785792
blocks = _consolidate(blocks, items)
786793

@@ -792,7 +799,7 @@ def _simple_blockify(dct, ref_items, dtype):
792799
if values.dtype != dtype: # pragma: no cover
793800
values = values.astype(dtype)
794801

795-
return make_block(values, block_items, ref_items)
802+
return make_block(values, block_items, ref_items, do_integrity_check=True)
796803

797804
def _stack_dict(dct, ref_items):
798805
items = [x for x in ref_items if x in dct]
@@ -857,12 +864,15 @@ def _consolidate(blocks, items):
857864

858865
return new_blocks
859866

867+
# TODO: this could be much optimized
868+
860869
def _merge_blocks(blocks, items):
861870
if len(blocks) == 1:
862871
return blocks[0]
863872
new_values = np.vstack([b.values for b in blocks])
864873
new_items = np.concatenate([b.items for b in blocks])
865-
new_block = make_block(new_values, new_items, items)
874+
new_block = make_block(new_values, new_items, items,
875+
do_integrity_check=True)
866876
return new_block.reindex_items_from(items)
867877

868878
def _union_block_items(blocks):

0 commit comments

Comments
 (0)