52
52
53
53
_apply_whitelist = frozenset (['last' , 'first' ,
54
54
'mean' , 'sum' , 'min' , 'max' ,
55
- 'head' , 'tail' ,
56
55
'cumsum' , 'cumprod' , 'cummin' , 'cummax' ,
57
56
'resample' ,
58
57
'describe' ,
@@ -482,13 +481,19 @@ def picker(arr):
482
481
return np .nan
483
482
return self .agg (picker )
484
483
485
- def cumcount (self ):
486
- """Number each item in each group from 0 to the length of that group.
484
+ def cumcount (self , ** kwargs ):
485
+ """
486
+ Number each item in each group from 0 to the length of that group - 1.
487
487
488
488
Essentially this is equivalent to
489
489
490
490
>>> self.apply(lambda x: Series(np.arange(len(x)), x.index))
491
491
492
+ Parameters
493
+ ----------
494
+ ascending : bool, default True
495
+ If False, number in reverse, from length of group - 1 to 0.
496
+
492
497
Example
493
498
-------
494
499
@@ -510,14 +515,111 @@ def cumcount(self):
510
515
4 1
511
516
5 3
512
517
dtype: int64
518
+ >>> df.groupby('A').cumcount(ascending=False)
519
+ 0 3
520
+ 1 2
521
+ 2 1
522
+ 3 1
523
+ 4 0
524
+ 5 0
525
+ dtype: int64
513
526
514
527
"""
528
+ ascending = kwargs .pop ('ascending' , True )
529
+
515
530
index = self .obj .index
516
- cumcounts = np .zeros (len (index ), dtype = 'int64' )
517
- for v in self .indices .values ():
518
- cumcounts [v ] = np .arange (len (v ), dtype = 'int64' )
531
+ rng = np .arange (self .grouper ._max_groupsize , dtype = 'int64' )
532
+ cumcounts = self ._cumcount_array (rng , ascending = ascending )
519
533
return Series (cumcounts , index )
520
534
535
def head(self, n=5):
    """
    Returns first n rows of each group.

    Essentially equivalent to ``.apply(lambda x: x.head(n))``

    Parameters
    ----------
    n : int, default 5
        Rows whose zero-based position inside their group is strictly
        less than ``n`` are kept.

    Returns
    -------
    The selection ``self.obj[mask]``.  When ``self.as_index`` is true
    its index is replaced by the one ``self._index_with_as_index``
    builds from the same mask.

    Example
    -------

    >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
    ...                columns=['A', 'B'])
    >>> df.groupby('A', as_index=False).head(1)
       A  B
    0  1  2
    2  5  6
    >>> df.groupby('A').head(1)
         A  B
    A
    1 0  1  2
    5 2  5  6

    """
    # One shared 0..max-1 ramp; every group is numbered from a prefix of it.
    rng = np.arange(self.grouper._max_groupsize, dtype='int64')
    in_head = self._cumcount_array(rng) < n
    selected = self.obj[in_head]
    if self.as_index:
        selected.index = self._index_with_as_index(in_head)
    return selected
563
+
564
def tail(self, n=5):
    """
    Returns last n rows of each group

    Essentially equivalent to ``.apply(lambda x: x.tail(n))``

    Parameters
    ----------
    n : int, default 5
        Rows whose distance from the end of their group is strictly
        less than ``n`` are kept.

    Returns
    -------
    The selection ``self.obj[mask]``.  When ``self.as_index`` is true
    its index is replaced by the one ``self._index_with_as_index``
    builds from the same mask.

    Example
    -------

    >>> df = DataFrame([[1, 2], [1, 4], [5, 6]],
    ...                columns=['A', 'B'])
    >>> df.groupby('A', as_index=False).tail(1)
       A  B
    1  1  4
    2  5  6
    >>> df.groupby('A').tail(1)
         A  B
    A
    1 1  1  4
    5 2  5  6

    """
    # Ramp 0, -1, -2, ...: numbered in reverse, the last row of every
    # group gets 0, the one before it -1, and so on, so "> -n" keeps the
    # final n rows.
    rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
    in_tail = self._cumcount_array(rng, ascending=False) > -n
    selected = self.obj[in_tail]
    if self.as_index:
        selected.index = self._index_with_as_index(in_tail)
    return selected
592
+
593
+ def _cumcount_array (self , arr , ** kwargs ):
594
+ ascending = kwargs .pop ('ascending' , True )
595
+
596
+ len_index = len (self .obj .index )
597
+ cumcounts = np .zeros (len_index , dtype = 'int64' )
598
+ if ascending :
599
+ for v in self .indices .values ():
600
+ cumcounts [v ] = arr [:len (v )]
601
+ else :
602
+ for v in self .indices .values ():
603
+ cumcounts [v ] = arr [len (v )- 1 ::- 1 ]
604
+ return cumcounts
605
+
606
+ def _index_with_as_index (self , b ):
607
+ """
608
+ Take boolean mask of index to be returned from apply, if as_index=True
609
+
610
+ """
611
+ # TODO perf, it feels like this should already be somewhere...
612
+ from itertools import chain
613
+ original = self .obj .index
614
+ gp = self .grouper
615
+ levels = chain ((gp .levels [i ][gp .labels [i ][b ]]
616
+ for i in range (len (gp .groupings ))),
617
+ (original .get_level_values (i )[b ]
618
+ for i in range (original .nlevels )))
619
+ new = MultiIndex .from_arrays (list (levels ))
620
+ new .names = gp .names + original .names
621
+ return new
622
+
521
623
def _try_cast (self , result , obj ):
522
624
"""
523
625
try to cast the result to our obj original type,
@@ -758,14 +860,28 @@ def names(self):
758
860
def size(self):
    """
    Compute group sizes

    Returns
    -------
    The per-label counts of ``self.group_info``'s labels, reindexed to
    ``0 .. ngroups - 1`` and then relabelled with ``self.result_index``.
    """
    # TODO: better impl
    labels, _, ngroups = self.group_info
    # sort=False: ordering by count is pointless, the reindex below
    # puts the counts in group order anyway.
    bin_counts = algos.value_counts(labels, sort=False)
    bin_counts = bin_counts.reindex(np.arange(ngroups))
    bin_counts.index = self.result_index
    return bin_counts
768
871
872
@cache_readonly
def _max_groupsize(self):
    """
    Compute size of largest group

    Returns
    -------
    int : length of the longest entry in ``self.indices``, or 0 when
    ``self.indices`` is empty.
    """
    # For many items in each group this is much faster than
    # self.size().max(), in worst case marginally slower
    if not self.indices:
        # max() of an empty sequence raises, so handle "no groups" here.
        return 0
    return max(len(v) for v in self.indices.values())
884
+
769
885
@cache_readonly
770
886
def groups (self ):
771
887
if len (self .groupings ) == 1 :
0 commit comments