6
6
7
7
import numpy as np
8
8
np.random.seed(123456 )
9
- from pandas import *
10
- options.display.max_rows= 15
11
- randn = np.random.randn
12
9
np.set_printoptions(precision = 4 , suppress = True )
13
- import matplotlib.pyplot as plt
14
- plt.close( ' all ' )
10
+ import pandas as pd
11
+ pd.options.display.max_rows = 15
15
12
import matplotlib
16
13
try :
17
14
matplotlib.style.use(' ggplot' )
18
15
except AttributeError :
19
- options.display.mpl_style = ' default'
20
- from pandas.compat import zip
16
+ pd.options.display.mpl_style = ' default'
17
+ import matplotlib.pyplot as plt
18
+ plt.close(' all' )
21
19
22
20
*****************************
23
21
Group By: split-apply-combine
@@ -105,11 +103,12 @@ consider the following DataFrame:
105
103
106
104
.. ipython:: python
107
105
108
- df = DataFrame({' A' : [' foo' , ' bar' , ' foo' , ' bar' ,
109
- ' foo' , ' bar' , ' foo' , ' foo' ],
110
- ' B' : [' one' , ' one' , ' two' , ' three' ,
111
- ' two' , ' two' , ' one' , ' three' ],
112
- ' C' : randn(8 ), ' D' : randn(8 )})
106
+ df = pd.DataFrame({' A' : [' foo' , ' bar' , ' foo' , ' bar' ,
107
+ ' foo' , ' bar' , ' foo' , ' foo' ],
108
+ ' B' : [' one' , ' one' , ' two' , ' three' ,
109
+ ' two' , ' two' , ' one' , ' three' ],
110
+ ' C' : np.random.randn(8 ),
111
+ ' D' : np.random.randn(8 )})
113
112
df
114
113
115
114
We could naturally group by either the ``A`` or ``B`` columns or both:
@@ -142,7 +141,7 @@ output of aggregation functions will only contain unique index values:
142
141
143
142
lst = [1 , 2 , 3 , 1 , 2 , 3 ]
144
143
145
- s = Series([1 , 2 , 3 , 10 , 20 , 30 ], lst)
144
+ s = pd. Series([1 , 2 , 3 , 10 , 20 , 30 ], lst)
146
145
147
146
grouped = s.groupby(level = 0 )
148
147
@@ -189,7 +188,7 @@ however pass ``sort=False`` for potential speedups:
189
188
190
189
.. ipython:: python
191
190
192
- df2 = DataFrame({' X' : [' B' , ' B' , ' A' , ' A' ], ' Y' : [1 , 2 , 3 , 4 ]})
191
+ df2 = pd. DataFrame({' X' : [' B' , ' B' , ' A' , ' A' ], ' Y' : [1 , 2 , 3 , 4 ]})
193
192
df2.groupby([' X' ], sort = True ).sum()
194
193
df2.groupby([' X' ], sort = False ).sum()
195
194
@@ -203,10 +202,10 @@ however pass ``sort=False`` for potential speedups:
203
202
n = 10
204
203
weight = np.random.normal(166 , 20 , size = n)
205
204
height = np.random.normal(60 , 10 , size = n)
206
- time = date_range(' 1/1/2000' , periods = n)
205
+ time = pd. date_range(' 1/1/2000' , periods = n)
207
206
gender = tm.choice([' male' , ' female' ], size = n)
208
- df = DataFrame({' height' : height, ' weight' : weight,
209
- ' gender' : gender}, index = time)
207
+ df = pd. DataFrame({' height' : height, ' weight' : weight,
208
+ ' gender' : gender}, index = time)
210
209
211
210
.. ipython:: python
212
211
@@ -226,11 +225,12 @@ however pass ``sort=False`` for potential speedups:
226
225
.. ipython:: python
227
226
:suppress:
228
227
229
- df = DataFrame({' A' : [' foo' , ' bar' , ' foo' , ' bar' ,
230
- ' foo' , ' bar' , ' foo' , ' foo' ],
231
- ' B' : [' one' , ' one' , ' two' , ' three' ,
232
- ' two' , ' two' , ' one' , ' three' ],
233
- ' C' : randn(8 ), ' D' : randn(8 )})
228
+ df = pd.DataFrame({' A' : [' foo' , ' bar' , ' foo' , ' bar' ,
229
+ ' foo' , ' bar' , ' foo' , ' foo' ],
230
+ ' B' : [' one' , ' one' , ' two' , ' three' ,
231
+ ' two' , ' two' , ' one' , ' three' ],
232
+ ' C' : np.random.randn(8 ),
233
+ ' D' : np.random.randn(8 )})
234
234
235
235
.. _groupby.multiindex:
236
236
@@ -248,8 +248,8 @@ natural to group by one of the levels of the hierarchy.
248
248
[' one' , ' two' , ' one' , ' two' , ' one' , ' two' , ' one' , ' two' ]]
249
249
tuples = list (zip (* arrays))
250
250
tuples
251
- index = MultiIndex.from_tuples(tuples, names = [' first' , ' second' ])
252
- s = Series(randn(8 ), index = index)
251
+ index = pd. MultiIndex.from_tuples(tuples, names = [' first' , ' second' ])
252
+ s = pd. Series(np.random. randn(8 ), index = index)
253
253
254
254
.. ipython:: python
255
255
@@ -281,13 +281,13 @@ Also as of v0.6, grouping with multiple levels is supported.
281
281
[' doo' , ' doo' , ' bee' , ' bee' , ' bop' , ' bop' , ' bop' , ' bop' ],
282
282
[' one' , ' two' , ' one' , ' two' , ' one' , ' two' , ' one' , ' two' ]]
283
283
tuples = list (zip (* arrays))
284
- index = MultiIndex.from_tuples(tuples, names = [' first' , ' second' , ' third' ])
285
- s = Series(randn(8 ), index = index)
284
+ index = pd. MultiIndex.from_tuples(tuples, names = [' first' , ' second' , ' third' ])
285
+ s = pd. Series(np.random. randn(8 ), index = index)
286
286
287
287
.. ipython:: python
288
288
289
289
s
290
- s.groupby(level = [' first' ,' second' ]).sum()
290
+ s.groupby(level = [' first' , ' second' ]).sum()
291
291
292
292
More on the ``sum`` function and aggregation later.
293
293
@@ -499,9 +499,9 @@ to standardize the data within each group:
499
499
500
500
.. ipython:: python
501
501
502
- index = date_range(' 10/1/1999' , periods = 1100 )
503
- ts = Series(np.random.normal(0.5 , 2 , 1100 ), index)
504
- ts = rolling_mean(ts, 100 , 100 ).dropna()
502
+ index = pd. date_range(' 10/1/1999' , periods = 1100 )
503
+ ts = pd. Series(np.random.normal(0.5 , 2 , 1100 ), index)
504
+ ts = pd. rolling_mean(ts, 100 , 100 ).dropna()
505
505
506
506
ts.head()
507
507
ts.tail()
@@ -528,7 +528,7 @@ We can also visually compare the original and transformed data sets.
528
528
529
529
.. ipython:: python
530
530
531
- compare = DataFrame({' Original' : ts, ' Transformed' : transformed})
531
+ compare = pd. DataFrame({' Original' : ts, ' Transformed' : transformed})
532
532
533
533
@savefig groupby_transform_plot.png
534
534
compare.plot()
@@ -539,11 +539,11 @@ Another common data transform is to replace missing data with the group mean.
539
539
:suppress:
540
540
541
541
cols = [' A' , ' B' , ' C' ]
542
- values = randn(1000 , 3 )
542
+ values = np.random. randn(1000 , 3 )
543
543
values[np.random.randint(0 , 1000 , 100 ), 0 ] = np.nan
544
544
values[np.random.randint(0 , 1000 , 50 ), 1 ] = np.nan
545
545
values[np.random.randint(0 , 1000 , 200 ), 2 ] = np.nan
546
- data_df = DataFrame(values, columns = cols)
546
+ data_df = pd. DataFrame(values, columns = cols)
547
547
548
548
.. ipython:: python
549
549
@@ -599,7 +599,7 @@ than 2.
599
599
600
600
.. ipython:: python
601
601
602
- sf = Series([1 , 1 , 2 , 3 , 3 , 3 ])
602
+ sf = pd. Series([1 , 1 , 2 , 3 , 3 , 3 ])
603
603
sf.groupby(sf).filter(lambda x : x.sum() > 2 )
604
604
605
605
The argument of ``filter`` must be a function that, applied to the group as a
@@ -610,7 +610,7 @@ with only a couple members.
610
610
611
611
.. ipython:: python
612
612
613
- dff = DataFrame({' A' : np.arange(8 ), ' B' : list (' aabbbbcc' )})
613
+ dff = pd. DataFrame({' A' : np.arange(8 ), ' B' : list (' aabbbbcc' )})
614
614
dff.groupby(' B' ).filter(lambda x : len (x) > 2 )
615
615
616
616
Alternatively, instead of dropping the offending groups, we can return a
@@ -672,9 +672,9 @@ next). This enables some operations to be carried out rather succinctly:
672
672
673
673
.. ipython:: python
674
674
675
- tsdf = DataFrame(randn(1000 , 3 ),
676
- index = date_range(' 1/1/2000' , periods = 1000 ),
677
- columns = [' A' , ' B' , ' C' ])
675
+ tsdf = pd. DataFrame(np.random. randn(1000 , 3 ),
676
+ index = pd. date_range(' 1/1/2000' , periods = 1000 ),
677
+ columns = [' A' , ' B' , ' C' ])
678
678
tsdf.ix[::2 ] = np.nan
679
679
grouped = tsdf.groupby(lambda x : x.year)
680
680
grouped.fillna(method = ' pad' )
@@ -689,8 +689,8 @@ The ``nlargest`` and ``nsmallest`` methods work on ``Series`` style groupbys:
689
689
690
690
.. ipython:: python
691
691
692
- s = Series([9 , 8 , 7 , 5 , 19 , 1 , 4.2 , 3.3 ])
693
- g = Series(list (' abababab' ))
692
+ s = pd. Series([9 , 8 , 7 , 5 , 19 , 1 , 4.2 , 3.3 ])
693
+ g = pd. Series(list (' abababab' ))
694
694
gb = s.groupby(g)
695
695
gb.nlargest(3 )
696
696
gb.nsmallest(3 )
@@ -721,8 +721,8 @@ The dimension of the returned result can also change:
721
721
In [8]: grouped = df.groupby('A')['C']
722
722
723
723
In [10]: def f(group):
724
- ....: return DataFrame({'original' : group,
725
- ....: 'demeaned' : group - group.mean()})
724
+ ....: return pd. DataFrame({'original' : group,
725
+ ....: 'demeaned' : group - group.mean()})
726
726
....:
727
727
728
728
In [11]: grouped.apply(f)
@@ -732,8 +732,8 @@ The dimension of the returned result can also change:
732
732
.. ipython:: python
733
733
734
734
def f (x ):
735
- return Series([ x, x** 2 ], index = [' x' , ' x^s' ])
736
- s = Series(np.random.rand(5 ))
735
+ return pd. Series([ x, x** 2 ], index = [' x' , ' x^s' ])
736
+ s = pd. Series(np.random.rand(5 ))
737
737
s
738
738
s.apply(f)
739
739
@@ -754,7 +754,7 @@ The dimension of the returned result can also change:
754
754
755
755
.. ipython:: python
756
756
757
- d = DataFrame({" a" :[" x" , " y" ], " b" :[1 ,2 ]})
757
+ d = pd. DataFrame({" a" :[" x" , " y" ], " b" :[1 ,2 ]})
758
758
def identity (df ):
759
759
print df
760
760
return df
@@ -802,9 +802,9 @@ can be used as group keys. If so, the order of the levels will be preserved:
802
802
803
803
.. ipython:: python
804
804
805
- data = Series(np.random.randn(100 ))
805
+ data = pd. Series(np.random.randn(100 ))
806
806
807
- factor = qcut(data, [0 , .25 , .5 , .75 , 1 .])
807
+ factor = pd. qcut(data, [0 , .25 , .5 , .75 , 1 .])
808
808
809
809
data.groupby(factor).mean()
810
810
@@ -813,27 +813,28 @@ can be used as group keys. If so, the order of the levels will be preserved:
813
813
Grouping with a Grouper specification
814
814
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
815
815
816
- Your may need to specify a bit more data to properly group. You can
816
+ You may need to specify a bit more data to properly group. You can
817
817
use ``pd.Grouper`` to provide this local control.
818
818
819
819
.. ipython:: python
820
820
821
- import datetime as DT
822
-
823
- df = DataFrame({
824
- ' Branch' : ' A A A A A A A B' .split(),
825
- ' Buyer' : ' Carl Mark Carl Carl Joe Joe Joe Carl' .split(),
826
- ' Quantity' : [1 ,3 ,5 ,1 ,8 ,1 ,9 ,3 ],
827
- ' Date' : [
828
- DT .datetime(2013 ,1 ,1 ,13 ,0 ),
829
- DT .datetime(2013 ,1 ,1 ,13 ,5 ),
830
- DT .datetime(2013 ,10 ,1 ,20 ,0 ),
831
- DT .datetime(2013 ,10 ,2 ,10 ,0 ),
832
- DT .datetime(2013 ,10 ,1 ,20 ,0 ),
833
- DT .datetime(2013 ,10 ,2 ,10 ,0 ),
834
- DT .datetime(2013 ,12 ,2 ,12 ,0 ),
835
- DT .datetime(2013 ,12 ,2 ,14 ,0 ),
836
- ]})
821
+ import datetime
822
+
823
+ df = pd.DataFrame({
824
+ ' Branch' : ' A A A A A A A B' .split(),
825
+ ' Buyer' : ' Carl Mark Carl Carl Joe Joe Joe Carl' .split(),
826
+ ' Quantity' : [1 ,3 ,5 ,1 ,8 ,1 ,9 ,3 ],
827
+ ' Date' : [
828
+ datetime.datetime(2013 ,1 ,1 ,13 ,0 ),
829
+ datetime.datetime(2013 ,1 ,1 ,13 ,5 ),
830
+ datetime.datetime(2013 ,10 ,1 ,20 ,0 ),
831
+ datetime.datetime(2013 ,10 ,2 ,10 ,0 ),
832
+ datetime.datetime(2013 ,10 ,1 ,20 ,0 ),
833
+ datetime.datetime(2013 ,10 ,2 ,10 ,0 ),
834
+ datetime.datetime(2013 ,12 ,2 ,12 ,0 ),
835
+ datetime.datetime(2013 ,12 ,2 ,14 ,0 ),
836
+ ]
837
+ })
837
838
838
839
df
839
840
@@ -862,7 +863,7 @@ Just like for a DataFrame or Series you can call head and tail on a groupby:
862
863
863
864
.. ipython:: python
864
865
865
- df = DataFrame([[1 , 2 ], [1 , 4 ], [5 , 6 ]], columns = [' A' , ' B' ])
866
+ df = pd. DataFrame([[1 , 2 ], [1 , 4 ], [5 , 6 ]], columns = [' A' , ' B' ])
866
867
df
867
868
868
869
g = df.groupby(' A' )
@@ -894,7 +895,7 @@ To select from a DataFrame or Series the nth item, use the nth method. This is a
894
895
895
896
.. ipython:: python
896
897
897
- df = DataFrame([[1 , np.nan], [1 , 4 ], [5 , 6 ]], columns = [' A' , ' B' ])
898
+ df = pd. DataFrame([[1 , np.nan], [1 , 4 ], [5 , 6 ]], columns = [' A' , ' B' ])
898
899
g = df.groupby(' A' )
899
900
900
901
g.nth(0 )
@@ -919,7 +920,7 @@ As with other methods, passing ``as_index=False``, will achieve a filtration, wh
919
920
920
921
.. ipython:: python
921
922
922
- df = DataFrame([[1 , np.nan], [1 , 4 ], [5 , 6 ]], columns = [' A' , ' B' ])
923
+ df = pd. DataFrame([[1 , np.nan], [1 , 4 ], [5 , 6 ]], columns = [' A' , ' B' ])
923
924
g = df.groupby(' A' ,as_index = False )
924
925
925
926
g.nth(0 )
@@ -929,8 +930,8 @@ You can also select multiple rows from each group by specifying multiple nth val
929
930
930
931
.. ipython:: python
931
932
932
- business_dates = date_range(start = ' 4/1/2014' , end = ' 6/30/2014' , freq = ' B' )
933
- df = DataFrame(1 , index = business_dates, columns = [' a' , ' b' ])
933
+ business_dates = pd. date_range(start = ' 4/1/2014' , end = ' 6/30/2014' , freq = ' B' )
934
+ df = pd. DataFrame(1 , index = business_dates, columns = [' a' , ' b' ])
934
935
# get the first, 4th, and last date index for each month
935
936
df.groupby((df.index.year, df.index.month)).nth([0 , 3 , - 1 ])
936
937
@@ -961,7 +962,7 @@ the values in column 1 where the group is "B" are 3 higher on average.
961
962
.. ipython:: python
962
963
963
964
np.random.seed(1234 )
964
- df = DataFrame(np.random.randn(50 , 2 ))
965
+ df = pd. DataFrame(np.random.randn(50 , 2 ))
965
966
df[' g' ] = np.random.choice([' A' , ' B' ], size = 50 )
966
967
df.loc[df[' g' ] == ' B' , 1 ] += 3
967
968
@@ -1010,11 +1011,11 @@ column index name will be used as the name of the inserted column:
1010
1011
.. ipython:: python
1011
1012
1012
1013
df = pd.DataFrame({
1013
- ' a' : [0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 ],
1014
- ' b' : [0 , 0 , 1 , 1 , 0 , 0 , 1 , 1 , 0 , 0 , 1 , 1 ],
1015
- ' c' : [1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 ],
1016
- ' d' : [0 , 0 , 0 , 1 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 1 ],
1017
- })
1014
+ ' a' : [0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 ],
1015
+ ' b' : [0 , 0 , 1 , 1 , 0 , 0 , 1 , 1 , 0 , 0 , 1 , 1 ],
1016
+ ' c' : [1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 0 ],
1017
+ ' d' : [0 , 0 , 0 , 1 , 0 , 0 , 0 , 1 , 0 , 0 , 0 , 1 ],
1018
+ })
1018
1019
1019
1020
def compute_metrics (x ):
1020
1021
result = {' b_sum' : x[' b' ].sum(), ' c_mean' : x[' c' ].mean()}
0 commit comments