6
6
7
7
import numpy as np
8
8
np.random.seed(123456)
9
- from pandas import *
10
- options.display.max_rows=15
11
- randn = np.random.randn
12
9
np.set_printoptions(precision=4, suppress=True)
13
- import matplotlib.pyplot as plt
14
- plt.close('all')
10
+ import pandas as pd
11
+ pd.options.display.max_rows = 15
15
12
import matplotlib
16
13
try:
17
14
matplotlib.style.use('ggplot')
18
15
except AttributeError:
19
- options.display.mpl_style = 'default'
20
- from pandas.compat import zip
16
+ pd.options.display.mpl_style = 'default'
17
+ import matplotlib.pyplot as plt
18
+ plt.close('all')
21
19
22
20
*****************************
23
21
Group By: split-apply-combine
@@ -105,11 +103,12 @@ consider the following DataFrame:
105
103
106
104
.. ipython:: python
107
105
108
- df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
109
- 'foo', 'bar', 'foo', 'foo'],
110
- 'B' : ['one', 'one', 'two', 'three',
111
- 'two', 'two', 'one', 'three'],
112
- 'C' : randn(8), 'D' : randn(8)})
106
+ df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
107
+ 'foo', 'bar', 'foo', 'foo'],
108
+ 'B' : ['one', 'one', 'two', 'three',
109
+ 'two', 'two', 'one', 'three'],
110
+ 'C' : np.random.randn(8),
111
+ 'D' : np.random.randn(8)})
113
112
df
114
113
115
114
We could naturally group by either the ``A`` or ``B`` columns or both:
@@ -142,7 +141,7 @@ output of aggregation functions will only contain unique index values:
142
141
143
142
lst = [1, 2, 3, 1, 2, 3]
144
143
145
- s = Series([1, 2, 3, 10, 20, 30], lst)
144
+ s = pd.Series([1, 2, 3, 10, 20, 30], lst)
146
145
147
146
grouped = s.groupby(level=0)
148
147
@@ -189,7 +188,7 @@ however pass ``sort=False`` for potential speedups:
189
188
190
189
.. ipython:: python
191
190
192
- df2 = DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]})
191
+ df2 = pd.DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]})
193
192
df2.groupby(['X'], sort=True).sum()
194
193
df2.groupby(['X'], sort=False).sum()
195
194
@@ -203,10 +202,10 @@ however pass ``sort=False`` for potential speedups:
203
202
n = 10
204
203
weight = np.random.normal(166, 20, size=n)
205
204
height = np.random.normal(60, 10, size=n)
206
- time = date_range('1/1/2000', periods=n)
205
+ time = pd.date_range('1/1/2000', periods=n)
207
206
gender = tm.choice(['male', 'female'], size=n)
208
- df = DataFrame({'height': height, 'weight': weight,
209
- 'gender': gender}, index=time)
207
+ df = pd.DataFrame({'height': height, 'weight': weight,
208
+ 'gender': gender}, index=time)
210
209
211
210
.. ipython:: python
212
211
@@ -226,11 +225,12 @@ however pass ``sort=False`` for potential speedups:
226
225
.. ipython:: python
227
226
:suppress:
228
227
229
- df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
230
- 'foo', 'bar', 'foo', 'foo'],
231
- 'B' : ['one', 'one', 'two', 'three',
232
- 'two', 'two', 'one', 'three'],
233
- 'C' : randn(8), 'D' : randn(8)})
228
+ df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
229
+ 'foo', 'bar', 'foo', 'foo'],
230
+ 'B' : ['one', 'one', 'two', 'three',
231
+ 'two', 'two', 'one', 'three'],
232
+ 'C' : np.random.randn(8),
233
+ 'D' : np.random.randn(8)})
234
234
235
235
.. _groupby.multiindex:
236
236
@@ -248,8 +248,8 @@ natural to group by one of the levels of the hierarchy.
248
248
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
249
249
tuples = list(zip(*arrays))
250
250
tuples
251
- index = MultiIndex.from_tuples(tuples, names=['first', 'second'])
252
- s = Series(randn(8), index=index)
251
+ index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
252
+ s = pd.Series(np.random.randn(8), index=index)
253
253
254
254
.. ipython:: python
255
255
@@ -281,13 +281,13 @@ Also as of v0.6, grouping with multiple levels is supported.
281
281
['doo', 'doo', 'bee', 'bee', 'bop', 'bop', 'bop', 'bop'],
282
282
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
283
283
tuples = list(zip(*arrays))
284
- index = MultiIndex.from_tuples(tuples, names=['first', 'second', 'third'])
285
- s = Series(randn(8), index=index)
284
+ index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second', 'third'])
285
+ s = pd.Series(np.random.randn(8), index=index)
286
286
287
287
.. ipython:: python
288
288
289
289
s
290
- s.groupby(level=['first','second']).sum()
290
+ s.groupby(level=['first', 'second']).sum()
291
291
292
292
More on the ``sum`` function and aggregation later.
293
293
@@ -499,9 +499,9 @@ to standardize the data within each group:
499
499
500
500
.. ipython:: python
501
501
502
- index = date_range('10/1/1999', periods=1100)
503
- ts = Series(np.random.normal(0.5, 2, 1100), index)
504
- ts = rolling_mean(ts, 100, 100).dropna()
502
+ index = pd.date_range('10/1/1999', periods=1100)
503
+ ts = pd.Series(np.random.normal(0.5, 2, 1100), index)
504
+ ts = pd.rolling_mean(ts, 100, 100).dropna()
505
505
506
506
ts.head()
507
507
ts.tail()
@@ -528,7 +528,7 @@ We can also visually compare the original and transformed data sets.
528
528
529
529
.. ipython:: python
530
530
531
- compare = DataFrame({'Original': ts, 'Transformed': transformed})
531
+ compare = pd.DataFrame({'Original': ts, 'Transformed': transformed})
532
532
533
533
@savefig groupby_transform_plot.png
534
534
compare.plot()
@@ -539,11 +539,11 @@ Another common data transform is to replace missing data with the group mean.
539
539
:suppress:
540
540
541
541
cols = ['A', 'B', 'C']
542
- values = randn(1000, 3)
542
+ values = np.random.randn(1000, 3)
543
543
values[np.random.randint(0, 1000, 100), 0] = np.nan
544
544
values[np.random.randint(0, 1000, 50), 1] = np.nan
545
545
values[np.random.randint(0, 1000, 200), 2] = np.nan
546
- data_df = DataFrame(values, columns=cols)
546
+ data_df = pd.DataFrame(values, columns=cols)
547
547
548
548
.. ipython:: python
549
549
@@ -599,7 +599,7 @@ than 2.
599
599
600
600
.. ipython:: python
601
601
602
- sf = Series([1, 1, 2, 3, 3, 3])
602
+ sf = pd.Series([1, 1, 2, 3, 3, 3])
603
603
sf.groupby(sf).filter(lambda x: x.sum() > 2)
604
604
605
605
The argument of ``filter`` must be a function that, applied to the group as a
@@ -610,7 +610,7 @@ with only a couple members.
610
610
611
611
.. ipython:: python
612
612
613
- dff = DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')})
613
+ dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')})
614
614
dff.groupby('B').filter(lambda x: len(x) > 2)
615
615
616
616
Alternatively, instead of dropping the offending groups, we can return a
@@ -672,9 +672,9 @@ next). This enables some operations to be carried out rather succinctly:
672
672
673
673
.. ipython:: python
674
674
675
- tsdf = DataFrame(randn(1000, 3),
676
- index=date_range('1/1/2000', periods=1000),
677
- columns=['A', 'B', 'C'])
675
+ tsdf = pd.DataFrame(np.random.randn(1000, 3),
676
+ index=pd.date_range('1/1/2000', periods=1000),
677
+ columns=['A', 'B', 'C'])
678
678
tsdf.ix[::2] = np.nan
679
679
grouped = tsdf.groupby(lambda x: x.year)
680
680
grouped.fillna(method='pad')
@@ -689,8 +689,8 @@ The ``nlargest`` and ``nsmallest`` methods work on ``Series`` style groupbys:
689
689
690
690
.. ipython:: python
691
691
692
- s = Series([9, 8, 7, 5, 19, 1, 4.2, 3.3])
693
- g = Series(list('abababab'))
692
+ s = pd.Series([9, 8, 7, 5, 19, 1, 4.2, 3.3])
693
+ g = pd.Series(list('abababab'))
694
694
gb = s.groupby(g)
695
695
gb.nlargest(3)
696
696
gb.nsmallest(3)
@@ -721,8 +721,8 @@ The dimension of the returned result can also change:
721
721
In [8]: grouped = df.groupby('A')['C']
722
722
723
723
In [10]: def f(group):
724
- ....: return DataFrame({'original' : group,
725
- ....: 'demeaned' : group - group.mean()})
724
+ ....: return pd.DataFrame({'original' : group,
725
+ ....: 'demeaned' : group - group.mean()})
726
726
....:
727
727
728
728
In [11]: grouped.apply(f)
@@ -732,8 +732,8 @@ The dimension of the returned result can also change:
732
732
.. ipython:: python
733
733
734
734
def f(x):
735
- return Series([ x, x**2 ], index = ['x', 'x^s'])
736
- s = Series(np.random.rand(5))
735
+ return pd.Series([ x, x**2 ], index = ['x', 'x^s'])
736
+ s = pd.Series(np.random.rand(5))
737
737
s
738
738
s.apply(f)
739
739
@@ -754,7 +754,7 @@ The dimension of the returned result can also change:
754
754
755
755
.. ipython:: python
756
756
757
- d = DataFrame({"a":["x", "y"], "b":[1,2]})
757
+ d = pd.DataFrame({"a":["x", "y"], "b":[1,2]})
758
758
def identity(df):
759
759
print df
760
760
return df
@@ -802,9 +802,9 @@ can be used as group keys. If so, the order of the levels will be preserved:
802
802
803
803
.. ipython:: python
804
804
805
- data = Series(np.random.randn(100))
805
+ data = pd.Series(np.random.randn(100))
806
806
807
- factor = qcut(data, [0, .25, .5, .75, 1.])
807
+ factor = pd.qcut(data, [0, .25, .5, .75, 1.])
808
808
809
809
data.groupby(factor).mean()
810
810
@@ -813,27 +813,28 @@ can be used as group keys. If so, the order of the levels will be preserved:
813
813
Grouping with a Grouper specification
814
814
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
815
815
816
- Your may need to specify a bit more data to properly group. You can
816
+ You may need to specify a bit more data to properly group. You can
817
817
use the ``pd.Grouper`` to provide this local control.
818
818
819
819
.. ipython:: python
820
820
821
- import datetime as DT
822
-
823
- df = DataFrame({
824
- 'Branch' : 'A A A A A A A B'.split(),
825
- 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
826
- 'Quantity': [1,3,5,1,8,1,9,3],
827
- 'Date' : [
828
- DT.datetime(2013,1,1,13,0),
829
- DT.datetime(2013,1,1,13,5),
830
- DT.datetime(2013,10,1,20,0),
831
- DT.datetime(2013,10,2,10,0),
832
- DT.datetime(2013,10,1,20,0),
833
- DT.datetime(2013,10,2,10,0),
834
- DT.datetime(2013,12,2,12,0),
835
- DT.datetime(2013,12,2,14,0),
836
- ]})
821
+ import datetime
822
+
823
+ df = pd.DataFrame({
824
+ 'Branch' : 'A A A A A A A B'.split(),
825
+ 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
826
+ 'Quantity': [1,3,5,1,8,1,9,3],
827
+ 'Date' : [
828
+ datetime.datetime(2013,1,1,13,0),
829
+ datetime.datetime(2013,1,1,13,5),
830
+ datetime.datetime(2013,10,1,20,0),
831
+ datetime.datetime(2013,10,2,10,0),
832
+ datetime.datetime(2013,10,1,20,0),
833
+ datetime.datetime(2013,10,2,10,0),
834
+ datetime.datetime(2013,12,2,12,0),
835
+ datetime.datetime(2013,12,2,14,0),
836
+ ]
837
+ })
837
838
838
839
df
839
840
@@ -862,7 +863,7 @@ Just like for a DataFrame or Series you can call head and tail on a groupby:
862
863
863
864
.. ipython:: python
864
865
865
- df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
866
+ df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
866
867
df
867
868
868
869
g = df.groupby('A')
@@ -894,7 +895,7 @@ To select from a DataFrame or Series the nth item, use the nth method. This is a
894
895
895
896
.. ipython:: python
896
897
897
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
898
+ df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
898
899
g = df.groupby('A')
899
900
900
901
g.nth(0)
@@ -919,7 +920,7 @@ As with other methods, passing ``as_index=False``, will achieve a filtration, wh
919
920
920
921
.. ipython:: python
921
922
922
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
923
+ df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
923
924
g = df.groupby('A',as_index=False)
924
925
925
926
g.nth(0)
@@ -929,8 +930,8 @@ You can also select multiple rows from each group by specifying multiple nth val
929
930
930
931
.. ipython:: python
931
932
932
- business_dates = date_range(start='4/1/2014', end='6/30/2014', freq='B')
933
- df = DataFrame(1, index=business_dates, columns=['a', 'b'])
933
+ business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B')
934
+ df = pd.DataFrame(1, index=business_dates, columns=['a', 'b'])
934
935
# get the first, 4th, and last date index for each month
935
936
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])
936
937
@@ -961,7 +962,7 @@ the values in column 1 where the group is "B" are 3 higher on average.
961
962
.. ipython:: python
962
963
963
964
np.random.seed(1234)
964
- df = DataFrame(np.random.randn(50, 2))
965
+ df = pd.DataFrame(np.random.randn(50, 2))
965
966
df['g'] = np.random.choice(['A', 'B'], size=50)
966
967
df.loc[df['g'] == 'B', 1] += 3
967
968
@@ -1010,11 +1011,11 @@ column index name will be used as the name of the inserted column:
1010
1011
.. ipython:: python
1011
1012
1012
1013
df = pd.DataFrame({
1013
- 'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
1014
- 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
1015
- 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
1016
- 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
1017
- })
1014
+ 'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
1015
+ 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
1016
+ 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
1017
+ 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
1018
+ })
1018
1019
1019
1020
def compute_metrics(x):
1020
1021
result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()}
0 commit comments