From d467a5c249c88512b840cc422e7d4a990fafc331 Mon Sep 17 00:00:00 2001 From: saurav Date: Wed, 21 Nov 2018 09:39:28 +0530 Subject: [PATCH 01/12] firt iter on errors --- doc/source/cookbook.rst | 459 +++++++++++++++++++++++----------------- 1 file changed, 269 insertions(+), 190 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 53468e755a722..b73358c67b1ab 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -9,18 +9,15 @@ import numpy as np from pandas.compat import StringIO - import random import os import itertools import functools import datetime - + import glob np.random.seed(123456) - pd.options.display.max_rows=15 + pd.options.display.max_rows = 15 - import matplotlib - # matplotlib.style.use('default') np.set_printoptions(precision=4, suppress=True) @@ -56,8 +53,9 @@ These are some neat pandas ``idioms`` .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df if-then... ********** @@ -66,36 +64,41 @@ An if-then on one column .. ipython:: python - df.loc[df.AAA >= 5,'BBB'] = -1; df + df.loc[df.AAA >= 5, 'BBB'] = -1 + df An if-then with assignment to 2 columns: .. ipython:: python - df.loc[df.AAA >= 5,['BBB','CCC']] = 555; df + df.loc[df.AAA >= 5, ['BBB', 'CCC']] = 555 + df Add another line with different logic, to do the -else .. ipython:: python - df.loc[df.AAA < 5,['BBB','CCC']] = 2000; df + df.loc[df.AAA < 5, ['BBB', 'CCC']] = 2000 + df Or use pandas where after you've set up a mask .. ipython:: python - df_mask = pd.DataFrame({'AAA' : [True] * 4, 'BBB' : [False] * 4,'CCC' : [True,False] * 2}) - df.where(df_mask,-1000) + df_mask = pd.DataFrame({'AAA': [True] * 4, 'BBB': [False] * 4, + 'CCC': [True, False] * 2}) + df.where(df_mask, -1000) `if-then-else using numpy's where() `__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df - - df['logic'] = np.where(df['AAA'] > 5,'high','low'); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df + df['logic'] = np.where(df['AAA'] > 5, 'high', 'low') + df Splitting ********* @@ -105,11 +108,14 @@ Splitting .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df - dflow = df[df.AAA <= 5]; dflow - dfhigh = df[df.AAA > 5]; dfhigh + dflow = df[df.AAA <= 5] + dflow + dfhigh = df[df.AAA > 5] + dfhigh Building Criteria ***************** @@ -119,45 +125,50 @@ Building Criteria .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df ...and (without assignment returns a Series) .. ipython:: python - newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA']; newseries + newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA'] + newseries ...or (without assignment returns a Series) .. ipython:: python - newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA']; newseries + newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA'] + newseries ...or (with assignment modifies the DataFrame.) .. ipython:: python - df.loc[(df['BBB'] > 25) | (df['CCC'] >= 75), 'AAA'] = 0.1; df + df.loc[(df['BBB'] > 25) | (df['CCC'] >= 75), 'AAA'] = 0.1 + df `Select rows with data closest to certain value using argsort `__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df - + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df aValue = 43.0 - df.loc[(df.CCC-aValue).abs().argsort()] + df.loc[(df.CCC - aValue).abs().argsort()] `Dynamically reduce a list of criteria using a binary operators `__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df Crit1 = df.AAA <= 5.5 Crit2 = df.BBB == 10.0 @@ -173,8 +184,8 @@ One could hard code: .. ipython:: python - CritList = [Crit1,Crit2,Crit3] - AllCrit = functools.reduce(lambda x,y: x & y, CritList) + CritList = [Crit1, Crit2, Crit3] + AllCrit = functools.reduce(lambda x, y: x & y, CritList) df[AllCrit] @@ -193,18 +204,22 @@ The :ref:`indexing ` docs. .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df - df[(df.AAA <= 6) & (df.index.isin([0,2,4]))] + df[(df.AAA <= 6) & (df.index.isin([0, 2, 4]))] `Use loc for label-oriented slicing and iloc positional slicing `__ .. ipython:: python - data = {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]} - df = pd.DataFrame(data=data,index=['foo','bar','boo','kar']); df + data = {'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]} + df = pd.DataFrame(data=data, index=['foo', 'bar', 'boo', 'kar']) + df There are 2 explicit slicing methods, with a third general case @@ -213,33 +228,35 @@ There are 2 explicit slicing methods, with a third general case 3. General (Either slicing style : depends on if the slice contains labels or positions) .. ipython:: python - df.iloc[0:3] #Positional + df.iloc[0:3] # Positional - df.loc['bar':'kar'] #Label + df.loc['bar': 'kar'] # Label # Generic df.iloc[0:3] - df.loc['bar':'kar'] + df.loc['bar': 'kar'] Ambiguity arises when an index consists of integers with a non-zero start or non-unit increment. .. ipython:: python - df2 = pd.DataFrame(data=data,index=[1,2,3,4]); #Note index starts at 1. + df2 = pd.DataFrame(data=data, index=[1, 2, 3, 4]) # Note index starts at 1. - df2.iloc[1:3] #Position-oriented + df2.iloc[1:3] # Position-oriented - df2.loc[1:3] #Label-oriented + df2.loc[1:3] # Label-oriented `Using inverse operator (~) to take the complement of a mask `__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40], 'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df - df[~((df.AAA <= 6) & (df.index.isin([0,2,4])))] + df[~((df.AAA <= 6) & (df.index.isin([0, 2, 4])))] Panels ****** @@ -249,14 +266,18 @@ Panels .. ipython:: python - rng = pd.date_range('1/1/2013',periods=100,freq='D') + rng = pd.date_range('1/1/2013', periods=100, freq='D') data = np.random.randn(100, 4) - cols = ['A','B','C','D'] - df1, df2, df3 = pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols) + cols = ['A', 'B', 'C', 'D'] + df1 = pd.DataFrame(data, rng, cols) + df2 = pd.DataFrame(data, rng, cols) + df3 = pd.DataFrame(data, rng, cols) - pf = pd.Panel({'df1':df1,'df2':df2,'df3':df3});pf + pf = pd.Panel({'df1': df1, 'df2': df2, 'df3': df3}) + pf - pf.loc[:,:,'F'] = pd.DataFrame(data, rng, cols);pf + pf.loc[:, :, 'F'] = pd.DataFrame(data, rng, cols) + pf `Mask a panel by using np.where and then reconstructing the panel with the new masked values `__ @@ -269,22 +290,26 @@ New Columns .. ipython:: python - df = pd.DataFrame( - {'AAA' : [1,2,1,3], 'BBB' : [1,1,2,2], 'CCC' : [2,1,3,1]}); df + df = pd.DataFrame({'AAA': [1, 2, 1, 3], + 'BBB': [1, 1, 2, 2], + 'CCC': [2, 1, 3, 1]}) + df - source_cols = df.columns # or some subset would work too. + source_cols = df.columns # or some subset would work too. new_cols = [str(x) + "_cat" for x in source_cols] - categories = {1 : 'Alpha', 2 : 'Beta', 3 : 'Charlie' } + categories = {1: 'Alpha', 2: 'Beta', 3: 'Charlie'} - df[new_cols] = df[source_cols].applymap(categories.get);df + df[new_cols] = df[source_cols].applymap(categories.get) + df `Keep other columns when using min() with groupby `__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [1,1,1,2,2,2,3,3], 'BBB' : [2,1,3,4,5,1,2,3]}); df + df = pd.DataFrame({'AAA': [1, 1, 1, 2, 2, 2, 3, 3], + 'BBB': [2, 1, 3, 4, 5, 1, 2, 3]}) + df Method 1 : idxmin() to get the index of the minimums @@ -312,20 +337,26 @@ The :ref:`multindexing ` docs. .. ipython:: python - df = pd.DataFrame({'row' : [0,1,2], - 'One_X' : [1.1,1.1,1.1], - 'One_Y' : [1.2,1.2,1.2], - 'Two_X' : [1.11,1.11,1.11], - 'Two_Y' : [1.22,1.22,1.22]}); df + df = pd.DataFrame({'row': [0, 1, 2], + 'One_X': [1.1, 1.1, 1.1], + 'One_Y': [1.2, 1.2, 1.2], + 'Two_X': [1.11, 1.11, 1.11], + 'Two_Y': [1.22, 1.22, 1.22]}) + df # As Labelled Index - df = df.set_index('row');df + df = df.set_index('row') + df # With Hierarchical Columns - df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) for c in df.columns]);df + df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) + for c in df.columns]) + df # Now stack & Reset - df = df.stack(0).reset_index(1);df + df = df.stack(0).reset_index(1) + df # And fix the labels (Notice the label 'level_1' got added automatically) - df.columns = ['Sample','All_X','All_Y'];df + df.columns = ['Sample', 'All_X', 'All_Y'] + df Arithmetic ********** @@ -335,9 +366,12 @@ Arithmetic .. ipython:: python - cols = pd.MultiIndex.from_tuples([ (x,y) for x in ['A','B','C'] for y in ['O','I']]) - df = pd.DataFrame(np.random.randn(2,6),index=['n','m'],columns=cols); df - df = df.div(df['C'],level=1); df + cols = pd.MultiIndex.from_tuples([(x, y) for x in ['A', 'B', 'C'] + for y in ['O', 'I']]) + df = pd.DataFrame(np.random.randn(2, 6), index=['n', 'm'], columns=cols) + df + df = df.div(df['C'], level=1) + df Slicing ******* @@ -347,44 +381,49 @@ Slicing .. ipython:: python - coords = [('AA','one'),('AA','six'),('BB','one'),('BB','two'),('BB','six')] + coords = [('AA', 'one'), ('AA', 'six'), ('BB', 'one'), ('BB', 'two'), + ('BB', 'six')] index = pd.MultiIndex.from_tuples(coords) - df = pd.DataFrame([11,22,33,44,55],index,['MyData']); df + df = pd.DataFrame([11, 22, 33, 44, 55], index, ['MyData']) + df To take the cross section of the 1st level and 1st axis the index: .. ipython:: python - df.xs('BB',level=0,axis=0) #Note : level and axis are optional, and default to zero + # Note : level and axis are optional, and default to zero + df.xs('BB', level=0, axis=0) ...and now the 2nd level of the 1st axis. .. ipython:: python - df.xs('six',level=1,axis=0) + df.xs('six', level=1, axis=0) `Slicing a MultiIndex with xs, method #2 `__ .. ipython:: python - index = list(itertools.product(['Ada','Quinn','Violet'],['Comp','Math','Sci'])) - headr = list(itertools.product(['Exams','Labs'],['I','II'])) + index = list(itertools.product(['Ada', 'Quinn', 'Violet'], + ['Comp', 'Math', 'Sci'])) + headr = list(itertools.product(['Exams', 'Labs'], ['I', 'II'])) - indx = pd.MultiIndex.from_tuples(index,names=['Student','Course']) - cols = pd.MultiIndex.from_tuples(headr) #Notice these are un-named + indx = pd.MultiIndex.from_tuples(index, names=['Student', 'Course']) + cols = pd.MultiIndex.from_tuples(headr) # Notice these are un-named - data = [[70+x+y+(x*y)%3 for x in range(4)] for y in range(9)] + data = [[70 + x + y + (x * y) % 3 for x in range(4)] for y in range(9)] - df = pd.DataFrame(data,indx,cols); df + df = pd.DataFrame(data, indx, cols) + df All = slice(None) df.loc['Violet'] - df.loc[(All,'Math'),All] - df.loc[(slice('Ada','Quinn'),'Math'),All] - df.loc[(All,'Math'),('Exams')] - df.loc[(All,'Math'),(All,'II')] + df.loc[(All, 'Math'), All] + df.loc[(slice('Ada', 'Quinn'), 'Math'), All] + df.loc[(All, 'Math'), ('Exams')] + df.loc[(All, 'Math'), (All, 'II')] `Setting portions of a MultiIndex with xs `__ @@ -422,7 +461,9 @@ Fill forward a reversed timeseries .. ipython:: python - df = pd.DataFrame(np.random.randn(6,1), index=pd.date_range('2013-08-01', periods=6, freq='B'), columns=list('A')) + df = pd.DataFrame(np.random.randn(6, 1), + index=pd.date_range('2013-08-01', periods=6, freq='B'), + columns=list('A')) df.loc[df.index[3], 'A'] = np.nan df df.reindex(df.index[::-1]).ffill() @@ -453,9 +494,10 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df = pd.DataFrame({'animal': 'cat dog cat fish dog cat cat'.split(), 'size': list('SSMMMLL'), 'weight': [8, 10, 11, 1, 20, 12, 12], - 'adult' : [False] * 5 + [True] * 2}); df + 'adult': [False] * 5 + [True] * 2}) + df - #List the size of the animals with the highest weight. + # List the size of the animals with the highest weight. df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()]) `Using get_group @@ -473,11 +515,12 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python def GrowUp(x): - avg_weight = sum(x[x['size'] == 'S'].weight * 1.5) - avg_weight += sum(x[x['size'] == 'M'].weight * 1.25) - avg_weight += sum(x[x['size'] == 'L'].weight) - avg_weight /= len(x) - return pd.Series(['L',avg_weight,True], index=['size', 'weight', 'adult']) + avg_weight = sum(x[x['size'] == 'S'].weight * 1.5) + avg_weight += sum(x[x['size'] == 'M'].weight * 1.25) + avg_weight += sum(x[x['size'] == 'L'].weight) + avg_weight /= len(x) + return pd.Series(['L', avg_weight, True], + index=['size', 'weight', 'adult']) expected_df = gb.apply(GrowUp) @@ -488,13 +531,13 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - S = pd.Series([i / 100.0 for i in range(1,11)]) + S = pd.Series([i / 100.0 for i in range(1, 11)]) - def CumRet(x,y): - return x * (1 + y) + def CumRet(x, y): + return x * (1 + y) def Red(x): - return functools.reduce(CumRet,x,1.0) + return functools.reduce(CumRet, x, 1.0) S.expanding().apply(Red, raw=True) @@ -504,7 +547,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'A' : [1, 1, 2, 2], 'B' : [1, -1, 1, 2]}) + df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, -1, 1, 2]}) gb = df.groupby('A') def replace(g): @@ -535,15 +578,15 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - rng = pd.date_range(start="2014-10-07",periods=10,freq='2min') - ts = pd.Series(data = list(range(10)), index = rng) + rng = pd.date_range(start="2014-10-07", periods=10, freq='2min') + ts = pd.Series(data=list(range(10)), index=rng) def MyCust(x): - if len(x) > 2: - return x[1] * 1.234 - return pd.NaT + if len(x) > 2: + return x[1] * 1.234 + return pd.NaT - mhc = {'Mean' : np.mean, 'Max' : np.max, 'Custom' : MyCust} + mhc = {'Mean': np.mean, 'Max': np.max, 'Custom': MyCust} ts.resample("5min").apply(mhc) ts @@ -553,7 +596,8 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python df = pd.DataFrame({'Color': 'Red Red Red Blue'.split(), - 'Value': [100, 150, 50, 50]}); df + 'Value': [100, 150, 50, 50]}) + df df['Counts'] = df.groupby(['Color']).transform(len) df @@ -562,11 +606,12 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame( - {u'line_race': [10, 10, 8, 10, 10, 8], - u'beyer': [99, 102, 103, 103, 88, 100]}, - index=[u'Last Gunfighter', u'Last Gunfighter', u'Last Gunfighter', - u'Paynter', u'Paynter', u'Paynter']); df + df = pd.DataFrame({u'line_race': [10, 10, 8, 10, 10, 8], + u'beyer': [99, 102, 103, 103, 88, 100]}, + index=[u'Last Gunfighter', u'Last Gunfighter', + u'Last Gunfighter', u'Paynter', u'Paynter', + u'Paynter']) + df df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift(1) df @@ -575,9 +620,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'host':['other','other','that','this','this'], - 'service':['mail','web','mail','mail','web'], - 'no':[1, 2, 1, 2, 1]}).set_index(['host', 'service']) + df = pd.DataFrame({'host': ['other', 'other', 'that', 'this', 'this'], + 'service': ['mail', 'web', 'mail', 'mail', 'web'], + 'no': [1, 2, 1, 2, 1]}).set_index(['host', 'service']) mask = df.groupby(level=0).agg('idxmax') df_count = df.loc[mask['no']].reset_index() df_count @@ -613,10 +658,12 @@ Create a list of dataframes, split using a delineation based on logic included i .. ipython:: python - df = pd.DataFrame(data={'Case' : ['A','A','A','B','A','A','B','A','A'], - 'Data' : np.random.randn(9)}) + df = pd.DataFrame(data={'Case': ['A', 'A', 'A', 'B', 'A', 'A', 'B', 'A', + 'A'], + 'Data': np.random.randn(9)}) - dfs = list(zip(*df.groupby((1*(df['Case']=='B')).cumsum().rolling(window=3,min_periods=1).median())))[-1] + dfs = list(zip(*df.groupby((1 * (df['Case'] == 'B')).cumsum() + .rolling(window=3, min_periods=1).median())))[-1] dfs[0] dfs[1] @@ -633,10 +680,13 @@ The :ref:`Pivot ` docs. .. ipython:: python - df = pd.DataFrame(data={'Province' : ['ON','QC','BC','AL','AL','MN','ON'], - 'City' : ['Toronto','Montreal','Vancouver','Calgary','Edmonton','Winnipeg','Windsor'], - 'Sales' : [13,6,16,8,4,3,1]}) - table = pd.pivot_table(df,values=['Sales'],index=['Province'],columns=['City'],aggfunc=np.sum,margins=True) + df = pd.DataFrame(data={'Province': ['ON', 'QC', 'BC', 'AL', 'AL', 'MN', 'ON'], + 'City': ['Toronto', 'Montreal', 'Vancouver', + 'Calgary', 'Edmonton', 'Winnipeg', + 'Windsor'], + 'Sales': [13, 6, 16, 8, 4, 3, 1]}) + table = pd.pivot_table(df, values=['Sales'], index=['Province'], + columns=['City'], aggfunc=np.sum, margins=True) table.stack('City') `Frequency table like plyr in R @@ -644,20 +694,26 @@ The :ref:`Pivot ` docs. .. ipython:: python - grades = [48,99,75,80,42,80,72,68,36,78] - df = pd.DataFrame( {'ID': ["x%d" % r for r in range(10)], - 'Gender' : ['F', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M'], - 'ExamYear': ['2007','2007','2007','2008','2008','2008','2008','2009','2009','2009'], - 'Class': ['algebra', 'stats', 'bio', 'algebra', 'algebra', 'stats', 'stats', 'algebra', 'bio', 'bio'], - 'Participated': ['yes','yes','yes','yes','no','yes','yes','yes','yes','yes'], - 'Passed': ['yes' if x > 50 else 'no' for x in grades], - 'Employed': [True,True,True,False,False,False,False,True,True,False], - 'Grade': grades}) + grades = [48, 99, 75, 80, 42, 80, 72, 68, 36, 78] + df = pd.DataFrame({'ID': ["x%d" % r for r in range(10)], + 'Gender': ['F', 'M', 'F', 'M', 'F', + 'M', 'F', 'M', 'M', 'M'], + 'ExamYear': ['2007', '2007', '2007', '2008', '2008', + '2008', '2008', '2009', '2009', '2009'], + 'Class': ['algebra', 'stats', 'bio', 'algebra', + 'algebra', 'stats', 'stats', 'algebra', + 'bio', 'bio'], + 'Participated': ['yes', 'yes', 'yes', 'yes', 'no', + 'yes', 'yes', 'yes', 'yes', 'yes'], + 'Passed': ['yes' if x > 50 else 'no' for x in grades], + 'Employed': [True, True, True, False, + False, False, False, True, True, False], + 'Grade': grades}) df.groupby('ExamYear').agg({'Participated': lambda x: x.value_counts()['yes'], - 'Passed': lambda x: sum(x == 'yes'), - 'Employed' : lambda x : sum(x), - 'Grade' : lambda x : sum(x) / len(x)}) + 'Passed': lambda x: sum(x == 'yes'), + 'Employed': lambda x: sum(x), + 'Grade': lambda x: sum(x) / len(x)}) `Plot pandas DataFrame with year over year data `__ @@ -680,12 +736,15 @@ Apply .. ipython:: python - df = pd.DataFrame(data={'A' : [[2,4,8,16],[100,200],[10,20,30]], 'B' : [['a','b','c'],['jj','kk'],['ccc']]},index=['I','II','III']) + df = pd.DataFrame(data={'A': [[2, 4, 8, 16], [100, 200], [10, 20, 30]], + 'B': [['a', 'b', 'c'], ['jj', 'kk'], ['ccc']]}, + index=['I', 'II', 'III']) def SeriesFromSubList(aList): - return pd.Series(aList) + return pd.Series(aList) - df_orgz = pd.concat(dict([ (ind,row.apply(SeriesFromSubList)) for ind,row in df.iterrows() ])) + df_orgz = pd.concat({[(ind, row.apply(SeriesFromSubList)) + for ind, row in df.iterrows()]}) `Rolling Apply with a DataFrame returning a Series `__ @@ -694,15 +753,18 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc .. ipython:: python - df = pd.DataFrame(data=np.random.randn(2000,2)/10000, - index=pd.date_range('2001-01-01',periods=2000), - columns=['A','B']); df + df = pd.DataFrame(data=np.random.randn(2000, 2) / 10000, + index=pd.date_range('2001-01-01', periods=2000), + columns=['A', 'B']) + df - def gm(aDF,Const): - v = ((((aDF.A+aDF.B)+1).cumprod())-1)*Const - return (aDF.index[0],v.iloc[-1]) + def gm(aDF, Const): + v = ((((aDF.A + aDF.B) + 1).cumprod()) - 1) * Const + return (aDF.index[0], v.iloc[-1]) - S = pd.Series(dict([ gm(df.iloc[i:min(i+51,len(df)-1)],5) for i in range(len(df)-50) ])); S + S = pd.Series({[gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) + for i in range(len(df) - 50)]}) + S `Rolling apply with a DataFrame returning a Scalar `__ @@ -711,14 +773,20 @@ Rolling Apply to multiple columns where function returns a Scalar (Volume Weight .. ipython:: python - rng = pd.date_range(start = '2014-01-01',periods = 100) - df = pd.DataFrame({'Open' : np.random.randn(len(rng)), - 'Close' : np.random.randn(len(rng)), - 'Volume' : np.random.randint(100,2000,len(rng))}, index=rng); df + rng = pd.date_range(start='2014-01-01', periods=100) + df = pd.DataFrame({'Open': np.random.randn(len(rng)), + 'Close': np.random.randn(len(rng)), + 'Volume': np.random.randint(100, 2000, len(rng))}, + index=rng) + df - def vwap(bars): return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()) + def vwap(bars): + return ((bars.Close * bars.Volume).sum() / bars.Volume.sum()) window = 5 - s = pd.concat([ (pd.Series(vwap(df.iloc[i:i+window]), index=[df.index[i+window]])) for i in range(len(df)-window) ]); + s = pd.concat([(pd.Series(vwap(df.iloc[i:i + window]), + index=[df.index[i + window]])) + for i in range(len(df) - window)]) + s.round(2) s.round(2) Timeseries @@ -806,21 +874,25 @@ Depending on df construction, ``ignore_index`` may be needed .. ipython:: python - df = df1.append(df2,ignore_index=True); df + df = df1.append(df2, ignore_index=True) + df `Self Join of a DataFrame `__ .. ipython:: python - df = pd.DataFrame(data={'Area' : ['A'] * 5 + ['C'] * 2, - 'Bins' : [110] * 2 + [160] * 3 + [40] * 2, - 'Test_0' : [0, 1, 0, 1, 2, 0, 1], - 'Data' : np.random.randn(7)});df + df = pd.DataFrame(data={'Area': ['A'] * 5 + ['C'] * 2, + 'Bins': [110] * 2 + [160] * 3 + [40] * 2, + 'Test_0': [0, 1, 0, 1, 2, 0, 1], + 'Data': np.random.randn(7)}) + df df['Test_1'] = df['Test_0'] - 1 - pd.merge(df, df, left_on=['Bins', 'Area','Test_0'], right_on=['Bins', 'Area','Test_1'],suffixes=('_L','_R')) + pd.merge(df, df, left_on=['Bins', 'Area', 'Test_0'], + right_on=['Bins', 'Area', 'Test_1'], + suffixes=('_L', '_R')) `How to set the index and join `__ @@ -871,8 +943,8 @@ The :ref:`Plotting ` docs. .. ipython:: python df = pd.DataFrame( - {u'stratifying_var': np.random.uniform(0, 100, 20), - u'price': np.random.normal(100, 5, 20)}) + {u'stratifying_var': np.random.uniform(0, 100, 20), + u'price': np.random.normal(100, 5, 20)}) df[u'quartiles'] = pd.qcut( df[u'stratifying_var'], @@ -951,7 +1023,6 @@ You can use the same approach to read all files matching a pattern. Here is an .. ipython:: python - import glob files = glob.glob('file_*.csv') result = pd.concat([pd.read_csv(f) for f in files], ignore_index=True) @@ -970,9 +1041,9 @@ Parsing date components in multi-columns is faster with a format .. code-block:: ipython - In [30]: i = pd.date_range('20000101',periods=10000) + In [30]: i = pd.date_range('20000101', periods=10000) - In [31]: df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day)) + In [31]: df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) In [32]: df.head() Out[32]: @@ -983,11 +1054,12 @@ Parsing date components in multi-columns is faster with a format 3 4 1 2000 4 5 1 2000 - In [33]: %timeit pd.to_datetime(df.year*10000+df.month*100+df.day,format='%Y%m%d') - 100 loops, best of 3: 7.08 ms per loop + In [33]: %timeit pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, format='%Y%m%d') + 4.8 ms ± 23.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) # simulate combinging into a string, then parsing - In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'],x['month'],x['day']),axis=1) + In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'], + x['month'], x['day']), axis=1) In [35]: ds.head() Out[35]: @@ -999,6 +1071,7 @@ Parsing date components in multi-columns is faster with a format dtype: object In [36]: %timeit pd.to_datetime(ds) + Out[36]: 1 loops, best of 3: 488 ms per loop Skip row between header and data @@ -1032,8 +1105,8 @@ Option 1: pass rows explicitly to skip rows .. ipython:: python - pd.read_csv(StringIO(data), sep=';', skiprows=[11,12], - index_col=0, parse_dates=True, header=10) + pd.read_csv(StringIO(data), sep=';', skiprows=[11, 12], + index_col=0, parse_dates=True, header=10) Option 2: read column names and then data """"""""""""""""""""""""""""""""""""""""" @@ -1138,12 +1211,12 @@ Storing Attributes to a group node .. ipython:: python - df = pd.DataFrame(np.random.randn(8,3)) + df = pd.DataFrame(np.random.randn(8, 3)) store = pd.HDFStore('test.h5') - store.put('df',df) + store.put('df', df) # you can store an arbitrary Python object via pickle - store.get_storer('df').attrs.my_attribute = dict(A = 10) + store.get_storer('df').attrs.my_attribute = {'A': 10} store.get_storer('df').attrs.my_attribute .. ipython:: python @@ -1267,6 +1340,7 @@ The `method` argument within `DataFrame.corr` can accept a callable in addition ... return cov_ab / std_a / std_b ... ... + ... >>> df = pd.DataFrame(np.random.normal(size=(100, 3))) ... >>> df.corr(method=distcorr) @@ -1285,17 +1359,17 @@ The :ref:`Timedeltas ` docs. .. ipython:: python - s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) + s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) s - s.max() s.max() - s - s - datetime.datetime(2011,1,1,3,5) + s - datetime.datetime(2011, 1, 1, 3, 5) s + datetime.timedelta(minutes=5) - datetime.datetime(2011,1,1,3,5) - s + datetime.datetime(2011, 1, 1, 3, 5) - s datetime.timedelta(minutes=5) + s @@ -1304,13 +1378,15 @@ The :ref:`Timedeltas ` docs. .. ipython:: python - deltas = pd.Series([ datetime.timedelta(days=i) for i in range(3) ]) + deltas = pd.Series([datetime.timedelta(days=i) for i in range(3)]) - df = pd.DataFrame(dict(A = s, B = deltas)); df + df = pd.DataFrame({'A': s, 'B': deltas}) + df - df['New Dates'] = df['A'] + df['B']; + df['New Dates'] = df['A'] + df['B'] - df['Delta'] = df['A'] - df['New Dates']; df + df['Delta'] = df['A'] - df['New Dates'] + df df.dtypes @@ -1321,9 +1397,11 @@ Values can be set to NaT using np.nan, similar to datetime .. ipython:: python - y = s - s.shift(); y + y = s - s.shift() + y - y[1] = np.nan; y + y[1] = np.nan + y Aliasing Axis Names ------------------- @@ -1333,23 +1411,24 @@ To globally provide aliases for axis names, one can define these 2 functions: .. ipython:: python def set_axis_alias(cls, axis, alias): - if axis not in cls._AXIS_NUMBERS: - raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) - cls._AXIS_ALIASES[alias] = axis + if axis not in cls._AXIS_NUMBERS: + raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) + cls._AXIS_ALIASES[alias] = axis .. ipython:: python def clear_axis_alias(cls, axis, alias): - if axis not in cls._AXIS_NUMBERS: - raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) - cls._AXIS_ALIASES.pop(alias,None) + if axis not in cls._AXIS_NUMBERS: + raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) + cls._AXIS_ALIASES.pop(alias, None) .. ipython:: python - set_axis_alias(pd.DataFrame,'columns', 'myaxis2') - df2 = pd.DataFrame(np.random.randn(3,2),columns=['c1','c2'],index=['i1','i2','i3']) + set_axis_alias(pd.DataFrame, 'columns', 'myaxis2') + df2 = pd.DataFrame(np.random.randn(3, 2), columns=['c1', 'c2'], + index=['i1', 'i2', 'i3']) df2.sum(axis='myaxis2') - clear_axis_alias(pd.DataFrame,'columns', 'myaxis2') + clear_axis_alias(pd.DataFrame, 'columns', 'myaxis2') Creating Example Data --------------------- @@ -1362,11 +1441,11 @@ of the data values: def expand_grid(data_dict): - rows = itertools.product(*data_dict.values()) - return pd.DataFrame.from_records(rows, columns=data_dict.keys()) + rows = itertools.product(*data_dict.values()) + return pd.DataFrame.from_records(rows, columns=data_dict.keys()) df = expand_grid( - {'height': [60, 70], - 'weight': [100, 140, 180], - 'sex': ['Male', 'Female']}) + {'height': [60, 70], + 'weight': [100, 140, 180], + 'sex': ['Male', 'Female']}) df From c3c26ea9ad43aa0975281a94fdada44eca64ef1d Mon Sep 17 00:00:00 2001 From: saurav Date: Wed, 21 Nov 2018 09:54:52 +0530 Subject: [PATCH 02/12] compatible wiht PEP-8 standard --- doc/source/cookbook.rst | 45 ++++++++++------------------------------- 1 file changed, 11 insertions(+), 34 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index b73358c67b1ab..3a4fa4ad518fb 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -1039,40 +1039,17 @@ Parsing date components in multi-columns Parsing date components in multi-columns is faster with a format -.. code-block:: ipython - - In [30]: i = pd.date_range('20000101', periods=10000) - - In [31]: df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) - - In [32]: df.head() - Out[32]: - day month year - 0 1 1 2000 - 1 2 1 2000 - 2 3 1 2000 - 3 4 1 2000 - 4 5 1 2000 - - In [33]: %timeit pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, format='%Y%m%d') - 4.8 ms ± 23.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - - # simulate combinging into a string, then parsing - In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'], - x['month'], x['day']), axis=1) - - In [35]: ds.head() - Out[35]: - 0 20000101 - 1 20000102 - 2 20000103 - 3 20000104 - 4 20000105 - dtype: object - - In [36]: %timeit pd.to_datetime(ds) - Out[36]: - 1 loops, best of 3: 488 ms per loop +.. ipython:: python + i = pd.date_range('20000101', periods=10000) + df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) + df.head() + %timeit pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, + format='%Y%m%d') + ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'], + x['month'], x['day']), axis=1) + + ds.head() + %timeit pd.to_datetime(ds) Skip row between header and data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 27b32452358bf6555be39753b73c3b07ce695e9b Mon Sep 17 00:00:00 2001 From: saurav Date: Wed, 21 Nov 2018 10:54:36 +0530 Subject: [PATCH 03/12] DOC: compatible wiht PEP-8 standard --- doc/source/cookbook.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 3a4fa4ad518fb..bd2b245adb3f5 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -1038,7 +1038,7 @@ Parsing date components in multi-columns ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Parsing date components in multi-columns is faster with a format - + .. ipython:: python i = pd.date_range('20000101', periods=10000) df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) From 01d98267f6060727f34a2233d4b84420c0463f87 Mon Sep 17 00:00:00 2001 From: saurav Date: Wed, 21 Nov 2018 16:30:29 +0530 Subject: [PATCH 04/12] DOC: PEP8 compatibility for cookbook.rst --- doc/source/cookbook.rst | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index bd2b245adb3f5..76fee27d7a001 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -743,8 +743,9 @@ Apply def SeriesFromSubList(aList): return pd.Series(aList) - df_orgz = pd.concat({[(ind, row.apply(SeriesFromSubList)) - for ind, row in df.iterrows()]}) + df_orgz = pd.concat(dict([(ind, row.apply(SeriesFromSubList)) + for ind, row in df.iterrows()])) + df_orgz `Rolling Apply with a DataFrame returning a Series `__ @@ -762,8 +763,8 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc v = ((((aDF.A + aDF.B) + 1).cumprod()) - 1) * Const return (aDF.index[0], v.iloc[-1]) - S = pd.Series({[gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) - for i in range(len(df) - 50)]}) + S = pd.Series(dict([gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) + for i in range(len(df) - 50)])) S `Rolling apply with a DataFrame returning a Scalar @@ -782,12 +783,12 @@ Rolling Apply to multiple columns where function returns a Scalar (Volume Weight def vwap(bars): return ((bars.Close * bars.Volume).sum() / bars.Volume.sum()) + window = 5 s = pd.concat([(pd.Series(vwap(df.iloc[i:i + window]), index=[df.index[i + window]])) for i in range(len(df) - window)]) s.round(2) - s.round(2) Timeseries ---------- @@ -1038,8 +1039,10 @@ Parsing date components in multi-columns ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Parsing date components in multi-columns is faster with a format - + .. ipython:: python + :suppress: + i = pd.date_range('20000101', periods=10000) df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) df.head() From 2df61db5b560c7fcb85eb0e1cb262bf479cafdd1 Mon Sep 17 00:00:00 2001 From: saurav Date: Wed, 21 Nov 2018 19:50:58 +0530 Subject: [PATCH 05/12] DOC: PEP8 compatibility for cookbook.rst --- doc/source/cookbook.rst | 87 +++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 76fee27d7a001..b729dab60b847 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -53,7 +53,8 @@ These are some neat pandas ``idioms`` .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], 'CCC': [100, 50, -30, -50]}) df @@ -85,7 +86,8 @@ Or use pandas where after you've set up a mask .. ipython:: python - df_mask = pd.DataFrame({'AAA': [True] * 4, 'BBB': [False] * 4, + df_mask = pd.DataFrame({'AAA': [True] * 4, + 'BBB': [False] * 4, 'CCC': [True, False] * 2}) df.where(df_mask, -1000) @@ -94,7 +96,8 @@ Or use pandas where after you've set up a mask .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], 'CCC': [100, 50, -30, -50]}) df df['logic'] = np.where(df['AAA'] > 5, 'high', 'low') @@ -108,14 +111,13 @@ Splitting .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], 'CCC': [100, 50, -30, -50]}) df - dflow = df[df.AAA <= 5] - dflow - dfhigh = df[df.AAA > 5] - dfhigh + df[df.AAA <= 5] + df[df.AAA > 5] Building Criteria ***************** @@ -125,7 +127,8 @@ Building Criteria .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], 'CCC': [100, 50, -30, -50]}) df @@ -133,15 +136,13 @@ Building Criteria .. ipython:: python - newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA'] - newseries + df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA'] ...or (without assignment returns a Series) .. ipython:: python - newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA'] - newseries + df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA'] ...or (with assignment modifies the DataFrame.) @@ -155,7 +156,8 @@ Building Criteria .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], 'CCC': [100, 50, -30, -50]}) df aValue = 43.0 @@ -166,7 +168,8 @@ Building Criteria .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], 'CCC': [100, 50, -30, -50]}) df @@ -204,7 +207,8 @@ The :ref:`indexing ` docs. .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], 'CCC': [100, 50, -30, -50]}) df @@ -215,11 +219,11 @@ The :ref:`indexing ` docs. .. ipython:: python - data = {'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]} - df = pd.DataFrame(data=data, index=['foo', 'bar', 'boo', 'kar']) - df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}, + index=['foo', 'bar', 'boo', 'kar']) + There are 2 explicit slicing methods, with a third general case @@ -295,7 +299,7 @@ New Columns 'CCC': [2, 1, 3, 1]}) df - source_cols = df.columns # or some subset would work too. + source_cols = df.columns # Or some subset would work too new_cols = [str(x) + "_cat" for x in source_cols] categories = {1: 'Alpha', 2: 'Beta', 3: 'Charlie'} @@ -533,10 +537,10 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to S = pd.Series([i / 100.0 for i in range(1, 11)]) - def CumRet(x, y): + def cumRet(x, y): return x * (1 + y) - def Red(x): + def red(x): return functools.reduce(CumRet, x, 1.0) S.expanding().apply(Red, raw=True) @@ -606,11 +610,11 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({u'line_race': [10, 10, 8, 10, 10, 8], - u'beyer': [99, 102, 103, 103, 88, 100]}, - index=[u'Last Gunfighter', u'Last Gunfighter', - u'Last Gunfighter', u'Paynter', u'Paynter', - u'Paynter']) + df = pd.DataFrame({'line_race': [10, 10, 8, 10, 10, 8], + 'beyer': [99, 102, 103, 103, 88, 100]}, + index=['Last Gunfighter', 'Last Gunfighter', + 'Last Gunfighter', 'Paynter', 'Paynter', + 'Paynter']) df df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift(1) df @@ -759,9 +763,9 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc columns=['A', 'B']) df - def gm(aDF, Const): - v = ((((aDF.A + aDF.B) + 1).cumprod()) - 1) * Const - return (aDF.index[0], v.iloc[-1]) + def gm(df, const): + v = ((((df.A + df.B) + 1).cumprod()) - 1) * const + return (df.index[0], v.iloc[-1]) S = pd.Series(dict([gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) for i in range(len(df) - 50)])) @@ -944,16 +948,16 @@ The :ref:`Plotting ` docs. .. ipython:: python df = pd.DataFrame( - {u'stratifying_var': np.random.uniform(0, 100, 20), - u'price': np.random.normal(100, 5, 20)}) + {'stratifying_var': np.random.uniform(0, 100, 20), + 'price': np.random.normal(100, 5, 20)}) - df[u'quartiles'] = pd.qcut( - df[u'stratifying_var'], + df['quartiles'] = pd.qcut( + df['stratifying_var'], 4, - labels=[u'0-25%', u'25-50%', u'50-75%', u'75-100%']) + labels=['0-25%', '25-50%', '50-75%', '75-100%']) @savefig quartile_boxplot.png - df.boxplot(column=u'price', by=u'quartiles') + df.boxplot(column='price', by='quartiles') Data In/Out ----------- @@ -1424,8 +1428,7 @@ of the data values: rows = itertools.product(*data_dict.values()) return pd.DataFrame.from_records(rows, columns=data_dict.keys()) - df = expand_grid( - {'height': [60, 70], - 'weight': [100, 140, 180], - 'sex': ['Male', 'Female']}) + df = expand_grid({'height': [60, 70], + 'weight': [100, 140, 180], + 'sex': ['Male', 'Female']}) df From cd94d8d337914bbc3387d32a789fa8e5ddd4803d Mon Sep 17 00:00:00 2001 From: saurav Date: Wed, 21 Nov 2018 23:34:20 +0530 Subject: [PATCH 06/12] DOC: PEP8 compatible v3 --- doc/source/cookbook.rst | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index b729dab60b847..a9c0be5052e73 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -243,11 +243,13 @@ There are 2 explicit slicing methods, with a third general case Ambiguity arises when an index consists of integers with a non-zero start or non-unit increment. .. ipython:: python + :verbatim: + data = {'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) df2 = pd.DataFrame(data=data, index=[1, 2, 3, 4]) # Note index starts at 1. - df2.iloc[1:3] # Position-oriented - df2.loc[1:3] # Label-oriented `Using inverse operator (~) to take the complement of a mask @@ -537,13 +539,13 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to S = pd.Series([i / 100.0 for i in range(1, 11)]) - def cumRet(x, y): + def cum_ret(x, y): return x * (1 + y) def red(x): - return functools.reduce(CumRet, x, 1.0) + return functools.reduce(cum_ret, x, 1.0) - S.expanding().apply(Red, raw=True) + S.expanding().apply(red, raw=True) `Replacing some values with mean of the rest of a group @@ -747,8 +749,8 @@ Apply def SeriesFromSubList(aList): return pd.Series(aList) - df_orgz = pd.concat(dict([(ind, row.apply(SeriesFromSubList)) - for ind, row in df.iterrows()])) + df_orgz = pd.concat({ind: row.apply(SeriesFromSubList) + for ind, row in df.iterrows()}) df_orgz `Rolling Apply with a DataFrame returning a Series @@ -765,11 +767,11 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc def gm(df, const): v = ((((df.A + df.B) + 1).cumprod()) - 1) * const - return (df.index[0], v.iloc[-1]) + return (v.iloc[-1]) - S = pd.Series(dict([gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) - for i in range(len(df) - 50)])) - S + s = pd.Series({df.index[i]: gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) + for i in range(len(df) - 50)}) + s `Rolling apply with a DataFrame returning a Scalar `__ @@ -1034,7 +1036,7 @@ You can use the same approach to read all files matching a pattern. Here is an Finally, this strategy will work with the other ``pd.read_*(...)`` functions described in the :ref:`io docs`. .. ipython:: python - :suppress: + :verbatim: for i in range(3): os.remove('file_{}.csv'.format(i)) @@ -1045,16 +1047,14 @@ Parsing date components in multi-columns Parsing date components in multi-columns is faster with a format .. ipython:: python - :suppress: + :verbatim: i = pd.date_range('20000101', periods=10000) df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) df.head() - %timeit pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, - format='%Y%m%d') + %timeit pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, format='%Y%m%d') ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'], x['month'], x['day']), axis=1) - ds.head() %timeit pd.to_datetime(ds) From 085ac5fcfe98b821ea07f9520df3a6a86c60a8f8 Mon Sep 17 00:00:00 2001 From: saurav Date: Wed, 21 Nov 2018 23:36:35 +0530 Subject: [PATCH 07/12] DOC: PEP8 compatible v4 --- doc/source/cookbook.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index a9c0be5052e73..8819f4ba9e311 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -247,7 +247,7 @@ Ambiguity arises when an index consists of integers with a non-zero start or non data = {'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + 'CCC': [100, 50, -30, -50]} df2 = pd.DataFrame(data=data, index=[1, 2, 3, 4]) # Note index starts at 1. df2.iloc[1:3] # Position-oriented df2.loc[1:3] # Label-oriented From ce05dba4a28fb8fa8892edd2cb5ffa4d5015e801 Mon Sep 17 00:00:00 2001 From: saurav Date: Wed, 21 Nov 2018 23:53:25 +0530 Subject: [PATCH 08/12] DOC: PEP8 compatible v5 --- doc/source/cookbook.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 8819f4ba9e311..176ea93a72fc1 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -234,11 +234,11 @@ There are 2 explicit slicing methods, with a third general case .. ipython:: python df.iloc[0:3] # Positional - df.loc['bar': 'kar'] # Label + df.loc['bar':'kar'] # Label # Generic df.iloc[0:3] - df.loc['bar': 'kar'] + df.loc['bar':'kar'] Ambiguity arises when an index consists of integers with a non-zero start or non-unit increment. From acfb29d2e5971e09e261976b42601c905531a7e6 Mon Sep 17 00:00:00 2001 From: saurav Date: Thu, 22 Nov 2018 01:35:54 +0530 Subject: [PATCH 09/12] DOC: PEP8 compatible v6 --- doc/source/cookbook.rst | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 176ea93a72fc1..1e6a145654224 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -8,17 +8,13 @@ import pandas as pd import numpy as np from pandas.compat import StringIO - import os import itertools import functools import datetime import glob np.random.seed(123456) - pd.options.display.max_rows = 15 - - np.set_printoptions(precision=4, suppress=True) @@ -243,7 +239,6 @@ There are 2 explicit slicing methods, with a third general case Ambiguity arises when an index consists of integers with a non-zero start or non-unit increment. .. ipython:: python - :verbatim: data = {'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], @@ -414,17 +409,13 @@ To take the cross section of the 1st level and 1st axis the index: index = list(itertools.product(['Ada', 'Quinn', 'Violet'], ['Comp', 'Math', 'Sci'])) headr = list(itertools.product(['Exams', 'Labs'], ['I', 'II'])) - indx = pd.MultiIndex.from_tuples(index, names=['Student', 'Course']) cols = pd.MultiIndex.from_tuples(headr) # Notice these are un-named - data = [[70 + x + y + (x * y) % 3 for x in range(4)] for y in range(9)] - df = pd.DataFrame(data, indx, cols) df All = slice(None) - df.loc['Violet'] df.loc[(All, 'Math'), All] df.loc[(slice('Ada', 'Quinn'), 'Math'), All] @@ -512,7 +503,6 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python gb = df.groupby(['animal']) - gb.get_group('cat') `Apply to different items in a group @@ -529,7 +519,6 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to index=['size', 'weight', 'adult']) expected_df = gb.apply(GrowUp) - expected_df `Expanding Apply @@ -767,7 +756,7 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc def gm(df, const): v = ((((df.A + df.B) + 1).cumprod()) - 1) * const - return (v.iloc[-1]) + return v.iloc[-1] s = pd.Series({df.index[i]: gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) for i in range(len(df) - 50)}) @@ -1036,7 +1025,7 @@ You can use the same approach to read all files matching a pattern. Here is an Finally, this strategy will work with the other ``pd.read_*(...)`` functions described in the :ref:`io docs`. .. ipython:: python - :verbatim: + :suppress: for i in range(3): os.remove('file_{}.csv'.format(i)) @@ -1047,7 +1036,6 @@ Parsing date components in multi-columns Parsing date components in multi-columns is faster with a format .. ipython:: python - :verbatim: i = pd.date_range('20000101', periods=10000) df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) @@ -1058,6 +1046,7 @@ Parsing date components in multi-columns is faster with a format ds.head() %timeit pd.to_datetime(ds) + Skip row between header and data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1326,7 +1315,6 @@ The `method` argument within `DataFrame.corr` can accept a callable in addition ... ... >>> df = pd.DataFrame(np.random.normal(size=(100, 3))) - ... >>> df.corr(method=distcorr) 0 1 2 0 1.000000 0.171368 0.145302 From 8cde4a7a72518933495fbab8510f533dbd6cb1d8 Mon Sep 17 00:00:00 2001 From: saurav Date: Thu, 22 Nov 2018 10:25:39 +0530 Subject: [PATCH 10/12] DOC: Modified the order of imports --- doc/source/cookbook.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 1e6a145654224..8c412ac389629 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -5,17 +5,17 @@ .. ipython:: python :suppress: - import pandas as pd - import numpy as np - from pandas.compat import StringIO - import os - import itertools - import functools import datetime + import functools import glob + import itertools + import os + import numpy as np + import pandas as pd + from pandas.compat import StringIO np.random.seed(123456) - pd.options.display.max_rows = 15 np.set_printoptions(precision=4, suppress=True) + pd.options.display.max_rows = 15 ******** From 8895fde096cc5dbabfc5d4a2ff4b6bdbde786886 Mon Sep 17 00:00:00 2001 From: saurav Date: Thu, 22 Nov 2018 14:45:04 +0530 Subject: [PATCH 11/12] DOC: Changed code to iPython and conform to PEP8 --- doc/source/cookbook.rst | 61 ++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 8c412ac389629..78938daae7ec6 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -1285,41 +1285,34 @@ Often it's useful to obtain the lower (or upper) triangular form of a correlatio The `method` argument within `DataFrame.corr` can accept a callable in addition to the named correlation types. Here we compute the `distance correlation `__ matrix for a `DataFrame` object. -.. code-block:: python +.. ipython:: python + + def distcorr(x, y): + n = len(x) + a = np.zeros(shape=(n, n)) + b = np.zeros(shape=(n, n)) + + for i in range(n): + for j in range(i + 1, n): + a[i, j] = abs(x[i] - x[j]) + b[i, j] = abs(y[i] - y[j]) + + a += a.T + b += b.T + + a_bar = np.vstack([np.nanmean(a, axis=0)] * n) + b_bar = np.vstack([np.nanmean(b, axis=0)] * n) + + A = a - a_bar - a_bar.T + np.full(shape=(n, n), fill_value=a_bar.mean()) + B = b - b_bar - b_bar.T + np.full(shape=(n, n), fill_value=b_bar.mean()) + cov_ab = np.sqrt(np.nansum(A * B)) / n + std_a = np.sqrt(np.sqrt(np.nansum(A**2)) / n) + std_b = np.sqrt(np.sqrt(np.nansum(B**2)) / n) + + return cov_ab / std_a / std_b - >>> def distcorr(x, y): - ... n = len(x) - ... a = np.zeros(shape=(n, n)) - ... b = np.zeros(shape=(n, n)) - ... - ... for i in range(n): - ... for j in range(i + 1, n): - ... a[i, j] = abs(x[i] - x[j]) - ... b[i, j] = abs(y[i] - y[j]) - ... - ... a += a.T - ... b += b.T - ... - ... a_bar = np.vstack([np.nanmean(a, axis=0)] * n) - ... b_bar = np.vstack([np.nanmean(b, axis=0)] * n) - ... - ... A = a - a_bar - a_bar.T + np.full(shape=(n, n), fill_value=a_bar.mean()) - ... B = b - b_bar - b_bar.T + np.full(shape=(n, n), fill_value=b_bar.mean()) - ... - ... cov_ab = np.sqrt(np.nansum(A * B)) / n - ... std_a = np.sqrt(np.sqrt(np.nansum(A**2)) / n) - ... std_b = np.sqrt(np.sqrt(np.nansum(B**2)) / n) - ... - ... return cov_ab / std_a / std_b - ... - ... - ... - >>> df = pd.DataFrame(np.random.normal(size=(100, 3))) - >>> df.corr(method=distcorr) - 0 1 2 - 0 1.000000 0.171368 0.145302 - 1 0.171368 1.000000 0.189919 - 2 0.145302 0.189919 1.000000 + df = pd.DataFrame(np.random.normal(size=(100, 3))) + df.corr(method=distcorr) Timedeltas ---------- From d57be90426b9f1d04f9cf2778b04c33be7bb60bf Mon Sep 17 00:00:00 2001 From: saurav Date: Sat, 24 Nov 2018 11:15:14 +0530 Subject: [PATCH 12/12] DOC: PEP8 standards - blank lines in import block --- doc/source/cookbook.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 78938daae7ec6..16d756acaca51 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -10,9 +10,12 @@ import glob import itertools import os + import numpy as np import pandas as pd from pandas.compat import StringIO + + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) pd.options.display.max_rows = 15