From 92092245f8e6951e241c3c1898a01e4e78828ad3 Mon Sep 17 00:00:00 2001 From: Vincent Arel-Bundock Date: Thu, 25 Jul 2013 16:52:34 +0000 Subject: [PATCH] DOC: io.wb example --- doc/source/io.rst | 120 ++++++++++++++++++++++++++++++++++++++++++++++ pandas/io/wb.py | 1 + 2 files changed, 121 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index 7dddc43b136cf..04f73f22610d1 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2584,3 +2584,123 @@ Tthe dataset names are listed at `Fama/French Data Library import pandas.io.data as web ip=web.DataReader("5_Industry_Portfolios", "famafrench") ip[4].ix[192607] + + +World Bank panel data in Pandas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``Pandas`` users can easily access thousands of panel data series from the +`World Bank's World Development Indicators <http://data.worldbank.org>`_ +by using the ``wb`` I/O functions. + +For example, if you wanted to compare the Gross Domestic Products per capita in +constant dollars in North America, you would use the ``search`` function: + +.. code:: python + + In [1]: from pandas.io.wb import search, download + + In [2]: search('gdp.*capita.*const').iloc[:,:2] + Out[2]: + id name + 3242 GDPPCKD GDP per Capita, constant US$, millions + 5143 NY.GDP.PCAP.KD GDP per capita (constant 2005 US$) + 5145 NY.GDP.PCAP.KN GDP per capita (constant LCU) + 5147 NY.GDP.PCAP.PP.KD GDP per capita, PPP (constant 2005 internation... + +Then you would use the ``download`` function to acquire the data from the World +Bank's servers: + +.. 
code:: python + + In [3]: dat = download(indicator='NY.GDP.PCAP.KD', country=['US', 'CA', 'MX'], start=2005, end=2008) + + In [4]: print dat + NY.GDP.PCAP.KD + country year + Canada 2008 36005.5004978584 + 2007 36182.9138439757 + 2006 35785.9698172849 + 2005 35087.8925933298 + Mexico 2008 8113.10219480083 + 2007 8119.21298908649 + 2006 7961.96818458178 + 2005 7666.69796097264 + United States 2008 43069.5819857208 + 2007 43635.5852068142 + 2006 43228.111147107 + 2005 42516.3934699993 + +The resulting dataset is a properly formatted ``DataFrame`` with a hierarchical +index, so it is easy to apply ``.groupby`` transformations to it: + +.. code:: python + + In [6]: dat['NY.GDP.PCAP.KD'].groupby(level=0).mean() + Out[6]: + country + Canada 35765.569188 + Mexico 7965.245332 + United States 43112.417952 + dtype: float64 + +Now imagine you want to compare GDP to the share of people with cellphone +contracts around the world. + +.. code:: python + + In [7]: search('cell.*%').iloc[:,:2] + Out[7]: + id name + 3990 IT.CEL.SETS.FE.ZS Mobile cellular telephone users, female (% of ... + 3991 IT.CEL.SETS.MA.ZS Mobile cellular telephone users, male (% of po... + 4027 IT.MOB.COV.ZS Population coverage of mobile cellular telepho... + +Notice that this second search was much faster than the first one because +``Pandas`` now has a cached list of available data series. + +.. code:: python + + In [13]: ind = ['NY.GDP.PCAP.KD', 'IT.MOB.COV.ZS'] + In [14]: dat = download(indicator=ind, country='all', start=2011, end=2011).dropna() + In [15]: dat.columns = ['gdp', 'cellphone'] + In [16]: print dat.tail() + gdp cellphone + country year + Swaziland 2011 2413.952853 94.9 + Tunisia 2011 3687.340170 100.0 + Uganda 2011 405.332501 100.0 + Zambia 2011 767.911290 62.0 + Zimbabwe 2011 419.236086 72.4 + +Finally, we use the ``statsmodels`` package to assess the relationship between +our two variables using ordinary least squares regression. 
Unsurprisingly, +populations in rich countries tend to use cellphones at a higher rate: + +.. code:: python + + In [17]: import numpy as np + In [18]: import statsmodels.formula.api as smf + In [19]: mod = smf.ols("cellphone ~ np.log(gdp)", dat).fit() + In [20]: print mod.summary() + OLS Regression Results + ============================================================================== + Dep. Variable: cellphone R-squared: 0.297 + Model: OLS Adj. R-squared: 0.274 + Method: Least Squares F-statistic: 13.08 + Date: Thu, 25 Jul 2013 Prob (F-statistic): 0.00105 + Time: 15:24:42 Log-Likelihood: -139.16 + No. Observations: 33 AIC: 282.3 + Df Residuals: 31 BIC: 285.3 + Df Model: 1 + =============================================================================== + coef std err t P>|t| [95.0% Conf. Int.] + ------------------------------------------------------------------------------- + Intercept 16.5110 19.071 0.866 0.393 -22.384 55.406 + np.log(gdp) 9.9333 2.747 3.616 0.001 4.331 15.535 + ============================================================================== + Omnibus: 36.054 Durbin-Watson: 2.071 + Prob(Omnibus): 0.000 Jarque-Bera (JB): 119.133 + Skew: -2.314 Prob(JB): 1.35e-26 + Kurtosis: 11.077 Cond. No. 45.8 + ============================================================================== diff --git a/pandas/io/wb.py b/pandas/io/wb.py index 4d83337a9062e..f83ed296e360c 100644 --- a/pandas/io/wb.py +++ b/pandas/io/wb.py @@ -75,6 +75,7 @@ def download(country=['MX', 'CA', 'US'], indicator=['GDPPCKD', 'GDPPCKN'], # Clean out = out.drop('iso2c', axis=1) out = out.set_index(['country', 'year']) + out = out.convert_objects(convert_numeric=True) return out