diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst index 842fcb6896680..019aa82fed1aa 100644 --- a/doc/source/remote_data.rst +++ b/doc/source/remote_data.rst @@ -2,34 +2,21 @@ .. currentmodule:: pandas -.. ipython:: python - :suppress: - - import os - import csv - import pandas as pd - - import numpy as np - np.random.seed(123456) - randn = np.random.randn - np.set_printoptions(precision=4, suppress=True) - - import matplotlib.pyplot as plt - plt.close('all') - - from pandas import * - options.display.max_rows=15 - import pandas.util.testing as tm - ****************** Remote Data Access ****************** .. _remote_data.pandas_datareader: -.. warning:: +DataReader +---------- - In pandas 0.17.0, the sub-package ``pandas.io.data`` will be removed in favor of a separately installable `pandas-datareader package `_. This will allow the data modules to be independently updated to your pandas installation. The API for ``pandas-datareader v0.1.1`` is the same as in ``pandas v0.16.1``. (:issue:`8961`) +The sub-package ``pandas.io.data`` has been removed in favor of a separately +installable `pandas-datareader package +<https://github.com/pydata/pandas-datareader>`_. This allows the data +modules to be updated independently of your pandas installation. The API for +``pandas-datareader v0.1.1`` is the same as in ``pandas v0.16.1``. +(:issue:`8961`) You should replace the imports of the following: @@ -43,310 +30,6 @@ Remote Data Access from pandas_datareader import data, wb -.. _remote_data.data_reader: - -Functions from :mod:`pandas.io.data` and :mod:`pandas.io.ga` extract data from various Internet sources into a DataFrame. Currently the following sources are supported: - - - :ref:`Yahoo! 
Finance` - - :ref:`Google Finance` - - :ref:`St.Louis FED (FRED)` - - :ref:`Kenneth French's data library` - - :ref:`World Bank` - - :ref:`Google Analytics` - -It should be noted, that various sources support different kinds of data, so not all sources implement the same methods and the data elements returned might also differ. - -.. _remote_data.yahoo: - -Yahoo! Finance --------------- - -.. ipython:: python - :okwarning: - - import pandas.io.data as web - import datetime - start = datetime.datetime(2010, 1, 1) - end = datetime.datetime(2013, 1, 27) - f = web.DataReader("F", 'yahoo', start, end) - f.ix['2010-01-04'] - -.. _remote_data.yahoo_options: - -Yahoo! Finance Options ----------------------- -***Experimental*** - -The ``Options`` class allows the download of options data from Yahoo! Finance. - -The ``get_all_data`` method downloads and caches option data for all expiry months -and provides a formatted ``DataFrame`` with a hierarchical index, so it is easy to get -to the specific option you want. - -.. ipython:: python - - from pandas.io.data import Options - aapl = Options('aapl', 'yahoo') - data = aapl.get_all_data() - data.iloc[0:5, 0:5] - - # Show the $100 strike puts at all expiry dates: - data.loc[(100, slice(None), 'put'),:].iloc[0:5, 0:5] - - # Show the volume traded of $100 strike puts at all expiry dates: - data.loc[(100, slice(None), 'put'),'Vol'].head() - -If you don't want to download all the data, more specific requests can be made. - -.. ipython:: python - - import datetime - expiry = datetime.date(2016, 1, 1) - data = aapl.get_call_data(expiry=expiry) - data.iloc[0:5:, 0:5] - -Note that if you call ``get_all_data`` first, this second call will happen much faster, -as the data is cached. - -If a given expiry date is not available, data for the next available expiry will be -returned (January 15, 2015 in the above example). - -Available expiry dates can be accessed from the ``expiry_dates`` property. - -.. 
ipython:: python - - aapl.expiry_dates - data = aapl.get_call_data(expiry=aapl.expiry_dates[0]) - data.iloc[0:5:, 0:5] - -A list-like object containing dates can also be passed to the expiry parameter, -returning options data for all expiry dates in the list. - -.. ipython:: python - - data = aapl.get_near_stock_price(expiry=aapl.expiry_dates[0:3]) - data.iloc[0:5:, 0:5] - -The ``month`` and ``year`` parameters can be used to get all options data for a given month. - -.. _remote_data.google: - -Google Finance --------------- - -.. ipython:: python - - import pandas.io.data as web - import datetime - start = datetime.datetime(2010, 1, 1) - end = datetime.datetime(2013, 1, 27) - f = web.DataReader("F", 'google', start, end) - f.ix['2010-01-04'] - -.. _remote_data.fred: - -FRED ----- - -.. ipython:: python - - import pandas.io.data as web - import datetime - start = datetime.datetime(2010, 1, 1) - end = datetime.datetime(2013, 1, 27) - gdp=web.DataReader("GDP", "fred", start, end) - gdp.ix['2013-01-01'] - - # Multiple series: - inflation = web.DataReader(["CPIAUCSL", "CPILFESL"], "fred", start, end) - inflation.head() -.. _remote_data.ff: - -Fama/French ------------ - -Dataset names are listed at `Fama/French Data Library -`__. - -.. ipython:: python - - import pandas.io.data as web - ip = web.DataReader("5_Industry_Portfolios", "famafrench") - ip[4].ix[192607] - -.. _remote_data.wb: - -World Bank ----------- - -``pandas`` users can easily access thousands of panel data series from the -`World Bank's World Development Indicators `__ -by using the ``wb`` I/O functions. - -Indicators -~~~~~~~~~~ - -Either from exploring the World Bank site, or using the search function included, -every world bank indicator is accessible. - -For example, if you wanted to compare the Gross Domestic Products per capita in -constant dollars in North America, you would use the ``search`` function: - -.. 
code-block:: ipython - - In [1]: from pandas.io import wb - - In [2]: wb.search('gdp.*capita.*const').iloc[:,:2] - Out[2]: - id name - 3242 GDPPCKD GDP per Capita, constant US$, millions - 5143 NY.GDP.PCAP.KD GDP per capita (constant 2005 US$) - 5145 NY.GDP.PCAP.KN GDP per capita (constant LCU) - 5147 NY.GDP.PCAP.PP.KD GDP per capita, PPP (constant 2005 internation... - -Then you would use the ``download`` function to acquire the data from the World -Bank's servers: - -.. code-block:: ipython - - In [3]: dat = wb.download(indicator='NY.GDP.PCAP.KD', country=['US', 'CA', 'MX'], start=2005, end=2008) - - In [4]: print(dat) - NY.GDP.PCAP.KD - country year - Canada 2008 36005.5004978584 - 2007 36182.9138439757 - 2006 35785.9698172849 - 2005 35087.8925933298 - Mexico 2008 8113.10219480083 - 2007 8119.21298908649 - 2006 7961.96818458178 - 2005 7666.69796097264 - United States 2008 43069.5819857208 - 2007 43635.5852068142 - 2006 43228.111147107 - 2005 42516.3934699993 - -The resulting dataset is a properly formatted ``DataFrame`` with a hierarchical -index, so it is easy to apply ``.groupby`` transformations to it: - -.. code-block:: ipython - - In [6]: dat['NY.GDP.PCAP.KD'].groupby(level=0).mean() - Out[6]: - country - Canada 35765.569188 - Mexico 7965.245332 - United States 43112.417952 - dtype: float64 - -Now imagine you want to compare GDP to the share of people with cellphone -contracts around the world. - -.. code-block:: ipython - - In [7]: wb.search('cell.*%').iloc[:,:2] - Out[7]: - id name - 3990 IT.CEL.SETS.FE.ZS Mobile cellular telephone users, female (% of ... - 3991 IT.CEL.SETS.MA.ZS Mobile cellular telephone users, male (% of po... - 4027 IT.MOB.COV.ZS Population coverage of mobile cellular telepho... - -Notice that this second search was much faster than the first one because -``pandas`` now has a cached list of available data series. - -.. 
code-block:: ipython - - In [13]: ind = ['NY.GDP.PCAP.KD', 'IT.MOB.COV.ZS'] - In [14]: dat = wb.download(indicator=ind, country='all', start=2011, end=2011).dropna() - In [15]: dat.columns = ['gdp', 'cellphone'] - In [16]: print(dat.tail()) - gdp cellphone - country year - Swaziland 2011 2413.952853 94.9 - Tunisia 2011 3687.340170 100.0 - Uganda 2011 405.332501 100.0 - Zambia 2011 767.911290 62.0 - Zimbabwe 2011 419.236086 72.4 - -Finally, we use the ``statsmodels`` package to assess the relationship between -our two variables using ordinary least squares regression. Unsurprisingly, -populations in rich countries tend to use cellphones at a higher rate: - -.. code-block:: ipython - - In [17]: import numpy as np - In [18]: import statsmodels.formula.api as smf - In [19]: mod = smf.ols("cellphone ~ np.log(gdp)", dat).fit() - In [20]: print(mod.summary()) - OLS Regression Results - ============================================================================== - Dep. Variable: cellphone R-squared: 0.297 - Model: OLS Adj. R-squared: 0.274 - Method: Least Squares F-statistic: 13.08 - Date: Thu, 25 Jul 2013 Prob (F-statistic): 0.00105 - Time: 15:24:42 Log-Likelihood: -139.16 - No. Observations: 33 AIC: 282.3 - Df Residuals: 31 BIC: 285.3 - Df Model: 1 - =============================================================================== - coef std err t P>|t| [95.0% Conf. Int.] - ------------------------------------------------------------------------------- - Intercept 16.5110 19.071 0.866 0.393 -22.384 55.406 - np.log(gdp) 9.9333 2.747 3.616 0.001 4.331 15.535 - ============================================================================== - Omnibus: 36.054 Durbin-Watson: 2.071 - Prob(Omnibus): 0.000 Jarque-Bera (JB): 119.133 - Skew: -2.314 Prob(JB): 1.35e-26 - Kurtosis: 11.077 Cond. No. 45.8 - ============================================================================== - -Country Codes -~~~~~~~~~~~~~ - -.. 
versionadded:: 0.15.1 - -The ``country`` argument accepts a string or list of mixed -`two `__ or `three `__ character -ISO country codes, as well as dynamic `World Bank exceptions `__ to the ISO standards. - -For a list of the the hard-coded country codes (used solely for error handling logic) see ``pandas.io.wb.country_codes``. - -Problematic Country Codes & Indicators -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. note:: - - The World Bank's country list and indicators are dynamic. As of 0.15.1, - :func:`wb.download()` is more flexible. To achieve this, the warning - and exception logic changed. - -The world bank converts some country codes in their response, which makes error -checking by pandas difficult. Retired indicators still persist in the search. - -Given the new flexibility of 0.15.1, improved error handling by the user -may be necessary for fringe cases. - -To help identify issues: - -There are at least 4 kinds of country codes: - -1. Standard (2/3 digit ISO) - returns data, will warn and error properly. -2. Non-standard (WB Exceptions) - returns data, but will falsely warn. -3. Blank - silently missing from the response. -4. Bad - causes the entire response from WB to fail, always exception inducing. - -There are at least 3 kinds of indicators: - -1. Current - Returns data. -2. Retired - Appears in search results, yet won't return data. -3. Bad - Will not return data. - -Use the ``errors`` argument to control warnings and exceptions. Setting -errors to ignore or warn, won't stop failed responses. (ie, 100% bad -indicators, or a single "bad" (#4 above) country code). - -See docstrings for more info. .. 
_remote_data.ga: diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index cdae0d5c27c7d..f4c34d5953c9f 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -597,6 +597,8 @@ Removal of prior version deprecations/changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) +- The ``pandas.io.data`` and ``pandas.io.wb`` modules have been removed in favor of + the `pandas-datareader package <https://github.com/pydata/pandas-datareader>`__ (:issue:`13724`). - ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) - ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) - ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) diff --git a/pandas/io/data.py b/pandas/io/data.py index 68151fbb091fa..e76790a6ab98b 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -1,1247 +1,6 @@ -""" -Module contains tools for collecting data from various remote sources - - -""" -# flake8: noqa - -import warnings -import tempfile -import datetime as dt -import time - -from collections import defaultdict - -import numpy as np - -from pandas.compat import( - StringIO, bytes_to_str, range, lmap, zip -) -import pandas.compat as compat -from pandas import Panel, DataFrame, Series, read_csv, concat, to_datetime, DatetimeIndex, DateOffset - -from pandas.types.common import is_list_like -from pandas.core.common import PandasError -from pandas.io.common import urlopen, ZipFile, urlencode -from pandas.tseries.offsets import MonthEnd -from pandas.util.testing import _network_error_classes -from pandas.io.html import read_html - -warnings.warn("\n" - "The pandas.io.data module is moved to a separate package " - "(pandas-datareader) and will be removed from pandas in a " - "future 
version.\nAfter installing the pandas-datareader package " - "(https://github.com/pydata/pandas-datareader), you can change " - "the import ``from pandas.io import data, wb`` to " - "``from pandas_datareader import data, wb``.", - FutureWarning) - -class SymbolWarning(UserWarning): - pass - - -class RemoteDataError(PandasError, IOError): - pass - - -def DataReader(name, data_source=None, start=None, end=None, - retry_count=3, pause=0.001): - """ - Imports data from a number of online sources. - - Currently supports Yahoo! Finance, Google Finance, St. Louis FED (FRED) - and Kenneth French's data library. - - Parameters - ---------- - name : str or list of strs - the name of the dataset. Some data sources (yahoo, google, fred) will - accept a list of names. - data_source: str, default: None - the data source ("yahoo", "google", "fred", or "ff") - start : datetime, default: None - left boundary for range (defaults to 1/1/2010) - end : datetime, default: None - right boundary for range (defaults to today) - retry_count : int, default 3 - Number of times to retry query request. - pause : numeric, default 0.001 - Time, in seconds, to pause between consecutive queries of chunks. If - single value given for symbol, represents the pause between retries. - - Examples - ---------- - - # Data from Yahoo! 
Finance - gs = DataReader("GS", "yahoo") - - # Data from Google Finance - aapl = DataReader("AAPL", "google") - - # Data from FRED - vix = DataReader("VIXCLS", "fred") - - # Data from Fama/French - ff = DataReader("F-F_Research_Data_Factors", "famafrench") - ff = DataReader("F-F_Research_Data_Factors_weekly", "famafrench") - ff = DataReader("6_Portfolios_2x3", "famafrench") - ff = DataReader("F-F_ST_Reversal_Factor", "famafrench") - """ - start, end = _sanitize_dates(start, end) - - if data_source == "yahoo": - return get_data_yahoo(symbols=name, start=start, end=end, - adjust_price=False, chunksize=25, - retry_count=retry_count, pause=pause) - elif data_source == "google": - return get_data_google(symbols=name, start=start, end=end, - adjust_price=False, chunksize=25, - retry_count=retry_count, pause=pause) - elif data_source == "fred": - return get_data_fred(name, start, end) - elif data_source == "famafrench": - return get_data_famafrench(name) - - -def _sanitize_dates(start, end): - from pandas.core.datetools import to_datetime - start = to_datetime(start) - end = to_datetime(end) - if start is None: - start = dt.datetime(2010, 1, 1) - if end is None: - end = dt.datetime.today() - return start, end - - -def _in_chunks(seq, size): - """ - Return sequence in 'chunks' of size defined by size - """ - return (seq[pos:pos + size] for pos in range(0, len(seq), size)) - - -_yahoo_codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', - 'time': 't1', 'short_ratio': 's7'} - - -_YAHOO_QUOTE_URL = 'http://finance.yahoo.com/d/quotes.csv?' 
- - -def get_quote_yahoo(symbols): - """ - Get current yahoo quote - - Returns a DataFrame - """ - if isinstance(symbols, compat.string_types): - sym_list = symbols - else: - sym_list = '+'.join(symbols) - - # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm - request = ''.join(compat.itervalues(_yahoo_codes)) # code request string - header = list(_yahoo_codes.keys()) - - data = defaultdict(list) - - url_str = _YAHOO_QUOTE_URL + 's=%s&f=%s' % (sym_list, request) - - with urlopen(url_str) as url: - lines = url.readlines() - - for line in lines: - fields = line.decode('utf-8').strip().split(',') - for i, field in enumerate(fields): - if field[-2:] == '%"': - v = float(field.strip('"%')) - elif field[0] == '"': - v = field.strip('"') - else: - try: - v = float(field) - except ValueError: - v = field - data[header[i]].append(v) - - idx = data.pop('symbol') - return DataFrame(data, index=idx) - - -def get_quote_google(symbols): - raise NotImplementedError("Google Finance doesn't have this functionality") - - -def _retry_read_url(url, retry_count, pause, name): - for _ in range(retry_count): - time.sleep(pause) - - # kludge to close the socket ASAP - try: - with urlopen(url) as resp: - lines = resp.read() - except _network_error_classes: - pass - else: - rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0, - parse_dates=True, na_values='-')[::-1] - # Yahoo! Finance sometimes does this awesome thing where they - # return 2 rows for the most recent business day - if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover - rs = rs[:-1] - - #Get rid of unicode characters in index name. - try: - rs.index.name = rs.index.name.decode('unicode_escape').encode('ascii', 'ignore') - except AttributeError: - #Python 3 string has no decode method. 
- rs.index.name = rs.index.name.encode('ascii', 'ignore').decode() - - return rs - - raise IOError("after %d tries, %s did not " - "return a 200 for url %r" % (retry_count, name, url)) - - -_HISTORICAL_YAHOO_URL = 'http://ichart.finance.yahoo.com/table.csv?' - - -def _get_hist_yahoo(sym, start, end, interval, retry_count, pause): - """ - Get historical data for the given name from yahoo. - Date format is datetime - - Returns a DataFrame. - """ - start, end = _sanitize_dates(start, end) - url = (_HISTORICAL_YAHOO_URL + 's=%s' % sym + - '&a=%s' % (start.month - 1) + - '&b=%s' % start.day + - '&c=%s' % start.year + - '&d=%s' % (end.month - 1) + - '&e=%s' % end.day + - '&f=%s' % end.year + - '&g=%s' % interval + - '&ignore=.csv') - return _retry_read_url(url, retry_count, pause, 'Yahoo!') - - -_HISTORICAL_GOOGLE_URL = 'http://www.google.com/finance/historical?' - - -def _get_hist_google(sym, start, end, interval, retry_count, pause): - """ - Get historical data for the given name from google. - Date format is datetime - - Returns a DataFrame. - """ - start, end = _sanitize_dates(start, end) - - # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv - url = "%s%s" % (_HISTORICAL_GOOGLE_URL, - urlencode({"q": sym, - "startdate": start.strftime('%b %d, ' '%Y'), - "enddate": end.strftime('%b %d, %Y'), - "output": "csv"})) - return _retry_read_url(url, retry_count, pause, 'Google') - - -def _adjust_prices(hist_data, price_list=None): - """ - Return modifed DataFrame or Panel with adjusted prices based on - 'Adj Close' price. Adds 'Adj_Ratio' column. 
- """ - if price_list is None: - price_list = 'Open', 'High', 'Low', 'Close' - adj_ratio = hist_data['Adj Close'] / hist_data['Close'] - - data = hist_data.copy() - for item in price_list: - data[item] = hist_data[item] * adj_ratio - data['Adj_Ratio'] = adj_ratio - del data['Adj Close'] - return data - - -def _calc_return_index(price_df): - """ - Return a returns index from a input price df or series. Initial value - (typically NaN) is set to 1. - """ - df = price_df.pct_change().add(1).cumprod() - mask = df.ix[1].notnull() & df.ix[0].isnull() - df.ix[0][mask] = 1 - - # Check for first stock listings after starting date of index in ret_index - # If True, find first_valid_index and set previous entry to 1. - if (~mask).any(): - for sym in mask.index[~mask]: - tstamp = df[sym].first_valid_index() - t_idx = df.index.get_loc(tstamp) - 1 - df[sym].ix[t_idx] = 1 - - return df - - -_YAHOO_COMPONENTS_URL = 'http://download.finance.yahoo.com/d/quotes.csv?' - - -def get_components_yahoo(idx_sym): - """ - Returns DataFrame containing list of component information for - index represented in idx_sym from yahoo. Includes component symbol - (ticker), exchange, and name. 
- - Parameters - ---------- - idx_sym : str - Stock index symbol - Examples: - '^DJI' (Dow Jones Industrial Average) - '^NYA' (NYSE Composite) - '^IXIC' (NASDAQ Composite) - - See: http://finance.yahoo.com/indices for other index symbols - - Returns - ------- - idx_df : DataFrame - """ - stats = 'snx' - # URL of form: - # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv - url = _YAHOO_COMPONENTS_URL + 's={0}&f={1}&e=.csv&h={2}' - - idx_mod = idx_sym.replace('^', '@%5E') - url_str = url.format(idx_mod, stats, 1) - - idx_df = DataFrame() - mask = [True] - comp_idx = 1 - - # LOOP across component index structure, - # break when no new components are found - while True in mask: - url_str = url.format(idx_mod, stats, comp_idx) - with urlopen(url_str) as resp: - raw = resp.read() - lines = raw.decode('utf-8').strip().strip('"').split('"\r\n"') - lines = [line.strip().split('","') for line in lines] - - temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange']) - temp_df = temp_df.drop_duplicates() - temp_df = temp_df.set_index('ticker') - mask = ~temp_df.index.isin(idx_df.index) - - comp_idx = comp_idx + 50 - idx_df = idx_df.append(temp_df[mask]) - - return idx_df - - -def _dl_mult_symbols(symbols, start, end, interval, chunksize, retry_count, pause, - method): - stocks = {} - failed = [] - passed = [] - for sym_group in _in_chunks(symbols, chunksize): - for sym in sym_group: - try: - stocks[sym] = method(sym, start, end, interval, retry_count, pause) - passed.append(sym) - except IOError: - warnings.warn('Failed to read symbol: {0!r}, replacing with ' - 'NaN.'.format(sym), SymbolWarning) - failed.append(sym) - - if len(passed) == 0: - raise RemoteDataError("No data fetched using " - "{0!r}".format(method.__name__)) - try: - if len(stocks) > 0 and len(failed) > 0 and len(passed) > 0: - df_na = stocks[passed[0]].copy() - df_na[:] = np.nan - for sym in failed: - stocks[sym] = df_na - return Panel(stocks).swapaxes('items', 'minor') - 
except AttributeError: - # cannot construct a panel with just 1D nans indicating no data - raise RemoteDataError("No data fetched using " - "{0!r}".format(method.__name__)) - -_source_functions = {'google': _get_hist_google, 'yahoo': _get_hist_yahoo} - - -def _get_data_from(symbols, start, end, interval, retry_count, pause, adjust_price, - ret_index, chunksize, source): - - src_fn = _source_functions[source] - - # If a single symbol, (e.g., 'GOOG') - if isinstance(symbols, (compat.string_types, int)): - hist_data = src_fn(symbols, start, end, interval, retry_count, pause) - # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT']) - elif isinstance(symbols, DataFrame): - hist_data = _dl_mult_symbols(symbols.index, start, end, interval, chunksize, - retry_count, pause, src_fn) - else: - hist_data = _dl_mult_symbols(symbols, start, end, interval, chunksize, - retry_count, pause, src_fn) - if source.lower() == 'yahoo': - if ret_index: - hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) - if adjust_price: - hist_data = _adjust_prices(hist_data) - - return hist_data - - -def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, - pause=0.001, adjust_price=False, ret_index=False, - chunksize=25, interval='d'): - """ - Returns DataFrame/Panel of historical stock prices from symbols, over date - range, start to end. To avoid being penalized by Yahoo! Finance servers, - pauses between downloading 'chunks' of symbols can be specified. - - Parameters - ---------- - symbols : string, array-like object (list, tuple, Series), or DataFrame, default: None - Single stock symbol (ticker), array-like object of symbols or - DataFrame with index containing stock symbols - start : string, (defaults to '1/1/2010') - Starting date, timestamp. Parses many different kind of date - representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') - end : string, (defaults to today) - Ending date, timestamp. Same format as starting date. 
- retry_count : int, default: 3 - Number of times to retry query request. - pause : numeric, default: 0.001 - Time, in seconds, to pause between consecutive queries of chunks. If - single value given for symbol, represents the pause between retries. - adjust_price : bool, default: False - If True, adjusts all prices in hist_data ('Open', 'High', 'Low', - 'Close') based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops - 'Adj Close'. - ret_index : bool, default: False - If True, includes a simple return index 'Ret_Index' in hist_data. - chunksize : int, default: 25 - Number of symbols to download consecutively before intiating pause. - interval : string, default: 'd' - Time interval code, valid values are 'd' for daily, 'w' for weekly, - 'm' for monthly and 'v' for dividend. - - Returns - ------- - hist_data : DataFrame (str) or Panel (array-like object, DataFrame) - """ - if interval not in ['d', 'w', 'm', 'v']: - raise ValueError("Invalid interval: valid values are 'd', 'w', 'm' and 'v'") - return _get_data_from(symbols, start, end, interval, retry_count, pause, - adjust_price, ret_index, chunksize, 'yahoo') - - -def get_data_google(symbols=None, start=None, end=None, retry_count=3, - pause=0.001, adjust_price=False, ret_index=False, - chunksize=25): - """ - Returns DataFrame/Panel of historical stock prices from symbols, over date - range, start to end. To avoid being penalized by Google Finance servers, - pauses between downloading 'chunks' of symbols can be specified. - - Parameters - ---------- - symbols : string, array-like object (list, tuple, Series), or DataFrame - Single stock symbol (ticker), array-like object of symbols or - DataFrame with index containing stock symbols. - start : string, (defaults to '1/1/2010') - Starting date, timestamp. Parses many different kind of date - representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') - end : string, (defaults to today) - Ending date, timestamp. Same format as starting date. 
- retry_count : int, default: 3 - Number of times to retry query request. - pause : numeric, default: 0.001 - Time, in seconds, to pause between consecutive queries of chunks. If - single value given for symbol, represents the pause between retries. - chunksize : int, default: 25 - Number of symbols to download consecutively before intiating pause. - ret_index : bool, default: False - If True, includes a simple return index 'Ret_Index' in hist_data. - - Returns - ------- - hist_data : DataFrame (str) or Panel (array-like object, DataFrame) - """ - return _get_data_from(symbols, start, end, None, retry_count, pause, - adjust_price, ret_index, chunksize, 'google') - - -_FRED_URL = "http://research.stlouisfed.org/fred2/series/" - - -def get_data_fred(name, start=dt.datetime(2010, 1, 1), - end=dt.datetime.today()): - """ - Get data for the given name from the St. Louis FED (FRED). - Date format is datetime - - Returns a DataFrame. - - If multiple names are passed for "series" then the index of the - DataFrame is the outer join of the indicies of each series. - """ - start, end = _sanitize_dates(start, end) - - if not is_list_like(name): - names = [name] - else: - names = name - - urls = [_FRED_URL + '%s' % n + '/downloaddata/%s' % n + '.csv' for - n in names] - - def fetch_data(url, name): - with urlopen(url) as resp: - data = read_csv(resp, index_col=0, parse_dates=True, - header=None, skiprows=1, names=["DATE", name], - na_values='.') - try: - return data.truncate(start, end) - except KeyError: - if data.ix[3].name[7:12] == 'Error': - raise IOError("Failed to get the data. 
Check that {0!r} is " - "a valid FRED series.".format(name)) - raise - df = concat([fetch_data(url, n) for url, n in zip(urls, names)], - axis=1, join='outer') - return df - - -_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp' - - -def get_data_famafrench(name): - # path of zip files - zip_file_path = '{0}/{1}_TXT.zip'.format(_FAMAFRENCH_URL, name) - - with urlopen(zip_file_path) as url: - raw = url.read() - - with tempfile.TemporaryFile() as tmpf: - tmpf.write(raw) - - with ZipFile(tmpf, 'r') as zf: - data = zf.open(zf.namelist()[0]).readlines() - - line_lengths = np.array(lmap(len, data)) - file_edges = np.where(line_lengths == 2)[0] - - datasets = {} - edges = zip(file_edges + 1, file_edges[1:]) - for i, (left_edge, right_edge) in enumerate(edges): - dataset = [d.split() for d in data[left_edge:right_edge]] - if len(dataset) > 10: - ncol_raw = np.array(lmap(len, dataset)) - ncol = np.median(ncol_raw) - header_index = np.where(ncol_raw == ncol - 1)[0][-1] - header = dataset[header_index] - ds_header = dataset[header_index + 1:] - # to ensure the header is unique - header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header, - start=1)] - index = np.array([d[0] for d in ds_header], dtype=int) - dataset = np.array([d[1:] for d in ds_header], dtype=float) - datasets[i] = DataFrame(dataset, index, columns=header) - - return datasets - - -# Items needed for options class -CUR_MONTH = dt.datetime.now().month -CUR_YEAR = dt.datetime.now().year -CUR_DAY = dt.datetime.now().day - - -def _two_char(s): - return '{0:0>2}'.format(s) - - -class Options(object): - """ - ***Experimental*** - This class fetches call/put data for a given stock/expiry month. - - It is instantiated with a string representing the ticker symbol. 
- - The class has the following methods: - get_options_data:(month, year, expiry) - get_call_data:(month, year, expiry) - get_put_data: (month, year, expiry) - get_near_stock_price(opt_frame, above_below) - get_all_data(call, put) - get_forward_data(months, call, put) (deprecated) - - Examples - -------- - # Instantiate object with ticker - >>> aapl = Options('aapl', 'yahoo') - - # Fetch next expiry call data - >>> calls = aapl.get_call_data() - - # Can now access aapl.calls instance variable - >>> aapl.calls - - # Fetch next expiry put data - >>> puts = aapl.get_put_data() - - # Can now access aapl.puts instance variable - >>> aapl.puts - - # cut down the call data to be 3 below and 3 above the stock price. - >>> cut_calls = aapl.get_near_stock_price(call=True, above_below=3) - - # Fetch call and put data with expiry from now to 8 months out - >>> forward_data = aapl.get_forward_data(8, call=True, put=True) - - # Fetch all call and put data - >>> all_data = aapl.get_all_data() - """ - - _TABLE_LOC = {'calls': 1, 'puts': 2} - _OPTIONS_BASE_URL = 'http://finance.yahoo.com/q/op?s={sym}' - _FINANCE_BASE_URL = 'http://finance.yahoo.com' - - def __init__(self, symbol, data_source=None): - """ Instantiates options_data with a ticker saved as symbol """ - self.symbol = symbol.upper() - if data_source is None: - warnings.warn("Options(symbol) is deprecated, use Options(symbol," - " data_source) instead", FutureWarning, stacklevel=2) - data_source = "yahoo" - if data_source != "yahoo": - raise NotImplementedError("currently only yahoo supported") - - def get_options_data(self, month=None, year=None, expiry=None): - """ - ***Experimental*** - Gets call/put data for the stock with the expiration data in the - given month and year - - Parameters - ---------- - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. 
This should be a 4 digit int. - - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Notes - ----- - Note: Format of returned data frame is dependent on Yahoo and may change. - - When called, this function will add instance variables named - calls and puts. See the following example: - - >>> aapl = Options('aapl', 'yahoo') # Create object - >>> aapl.calls # will give an AttributeError - >>> aapl.get_options() # Get data and set ivars - >>> aapl.calls # Doesn't throw AttributeError - - Also note that aapl.calls and appl.puts will always be the calls - and puts for the next expiry. If the user calls this method with - a different expiry, the ivar will be named callsYYMMDD or putsYYMMDD, - where YY, MM and DD are, respectively, two digit representations of - the year, month and day for the expiry of the options. 
- - """ - return concat([f(month, year, expiry) - for f in (self.get_put_data, - self.get_call_data)]).sortlevel() - - def _get_option_frames_from_yahoo(self, expiry): - url = self._yahoo_url_from_expiry(expiry) - option_frames = self._option_frames_from_url(url) - frame_name = '_frames' + self._expiry_to_string(expiry) - setattr(self, frame_name, option_frames) - return option_frames - - @staticmethod - def _expiry_to_string(expiry): - m1 = _two_char(expiry.month) - d1 = _two_char(expiry.day) - return str(expiry.year)[-2:] + m1 + d1 - - def _yahoo_url_from_expiry(self, expiry): - try: - expiry_links = self._expiry_links - - except AttributeError: - _, expiry_links = self._get_expiry_dates_and_links() - - return self._FINANCE_BASE_URL + expiry_links[expiry] - - def _option_frames_from_url(self, url): - frames = read_html(url) - nframes = len(frames) - frames_req = max(self._TABLE_LOC.values()) - if nframes < frames_req: - raise RemoteDataError("%s options tables found (%s expected)" % (nframes, frames_req)) - - if not hasattr(self, 'underlying_price'): - try: - self.underlying_price, self.quote_time = self._underlying_price_and_time_from_url(url) - except IndexError: - self.underlying_price, self.quote_time = np.nan, np.nan - - calls = frames[self._TABLE_LOC['calls']] - puts = frames[self._TABLE_LOC['puts']] - - calls = self._process_data(calls, 'call') - puts = self._process_data(puts, 'put') - - return {'calls': calls, 'puts': puts} - - def _underlying_price_and_time_from_url(self, url): - root = self._parse_url(url) - underlying_price = self._underlying_price_from_root(root) - quote_time = self._quote_time_from_root(root) - return underlying_price, quote_time - - @staticmethod - def _underlying_price_from_root(root): - underlying_price = root.xpath('.//*[@class="time_rtq_ticker Fz-30 Fw-b"]')[0]\ - .getchildren()[0].text - underlying_price = underlying_price.replace(',', '') #GH11 - - try: - underlying_price = float(underlying_price) - except ValueError: - 
underlying_price = np.nan - - return underlying_price - - @staticmethod - def _quote_time_from_root(root): - #Gets the time of the quote, note this is actually the time of the underlying price. - try: - quote_time_text = root.xpath('.//*[@class="time_rtq Fz-m"]')[0].getchildren()[1].getchildren()[0].text - ##TODO: Enable timezone matching when strptime can match EST with %Z - quote_time_text = quote_time_text.split(' ')[0] - quote_time = dt.datetime.strptime(quote_time_text, "%I:%M%p") - quote_time = quote_time.replace(year=CUR_YEAR, month=CUR_MONTH, day=CUR_DAY) - except ValueError: - quote_time = np.nan - - return quote_time - - def _get_option_data(self, expiry, name): - frame_name = '_frames' + self._expiry_to_string(expiry) - - try: - frames = getattr(self, frame_name) - except AttributeError: - frames = self._get_option_frames_from_yahoo(expiry) - - option_data = frames[name] - if expiry != self.expiry_dates[0]: - name += self._expiry_to_string(expiry) - - setattr(self, name, option_data) - return option_data - - def get_call_data(self, month=None, year=None, expiry=None): - """ - ***Experimental*** - Gets call/put data for the stock with the expiration data in the - given month and year - - Parameters - ---------- - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. This should be a 4 digit int. - - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - call_data: pandas.DataFrame - A DataFrame with requested options data. 
- - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Notes - ----- - Note: Format of returned data frame is dependent on Yahoo and may change. - - When called, this function will add instance variables named - calls and puts. See the following example: - - >>> aapl = Options('aapl', 'yahoo') # Create object - >>> aapl.calls # will give an AttributeError - >>> aapl.get_call_data() # Get data and set ivars - >>> aapl.calls # Doesn't throw AttributeError - - Also note that aapl.calls will always be the calls for the next - expiry. If the user calls this method with a different month - or year, the ivar will be named callsYYMMDD where YY, MM and DD are, - respectively, two digit representations of the year, month and day - for the expiry of the options. - """ - expiry = self._try_parse_dates(year, month, expiry) - return self._get_data_in_date_range(expiry, call=True, put=False) - - def get_put_data(self, month=None, year=None, expiry=None): - """ - ***Experimental*** - Gets put data for the stock with the expiration data in the - given month and year - - Parameters - ---------- - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. This should be a 4 digit int. 
- - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - put_data: pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Notes - ----- - Note: Format of returned data frame is dependent on Yahoo and may change. - - When called, this function will add instance variables named - puts. See the following example: - - >>> aapl = Options('aapl') # Create object - >>> aapl.puts # will give an AttributeError - >>> aapl.get_put_data() # Get data and set ivars - >>> aapl.puts # Doesn't throw AttributeError - - return self.__setattr__(self, str(str(x) + str(y))) - - Also note that aapl.puts will always be the puts for the next - expiry. If the user calls this method with a different month - or year, the ivar will be named putsYYMMDD where YY, MM and DD are, - respectively, two digit representations of the year, month and day - for the expiry of the options. - """ - expiry = self._try_parse_dates(year, month, expiry) - return self._get_data_in_date_range(expiry, put=True, call=False) - - def get_near_stock_price(self, above_below=2, call=True, put=False, - month=None, year=None, expiry=None): - """ - ***Experimental*** - Returns a data frame of options that are near the current stock price. 
- - Parameters - ---------- - above_below : number, int, optional (default=2) - The number of strike prices above and below the stock price that - should be taken - - call : bool, default: True - Tells the function whether or not it should be using calls - - put : bool, default: False - Tells the function weather or not it should be using puts - - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. This should be a 4 digit int. - - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - chopped: DataFrame - The resultant DataFrame chopped down to be 2 * above_below + 1 rows - desired. If there isn't data as far out as the user has asked for - then - - Note: Format of returned data frame is dependent on Yahoo and may change. - - """ - expiry = self._try_parse_dates(year, month, expiry) - data = self._get_data_in_date_range(expiry, call=call, put=put) - return self.chop_data(data, above_below, self.underlying_price) - - def chop_data(self, df, above_below=2, underlying_price=None): - """Returns a data frame only options that are near the current stock price.""" - - if not underlying_price: - try: - underlying_price = self.underlying_price - except AttributeError: - underlying_price = np.nan - - max_strike = max(df.index.get_level_values('Strike')) - min_strike = min(df.index.get_level_values('Strike')) - - if not np.isnan(underlying_price) and min_strike < underlying_price < max_strike: - start_index = np.where(df.index.get_level_values('Strike') - > underlying_price)[0][0] - - get_range = slice(start_index - above_below, - start_index + above_below + 1) - df = df[get_range].dropna(how='all') - - return df - - def _try_parse_dates(self, year, month, expiry): - """ - Validates dates provided by user. 
Ensures the user either provided both a month and a year or an expiry. - - Parameters - ---------- - year : int - Calendar year - - month : int - Calendar month - - expiry : date-like or convertible, (preferred) - Expiry date - - Returns - ------- - list of expiry dates (datetime.date) - """ - - #Checks if the user gave one of the month or the year but not both and did not provide an expiry: - if (month is not None and year is None) or (month is None and year is not None) and expiry is None: - msg = "You must specify either (`year` and `month`) or `expiry` " \ - "or none of these options for the next expiry." - raise ValueError(msg) - - if expiry is not None: - if hasattr(expiry, '__iter__'): - expiry = [self._validate_expiry(exp) for exp in expiry] - else: - expiry = [self._validate_expiry(expiry)] - - if len(expiry) == 0: - raise ValueError('No expiries available for given input.') - - elif year is None and month is None: - #No arguments passed, provide next expiry - year = CUR_YEAR - month = CUR_MONTH - expiry = dt.date(year, month, 1) - expiry = [self._validate_expiry(expiry)] - - else: - #Year and month passed, provide all expiries in that month - expiry = [expiry for expiry in self.expiry_dates if expiry.year == year and expiry.month == month] - if len(expiry) == 0: - raise ValueError('No expiries available in %s-%s' % (year, month)) - - return expiry - - def _validate_expiry(self, expiry): - """Ensures that an expiry date has data available on Yahoo - If the expiry date does not have options that expire on that day, return next expiry""" - - expiry_dates = self.expiry_dates - expiry = to_datetime(expiry) - if hasattr(expiry, 'date'): - expiry = expiry.date() - - if expiry in expiry_dates: - return expiry - else: - index = DatetimeIndex(expiry_dates).sort_values() - return index[index.date >= expiry][0].date() - - def get_forward_data(self, months, call=True, put=False, near=False, - above_below=2): - """ - ***Experimental*** - Gets either call, put, or both 
data for months starting in the current - month and going out in the future a specified amount of time. - - Parameters - ---------- - months : number, int - How many months to go out in the collection of the data. This is - inclusive. - - call : bool, optional (default=True) - Whether or not to collect data for call options - - put : bool, optional (default=False) - Whether or not to collect data for put options. - - near : bool, optional (default=False) - Whether this function should get only the data near the - current stock price. Uses Options.get_near_stock_price - - above_below : number, int, optional (default=2) - The number of strike prices above and below the stock price that - should be taken if the near option is set to True - - Returns - ------- - pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Note: Format of returned data frame is dependent on Yahoo and may change. 
- - """ - warnings.warn("get_forward_data() is deprecated", FutureWarning, - stacklevel=2) - end_date = dt.date.today() + MonthEnd(months) - dates = (date for date in self.expiry_dates if date <= end_date.date()) - data = self._get_data_in_date_range(dates, call=call, put=put) - if near: - data = self.chop_data(data, above_below=above_below) - return data - - def get_all_data(self, call=True, put=True): - """ - ***Experimental*** - Gets either call, put, or both data for all available months starting - in the current month. - - Parameters - ---------- - call : bool, optional (default=True) - Whether or not to collect data for call options - - put : bool, optional (default=True) - Whether or not to collect data for put options. - - Returns - ------- - pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Note: Format of returned data frame is dependent on Yahoo and may change. 
- - """ - - try: - expiry_dates = self.expiry_dates - except AttributeError: - expiry_dates, _ = self._get_expiry_dates_and_links() - - return self._get_data_in_date_range(dates=expiry_dates, call=call, put=put) - - def _get_data_in_date_range(self, dates, call=True, put=True): - - to_ret = Series({'calls': call, 'puts': put}) - to_ret = to_ret[to_ret].index - data = [] - - for name in to_ret: - for expiry_date in dates: - nam = name + self._expiry_to_string(expiry_date) - try: # Try to access on the instance - frame = getattr(self, nam) - except AttributeError: - frame = self._get_option_data(expiry=expiry_date, name=name) - data.append(frame) - - return concat(data).sortlevel() - - @property - def expiry_dates(self): - """ - Returns a list of available expiry dates - """ - try: - expiry_dates = self._expiry_dates - except AttributeError: - expiry_dates, _ = self._get_expiry_dates_and_links() - return expiry_dates - - def _get_expiry_dates_and_links(self): - """ - Gets available expiry dates. - - Returns - ------- - Tuple of: - List of datetime.date objects - Dict of datetime.date objects as keys and corresponding links - """ - - url = self._OPTIONS_BASE_URL.format(sym=self.symbol) - root = self._parse_url(url) - - try: - links = root.xpath('//*[@id="options_menu"]/form/select/option') - except IndexError: - raise RemoteDataError('Expiry dates not available') - - expiry_dates = [dt.datetime.strptime(element.text, "%B %d, %Y").date() for element in links] - links = [element.attrib['data-selectbox-link'] for element in links] - - if len(expiry_dates) == 0: - raise RemoteDataError('Data not available') - - expiry_links = dict(zip(expiry_dates, links)) - self._expiry_links = expiry_links - self._expiry_dates = expiry_dates - return expiry_dates, expiry_links - - def _parse_url(self, url): - """ - Downloads and parses a URL, returns xml root. 
- - """ - try: - from lxml.html import parse - except ImportError: - raise ImportError("Please install lxml if you want to use the " - "{0!r} class".format(self.__class__.__name__)) - try: - doc = parse(url) - except _network_error_classes: - raise RemoteDataError("Unable to parse URL " - "{0!r}".format(url)) - else: - root = doc.getroot() - if root is None: - raise RemoteDataError("Parsed URL {0!r} has no root" - "element".format(url)) - return root - - def _process_data(self, frame, type): - """ - Adds columns for Expiry, IsNonstandard (ie: deliverable is not 100 shares) - and Tag (the tag indicating what is actually deliverable, None if standard). - - """ - frame.columns = ['Strike', 'Symbol', 'Last', 'Bid', 'Ask', 'Chg', 'PctChg', 'Vol', 'Open_Int', 'IV'] - frame["Rootexp"] = frame.Symbol.str[0:-9] - frame["Root"] = frame.Rootexp.str[0:-6] - frame["Expiry"] = to_datetime(frame.Rootexp.str[-6:]) - #Removes dashes in equity ticker to map to option ticker. - #Ex: BRK-B to BRKB140517C00100000 - frame["IsNonstandard"] = frame['Root'] != self.symbol.replace('-', '') - del frame["Rootexp"] - frame["Underlying"] = self.symbol - try: - frame['Underlying_Price'] = self.underlying_price - frame["Quote_Time"] = self.quote_time - except AttributeError: - frame['Underlying_Price'] = np.nan - frame["Quote_Time"] = np.nan - frame.rename(columns={'Open Int': 'Open_Int'}, inplace=True) - frame['Type'] = type - frame.set_index(['Strike', 'Expiry', 'Type', 'Symbol'], inplace=True) - - return frame +raise ImportError( + "The pandas.io.data module is moved to a separate package " + "(pandas-datareader). 
After installing the pandas-datareader package " + "(https://github.com/pydata/pandas-datareader), you can change " + "the import ``from pandas.io import data, wb`` to " + "``from pandas_datareader import data, wb``.") diff --git a/pandas/io/tests/data/yahoo_options1.html b/pandas/io/tests/data/yahoo_options1.html deleted file mode 100644 index 2846a2bd12732..0000000000000 --- a/pandas/io/tests/data/yahoo_options1.html +++ /dev/null @@ -1,6065 +0,0 @@ - - - - - AAPL Options | Yahoo! Inc. Stock - Yahoo! Finance - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-
- -
-
- - -
- -
- -
- - - - - - - -
- - -
- - -
-
- - - - - -
-
-
- - - - - -
-
-
- - - - - -
- -
- - - -
-
-
-
- - - - Dow - - - - Up - - - 1.32% - - - - - - - Nasdaq - - - - Up - - - 1.60% - - - - - - -
- -
-
-
- - -
- -
-

More on AAPL

-
-
- - -

Quotes

- - -

Charts

- - -

News & Info

- - -

Company

- - -

Analyst Coverage

- - -

Ownership

- - -

Financials

- - - -
-
- -
-
- -
-
-
-
-
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
- - -
-
- - - - - -
-
-
- - - - -
-
-
-
-
-
-

Apple Inc. (AAPL)

- -
-
-
-
- - 104.83 - - - - - Up +1.84(1.79%) - - - NasdaqGS - As of 4:00PM EDT - -
-
| - - -
-
- -
- - -
-
-
-
-
- - - - -
-
-
- - - - - -
- -
- - - -
- - -
-
- - - - - -
-
-
- -
-
- -
-
October 24, 2014
- -
- - -
- -
- - -
-
-
- - -
-
-
-
-
- - -
-
- In The Money -
-
- - - -
-
-

Show Me Strikes From

-
- $ - to $ -
- Apply Filter - Clear Filter -
- - - - - -
- -
-
-

Show Me Strikes From

-
- $ - to $ -
- Apply Filter - Clear Filter -
- - - - - -
- - -
- - -
-
-
- - - - - -
- -
- - - -
- - -
- -
-
- - - -
-
- - - - - - - - - - - - - - - -
- - - - - - - - - - - - \ No newline at end of file diff --git a/pandas/io/tests/data/yahoo_options2.html b/pandas/io/tests/data/yahoo_options2.html deleted file mode 100644 index bae9c193e03e1..0000000000000 --- a/pandas/io/tests/data/yahoo_options2.html +++ /dev/null @@ -1,5853 +0,0 @@ - - - - - AAPL Option Chain | Yahoo! Inc. Stock - Yahoo! Finance - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-
- -
-
- - -
- -
- -
- - - - - - - -
- - -
- - -
-
- - - - - -
-
-
- - - - - -
-
-
- - - - - -
- -
- - - -
-
-
-
- - - - Dow - - - - Up - - - 0.58% - - - - - - - Nasdaq - - - - Down - - - 0.06% - - - - - - -
- -
-
-
- - -
- -
-

More on AAPL

-
-
- - -

Quotes

- - -

Charts

- - -

News & Info

- - -

Company

- - -

Analyst Coverage

- - -

Ownership

- - -

Financials

- - - -
-
- -
-
- -
-
-
-
-
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
- - -
-
- - - - - -
-
-
- - - - -
-
-
-
-
-
-

Apple Inc. (AAPL)

- -
-
-
-
- - 108.86 - - - - - Up +0.26(0.24%) - - - NasdaqGS - As of 4:00PM EST - -
-
| - - After Hours: - 108.86 0.00 (0.00%) 7:59PM EST - - -
-
- -
- - -
-
-
-
-
- - - - -
-
-
- - - - - -
- -
- - - -
- - -
-
- - - - - -
-
-
- -
-
- -
-
November 7, 2014
- -
- - - -
-
-
- - -
-
-
-
-
- - -
-
- In The Money -
-
- - - -
-
-

Show Me Strikes From

-
- $ - to $ -
- Apply Filter - Clear Filter -
- - - - - -
- -
-
-

Show Me Strikes From

-
- $ - to $ -
- Apply Filter - Clear Filter -
- - - - - -
- - -
- - -
-
-
- - - - - -
- -
- - - -
- - -
- -
-
- - - -
-
- - - - - - - - - - - - - - - -
- - - - - - - - - - - - \ No newline at end of file diff --git a/pandas/io/tests/data/yahoo_options3.html b/pandas/io/tests/data/yahoo_options3.html deleted file mode 100644 index 6e79bb9bf9f36..0000000000000 --- a/pandas/io/tests/data/yahoo_options3.html +++ /dev/null @@ -1,2807 +0,0 @@ - - - - - SPWR Option Chain | Yahoo! Inc. Stock - Yahoo! Finance - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-
- -
  • FirefoxInstall the new Firefox »
  • -
    - - -
    - -
    - -
    - - - - - - - -
    - - -
    - - -
    - -
    - - - -
    -
    -
    -
    - - - - Dow - - - - Down - - - 0.58% - - - - - - - Nasdaq - - - - Down - - - 0.32% - - - - - - -
    - -
    -
    -
    - - -
    - -
    -

    More on SPWR

    -
    -
    - - -

    Quotes

    - - -

    Charts

    - - -

    News & Info

    - - -

    Company

    - - -

    Analyst Coverage

    - - -

    Ownership

    - - -

    Financials

    - - -
    -
    - -
    -
    - -
    -
    -
    -
    -
    - -
    -
    -
    -
    -
    -
    -
    -
    -
    -
    -
    -
    -
    -
    - - -
    - - -
    -
    -
    -
    -
    -
    -

    SunPower Corporation (SPWR)

    - -
    -
    -
    -
    - - 33.05 - - - - - Up +0.07(0.21%) - - - NASDAQ - As of 4:00PM EDT - -
    -
    | - - After Hours: - 33.10 Up +0.05 (0.15%) 7:47PM EDT - - -
    -
    - -
    - - -
    -
    -
    -
    -
    - - -
    - - - -
    - - -
    -
    - -
    -
    May 1, 2015
    - -
    - - - -
    -
    -
    - - -
    -
    -
    -
    -
    - - -
    -
    - In The Money -
    -
    - - - -
    -
    -

    Show Me Strikes From

    -
    - $ - to $ -
    - Apply Filter - Clear Filter -
    - - - - - -
    - -
    -
    -

    Show Me Strikes From

    -
    - $ - to $ -
    - Apply Filter - Clear Filter -
    - - - - - -
    - - -
    -
    - - - -
    - - -
    - -
    -
    - - - -
    -
    - - - - - - - - - - - - - - - -
    - - - - - - - - - - - - \ No newline at end of file diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py deleted file mode 100644 index 1efa8b13598a7..0000000000000 --- a/pandas/io/tests/test_data.py +++ /dev/null @@ -1,586 +0,0 @@ -# flake8: noqa - -from __future__ import print_function -from pandas import compat -import warnings -import nose -from nose.tools import assert_equal -from datetime import datetime -import os - -import numpy as np -import pandas as pd -from pandas import DataFrame, Timestamp -from pandas.util.testing import (assert_series_equal, assert_produces_warning, - network, assert_frame_equal) -import pandas.util.testing as tm - -with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - from pandas.io import data as web - -from pandas.io.data import DataReader, SymbolWarning, RemoteDataError, _yahoo_codes - -if compat.PY3: - from urllib.error import HTTPError -else: - from urllib2 import HTTPError - - -def _skip_if_no_lxml(): - try: - import lxml - except ImportError: - raise nose.SkipTest("no lxml") - -def _skip_if_no_bs(): - try: - import bs4 - import html5lib - except ImportError: - raise nose.SkipTest("no html5lib/bs4") - - -def assert_n_failed_equals_n_null_columns(wngs, obj, cls=SymbolWarning): - all_nan_cols = pd.Series(dict((k, pd.isnull(v).all()) for k, v in - compat.iteritems(obj))) - n_all_nan_cols = all_nan_cols.sum() - valid_warnings = pd.Series([wng for wng in wngs if wng.category == cls]) - assert_equal(len(valid_warnings), n_all_nan_cols) - failed_symbols = all_nan_cols[all_nan_cols].index - msgs = valid_warnings.map(lambda x: x.message) - assert msgs.str.contains('|'.join(failed_symbols)).all() - - -class TestGoogle(tm.TestCase): - @classmethod - def setUpClass(cls): - super(TestGoogle, cls).setUpClass() - cls.locales = tm.get_locales(prefix='en_US') - if not cls.locales: - raise nose.SkipTest("US English locale not available for testing") - - @classmethod - def tearDownClass(cls): - 
super(TestGoogle, cls).tearDownClass() - del cls.locales - - @network - def test_google(self): - # asserts that google is minimally working and that it throws - # an exception when DataReader can't get a 200 response from - # google - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - for locale in self.locales: - with tm.set_locale(locale): - panel = web.DataReader("F", 'google', start, end) - self.assertEqual(panel.Close[-1], 13.68) - - self.assertRaises(Exception, web.DataReader, "NON EXISTENT TICKER", - 'google', start, end) - - @network - def test_get_quote_fails(self): - self.assertRaises(NotImplementedError, web.get_quote_google, - pd.Series(['GOOG', 'AAPL', 'GOOG'])) - - @network - def test_get_goog_volume(self): - for locale in self.locales: - with tm.set_locale(locale): - df = web.get_data_google('GOOG').sort_index() - self.assertEqual(df.Volume.ix['JAN-02-2015'], 1446662) - - @network - def test_get_multi1(self): - for locale in self.locales: - sl = ['AAPL', 'AMZN', 'GOOG'] - with tm.set_locale(locale): - pan = web.get_data_google(sl, '2012', '2013') - ts = pan.Close.GOOG.index[pan.Close.AAPL < pan.Close.GOOG] - if (hasattr(pan, 'Close') and hasattr(pan.Close, 'GOOG') and - hasattr(pan.Close, 'AAPL')): - self.assertEqual(ts[0].dayofyear, 3) - else: - self.assertRaises(AttributeError, lambda: pan.Close) - - @network - def test_get_multi_invalid(self): - sl = ['AAPL', 'AMZN', 'INVALID'] - with tm.assert_produces_warning(SymbolWarning): - pan = web.get_data_google(sl, '2012') - self.assertIn('INVALID', pan.minor_axis) - - @network - def test_get_multi_all_invalid(self): - sl = ['INVALID', 'INVALID2', 'INVALID3'] - with tm.assert_produces_warning(SymbolWarning): - self.assertRaises(RemoteDataError, web.get_data_google, sl, '2012') - - @network - def test_get_multi2(self): - with warnings.catch_warnings(record=True) as w: - for locale in self.locales: - with tm.set_locale(locale): - pan = web.get_data_google(['GE', 'MSFT', 'INTC'], - 'JAN-01-12', 
'JAN-31-12') - result = pan.Close.ix['01-18-12'] - assert_n_failed_equals_n_null_columns(w, result) - - # sanity checking - - self.assertTrue(np.issubdtype(result.dtype, np.floating)) - result = pan.Open.ix['Jan-15-12':'Jan-20-12'] - self.assertEqual((4, 3), result.shape) - assert_n_failed_equals_n_null_columns(w, result) - - @network - def test_dtypes(self): - #GH3995, #GH8980 - data = web.get_data_google('F', start='JAN-01-10', end='JAN-27-13') - self.assertTrue(np.issubdtype(data.Open.dtype, np.number)) - self.assertTrue(np.issubdtype(data.Close.dtype, np.number)) - self.assertTrue(np.issubdtype(data.Low.dtype, np.number)) - self.assertTrue(np.issubdtype(data.High.dtype, np.number)) - self.assertTrue(np.issubdtype(data.Volume.dtype, np.number)) - - @network - def test_unicode_date(self): - #GH8967 - data = web.get_data_google('F', start='JAN-01-10', end='JAN-27-13') - self.assertEqual(data.index.name, 'Date') - - -class TestYahoo(tm.TestCase): - @classmethod - def setUpClass(cls): - super(TestYahoo, cls).setUpClass() - _skip_if_no_lxml() - - @network - def test_yahoo(self): - # asserts that yahoo is minimally working and that it throws - # an exception when DataReader can't get a 200 response from - # yahoo - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - self.assertEqual(web.DataReader("F", 'yahoo', start, end)['Close'][-1], - 13.68) - - @network - def test_yahoo_fails(self): - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - self.assertRaises(Exception, web.DataReader, "NON EXISTENT TICKER", - 'yahoo', start, end) - - @network - def test_get_quote_series(self): - df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG'])) - assert_series_equal(df.ix[0], df.ix[2]) - - @network - def test_get_quote_string(self): - df = web.get_quote_yahoo('GOOG') - - @network - def test_get_quote_string(self): - _yahoo_codes.update({'MarketCap': 'j1'}) - df = web.get_quote_yahoo('GOOG') - self.assertFalse(pd.isnull(df['MarketCap'][0])) - - @network 
- def test_get_quote_stringlist(self): - df = web.get_quote_yahoo(['GOOG', 'AAPL', 'GOOG']) - assert_series_equal(df.ix[0], df.ix[2]) - - @network - def test_get_components_dow_jones(self): - raise nose.SkipTest('unreliable test, receive partial components back for dow_jones') - - df = web.get_components_yahoo('^DJI') #Dow Jones - self.assertIsInstance(df, pd.DataFrame) - self.assertEqual(len(df), 30) - - @network - def test_get_components_dax(self): - raise nose.SkipTest('unreliable test, receive partial components back for dax') - - df = web.get_components_yahoo('^GDAXI') #DAX - self.assertIsInstance(df, pd.DataFrame) - self.assertEqual(len(df), 30) - self.assertEqual(df[df.name.str.contains('adidas', case=False)].index, - 'ADS.DE') - - @network - def test_get_components_nasdaq_100(self): - # as of 7/12/13 the conditional will test false because the link is invalid - raise nose.SkipTest('unreliable test, receive partial components back for nasdaq_100') - - df = web.get_components_yahoo('^NDX') #NASDAQ-100 - self.assertIsInstance(df, pd.DataFrame) - - if len(df) > 1: - # Usual culprits, should be around for a while - self.assertTrue('AAPL' in df.index) - self.assertTrue('GOOG' in df.index) - self.assertTrue('AMZN' in df.index) - else: - expected = DataFrame({'exchange': 'N/A', 'name': '@^NDX'}, - index=['@^NDX']) - assert_frame_equal(df, expected) - - @network - def test_get_data_single_symbol(self): - #single symbol - #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d - # just test that we succeed - web.get_data_yahoo('GOOG') - - @network - def test_get_data_interval(self): - # daily interval data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', interval='d') - self.assertEqual(len(pan), 252) - - # weekly interval data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', interval='w') - self.assertEqual(len(pan), 53) - - # montly interval data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', 
interval='m') - self.assertEqual(len(pan), 12) - - # dividend data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', interval='v') - self.assertEqual(len(pan), 4) - - # test fail on invalid interval - self.assertRaises(ValueError, web.get_data_yahoo, 'XOM', interval='NOT VALID') - - @network - def test_get_data_multiple_symbols(self): - # just test that we succeed - sl = ['AAPL', 'AMZN', 'GOOG'] - web.get_data_yahoo(sl, '2012') - - @network - def test_get_data_multiple_symbols_two_dates(self): - pan = web.get_data_yahoo(['GE', 'MSFT', 'INTC'], 'JAN-01-12', - 'JAN-31-12') - result = pan.Close.ix['01-18-12'] - self.assertEqual(len(result), 3) - - # sanity checking - self.assertTrue(np.issubdtype(result.dtype, np.floating)) - - expected = np.array([[18.99, 28.4, 25.18], - [18.58, 28.31, 25.13], - [19.03, 28.16, 25.52], - [18.81, 28.82, 25.87]]) - result = pan.Open.ix['Jan-15-12':'Jan-20-12'] - self.assertEqual(expected.shape, result.shape) - - @network - def test_get_date_ret_index(self): - pan = web.get_data_yahoo(['GE', 'INTC', 'IBM'], '1977', '1987', - ret_index=True) - self.assertTrue(hasattr(pan, 'Ret_Index')) - if hasattr(pan, 'Ret_Index') and hasattr(pan.Ret_Index, 'INTC'): - tstamp = pan.Ret_Index.INTC.first_valid_index() - result = pan.Ret_Index.ix[tstamp]['INTC'] - self.assertEqual(result, 1.0) - - # sanity checking - self.assertTrue(np.issubdtype(pan.values.dtype, np.floating)) - - -class TestYahooOptions(tm.TestCase): - - @classmethod - def setUpClass(cls): - super(TestYahooOptions, cls).setUpClass() - raise nose.SkipTest('disable Yahoo Options tests') - - _skip_if_no_lxml() - _skip_if_no_bs() - raise nose.SkipTest('unreliable test') - - # aapl has monthlies - cls.aapl = web.Options('aapl', 'yahoo') - d = (Timestamp.today() + pd.offsets.MonthBegin(1)).normalize() - cls.year = d.year - cls.month = d.month - cls.expiry = d - cls.expiry2 = d + pd.offsets.MonthBegin(1) - cls.dirpath = tm.get_data_path() - cls.html1 = os.path.join(cls.dirpath, 
'yahoo_options1.html') - cls.html2 = os.path.join(cls.dirpath, 'yahoo_options2.html') - cls.html3 = os.path.join(cls.dirpath, 'yahoo_options3.html') #Empty table GH#22 - cls.data1 = cls.aapl._option_frames_from_url(cls.html1)['puts'] - - @classmethod - def tearDownClass(cls): - super(TestYahooOptions, cls).tearDownClass() - del cls.aapl, cls.expiry - - @network - def test_get_options_data(self): - # regression test GH6105 - self.assertRaises(ValueError, self.aapl.get_options_data, month=3) - self.assertRaises(ValueError, self.aapl.get_options_data, year=1992) - - try: - options = self.aapl.get_options_data(expiry=self.expiry) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(options) > 1) - - @network - def test_get_near_stock_price(self): - try: - options = self.aapl.get_near_stock_price(call=True, put=True, - expiry=[self.expiry,self.expiry2]) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(options) > 1) - - @network - def test_get_call_data(self): - try: - calls = self.aapl.get_call_data(expiry=self.expiry) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(calls) > 1) - - @network - def test_get_put_data(self): - try: - puts = self.aapl.get_put_data(expiry=self.expiry) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(puts) > 1) - - @network - def test_get_expiry_dates(self): - try: - dates, _ = self.aapl._get_expiry_dates_and_links() - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(dates) > 1) - - @network - def test_get_all_data(self): - - try: - data = self.aapl.get_all_data(put=True) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(data) > 1) - - @network - def test_get_data_with_list(self): - try: - data = self.aapl.get_call_data(expiry=self.aapl.expiry_dates) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(data) > 1) - - @network - def 
test_get_all_data_calls_only(self): - try: - data = self.aapl.get_all_data(call=True, put=False) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(data) > 1) - - @network - def test_get_underlying_price(self): - #GH7 - try: - options_object = web.Options('^spxpm', 'yahoo') - url = options_object._yahoo_url_from_expiry(options_object.expiry_dates[0]) - root = options_object._parse_url(url) - quote_price = options_object._underlying_price_from_root(root) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertIsInstance(quote_price, float) - - def test_sample_page_price_quote_time1(self): - #Tests the weekend quote time format - price, quote_time = self.aapl._underlying_price_and_time_from_url(self.html1) - self.assertIsInstance(price, (int, float, complex)) - self.assertIsInstance(quote_time, (datetime, Timestamp)) - - def test_chop(self): - #regression test for #7625 - self.aapl.chop_data(self.data1, above_below=2, underlying_price=np.nan) - chopped = self.aapl.chop_data(self.data1, above_below=2, underlying_price=100) - self.assertIsInstance(chopped, DataFrame) - self.assertTrue(len(chopped) > 1) - - def test_chop_out_of_strike_range(self): - #regression test for #7625 - self.aapl.chop_data(self.data1, above_below=2, underlying_price=np.nan) - chopped = self.aapl.chop_data(self.data1, above_below=2, underlying_price=100000) - self.assertIsInstance(chopped, DataFrame) - self.assertTrue(len(chopped) > 1) - - - @network - def test_sample_page_price_quote_time2(self): - #Tests the EDT page format - #regression test for #8741 - price, quote_time = self.aapl._underlying_price_and_time_from_url(self.html2) - self.assertIsInstance(price, (int, float, complex)) - self.assertIsInstance(quote_time, (datetime, Timestamp)) - - @network - def test_sample_page_chg_float(self): - #Tests that numeric columns with comma's are appropriately dealt with - self.assertEqual(self.data1['Chg'].dtype, 'float64') - - @network - def 
test_month_year(self): - try: - data = self.aapl.get_call_data(month=self.month, year=self.year) - except RemoteDataError as e: - raise nose.SkipTest(e) - - self.assertTrue(len(data) > 1) - - @network - def test_empty_table(self): - #GH22 - empty = self.aapl._option_frames_from_url(self.html3)['puts'] - self.assertTrue(len(empty) == 0) - - -class TestOptionsWarnings(tm.TestCase): - @classmethod - def setUpClass(cls): - super(TestOptionsWarnings, cls).setUpClass() - - @classmethod - def tearDownClass(cls): - super(TestOptionsWarnings, cls).tearDownClass() - - @network - def test_options_source_warning(self): - with assert_produces_warning(): - aapl = web.Options('aapl') - - -class TestDataReader(tm.TestCase): - - @network - def test_read_yahoo(self): - gs = DataReader("GS", "yahoo") - self.assertIsInstance(gs, DataFrame) - - @network - def test_read_google(self): - gs = DataReader("GS", "google") - self.assertIsInstance(gs, DataFrame) - - @network - def test_read_fred(self): - vix = DataReader("VIXCLS", "fred") - self.assertIsInstance(vix, DataFrame) - - @network - def test_read_famafrench(self): - raise nose.SkipTest('buggy as of 2/14/16; maybe a data revision?') - for name in ("F-F_Research_Data_Factors", - "F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3", - "F-F_ST_Reversal_Factor", "F-F_Momentum_Factor"): - ff = DataReader(name, "famafrench") - self.assertTrue(ff is not None) - self.assertIsInstance(ff, dict) - - -class TestFred(tm.TestCase): - - @classmethod - def setUpClass(cls): - super(TestFred, cls).setUpClass() - raise nose.SkipTest('disable Fred tests') - - @network - def test_fred(self): - raise nose.SkipTest('buggy as of 2/14/16; maybe a data revision?') - - # Throws an exception when DataReader can't get a 200 response from - # FRED. 
- - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - received = web.DataReader("GDP", "fred", start, end)['GDP'].tail(1)[0] - self.assertTrue(int(received) > 10000) - - self.assertRaises(Exception, web.DataReader, "NON EXISTENT SERIES", - 'fred', start, end) - - @network - def test_fred_nan(self): - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - df = web.DataReader("DFII5", "fred", start, end) - self.assertTrue(pd.isnull(df.ix['2010-01-01'][0])) - - @network - def test_fred_parts(self): - raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') - - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - df = web.get_data_fred("CPIAUCSL", start, end) - self.assertEqual(df.ix['2010-05-01'][0], 217.23) - - t = df.CPIAUCSL.values - self.assertTrue(np.issubdtype(t.dtype, np.floating)) - self.assertEqual(t.shape, (37,)) - - @network - def test_fred_part2(self): - expected = [[576.7], - [962.9], - [684.7], - [848.3], - [933.3]] - result = web.get_data_fred("A09024USA144NNBR", start="1915").ix[:5] - tm.assert_numpy_array_equal(result.values, np.array(expected)) - - @network - def test_invalid_series(self): - name = "NOT A REAL SERIES" - self.assertRaises(Exception, web.get_data_fred, name) - - @network - def test_fred_multi(self): - raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') - - names = ['CPIAUCSL', 'CPALTT01USQ661S', 'CPILFESL'] - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - received = web.DataReader(names, "fred", start, end).head(1) - expected = DataFrame([[217.478, 0.99701529, 220.544]], columns=names, - index=[pd.tslib.Timestamp('2010-01-01 00:00:00')]) - expected.index.rename('DATE', inplace=True) - assert_frame_equal(received, expected, check_less_precise=True) - - @network - def test_fred_multi_bad_series(self): - - names = ['NOTAREALSERIES', 'CPIAUCSL', "ALSO FAKE"] - with tm.assertRaises(HTTPError): - DataReader(names, data_source="fred") - - -if __name__ == '__main__': - 
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/tests/test_wb.py b/pandas/io/tests/test_wb.py deleted file mode 100644 index 42884b19de03a..0000000000000 --- a/pandas/io/tests/test_wb.py +++ /dev/null @@ -1,114 +0,0 @@ -# flake8: noqa - -import nose - -import pandas -from pandas.compat import u -from pandas.util.testing import network -from pandas.util.testing import assert_frame_equal -import pandas.util.testing as tm - -# deprecated -with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - from pandas.io.wb import search, download, get_countries - -class TestWB(tm.TestCase): - - @tm.slow - @network - def test_wdi_search(self): - - # Test that a name column exists, and that some results were returned - # ...without being too strict about what the actual contents of the - # results actually are. The fact that there are some, is good enough. - - result = search('gdp.*capita.*constant') - self.assertTrue(result.name.str.contains('GDP').any()) - - @tm.slow - @network - def test_wdi_download(self): - - # Test a bad indicator with double (US), triple (USA), - # standard (CA, MX), non standard (KSV), - # duplicated (US, US, USA), and unknown (BLA) country codes - - # ...but NOT a crash inducing country code (World bank strips pandas - # users of the luxury of laziness, because they create their - # own exceptions, and don't clean up legacy country codes. - # ...but NOT a retired indicator (User should want it to error.) 
- - cntry_codes = ['CA', 'MX', 'USA', 'US', 'US', 'KSV', 'BLA'] - inds = ['NY.GDP.PCAP.CD','BAD.INDICATOR'] - - expected = {'NY.GDP.PCAP.CD': {('Canada', '2003'): 28026.006013044702, ('Mexico', '2003'): 6601.0420648056606, ('Canada', '2004'): 31829.522562759001, ('Kosovo', '2003'): 1969.56271307405, ('Mexico', '2004'): 7042.0247834044303, ('United States', '2004'): 41928.886136479705, ('United States', '2003'): 39682.472247320402, ('Kosovo', '2004'): 2135.3328465238301}} - expected = pandas.DataFrame(expected) - #Round, to ignore revisions to data. - expected = pandas.np.round(expected,decimals=-3) - expected.sort(inplace=True) - result = download(country=cntry_codes, indicator=inds, - start=2003, end=2004, errors='ignore') - result.sort(inplace=True) - #Round, to ignore revisions to data. - result = pandas.np.round(result,decimals=-3) - expected.index = result.index - assert_frame_equal(result, pandas.DataFrame(expected)) - - @tm.slow - @network - def test_wdi_download_w_retired_indicator(self): - - cntry_codes = ['CA', 'MX', 'US'] - # Despite showing up in the search feature, and being listed online, - # the api calls to GDPPCKD don't work in their own query builder, nor - # pandas module. GDPPCKD used to be a common symbol. - # This test is written to ensure that error messages to pandas users - # continue to make sense, rather than a user getting some missing - # key error, cause their JSON message format changed. If - # World bank ever finishes the deprecation of this symbol, - # this nose test should still pass. - - inds = ['GDPPCKD'] - - try: - result = download(country=cntry_codes, indicator=inds, - start=2003, end=2004, errors='ignore') - # If for some reason result actually ever has data, it's cause WB - # fixed the issue with this ticker. Find another bad one. - except ValueError as e: - raise nose.SkipTest("No indicators returned data: {0}".format(e)) - - # if it ever gets here, it means WB unretired the indicator. 
- # even if they dropped it completely, it would still get caught above - # or the WB API changed somehow in a really unexpected way. - if len(result) > 0: - raise nose.SkipTest("Invalid results") - - @tm.slow - @network - def test_wdi_download_w_crash_inducing_countrycode(self): - - cntry_codes = ['CA', 'MX', 'US', 'XXX'] - inds = ['NY.GDP.PCAP.CD'] - - try: - result = download(country=cntry_codes, indicator=inds, - start=2003, end=2004, errors='ignore') - except ValueError as e: - raise nose.SkipTest("No indicators returned data: {0}".format(e)) - - # if it ever gets here, it means the country code XXX got used by WB - # or the WB API changed somehow in a really unexpected way. - if len(result) > 0: - raise nose.SkipTest("Invalid results") - - @tm.slow - @network - def test_wdi_get_countries(self): - result = get_countries() - self.assertTrue('Zimbabwe' in list(result['name'])) - self.assertTrue(len(result) > 100) - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/wb.py b/pandas/io/wb.py index 81b4947f06b16..5dc4d9ce1adc4 100644 --- a/pandas/io/wb.py +++ b/pandas/io/wb.py @@ -1,314 +1,6 @@ -# -*- coding: utf-8 -*- - -# flake8: noqa - -from __future__ import print_function - -from pandas.compat import map, reduce, range, lrange -from pandas.io.common import urlopen -from pandas.io import json -import pandas -import numpy as np -import warnings - -warnings.warn("\n" - "The pandas.io.wb module is moved to a separate package " - "(pandas-datareader) and will be removed from pandas in a " - "future version.\nAfter installing the pandas-datareader package " - "(https://github.com/pydata/pandas-datareader), you can change " - "the import ``from pandas.io import data, wb`` to " - "``from pandas_datareader import data, wb``.", - FutureWarning) - - -# This list of country codes was pulled from wikipedia during October 2014. 
-# While some exceptions do exist, it is the best proxy for countries supported -# by World Bank. It is an aggregation of the 2-digit ISO 3166-1 alpha-2, and -# 3-digit ISO 3166-1 alpha-3, codes, with 'all', 'ALL', and 'All' appended ot -# the end. - -country_codes = ['AD', 'AE', 'AF', 'AG', 'AI', 'AL', 'AM', 'AO', 'AQ', 'AR', \ - 'AS', 'AT', 'AU', 'AW', 'AX', 'AZ', 'BA', 'BB', 'BD', 'BE', \ - 'BF', 'BG', 'BH', 'BI', 'BJ', 'BL', 'BM', 'BN', 'BO', 'BQ', \ - 'BR', 'BS', 'BT', 'BV', 'BW', 'BY', 'BZ', 'CA', 'CC', 'CD', \ - 'CF', 'CG', 'CH', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'CR', \ - 'CU', 'CV', 'CW', 'CX', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM', \ - 'DO', 'DZ', 'EC', 'EE', 'EG', 'EH', 'ER', 'ES', 'ET', 'FI', \ - 'FJ', 'FK', 'FM', 'FO', 'FR', 'GA', 'GB', 'GD', 'GE', 'GF', \ - 'GG', 'GH', 'GI', 'GL', 'GM', 'GN', 'GP', 'GQ', 'GR', 'GS', \ - 'GT', 'GU', 'GW', 'GY', 'HK', 'HM', 'HN', 'HR', 'HT', 'HU', \ - 'ID', 'IE', 'IL', 'IM', 'IN', 'IO', 'IQ', 'IR', 'IS', 'IT', \ - 'JE', 'JM', 'JO', 'JP', 'KE', 'KG', 'KH', 'KI', 'KM', 'KN', \ - 'KP', 'KR', 'KW', 'KY', 'KZ', 'LA', 'LB', 'LC', 'LI', 'LK', \ - 'LR', 'LS', 'LT', 'LU', 'LV', 'LY', 'MA', 'MC', 'MD', 'ME', \ - 'MF', 'MG', 'MH', 'MK', 'ML', 'MM', 'MN', 'MO', 'MP', 'MQ', \ - 'MR', 'MS', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'NA', \ - 'NC', 'NE', 'NF', 'NG', 'NI', 'NL', 'NO', 'NP', 'NR', 'NU', \ - 'NZ', 'OM', 'PA', 'PE', 'PF', 'PG', 'PH', 'PK', 'PL', 'PM', \ - 'PN', 'PR', 'PS', 'PT', 'PW', 'PY', 'QA', 'RE', 'RO', 'RS', \ - 'RU', 'RW', 'SA', 'SB', 'SC', 'SD', 'SE', 'SG', 'SH', 'SI', \ - 'SJ', 'SK', 'SL', 'SM', 'SN', 'SO', 'SR', 'SS', 'ST', 'SV', \ - 'SX', 'SY', 'SZ', 'TC', 'TD', 'TF', 'TG', 'TH', 'TJ', 'TK', \ - 'TL', 'TM', 'TN', 'TO', 'TR', 'TT', 'TV', 'TW', 'TZ', 'UA', \ - 'UG', 'UM', 'US', 'UY', 'UZ', 'VA', 'VC', 'VE', 'VG', 'VI', \ - 'VN', 'VU', 'WF', 'WS', 'YE', 'YT', 'ZA', 'ZM', 'ZW', \ - 'ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', \ - 'ARG', 'ARM', 'ASM', 'ATA', 'ATF', 'ATG', 'AUS', 'AUT', \ - 'AZE', 'BDI', 
'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', \ - 'BHR', 'BHS', 'BIH', 'BLM', 'BLR', 'BLZ', 'BMU', 'BOL', \ - 'BRA', 'BRB', 'BRN', 'BTN', 'BVT', 'BWA', 'CAF', 'CAN', \ - 'CCK', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', \ - 'COK', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CUW', 'CXR', \ - 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', \ - 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH', \ - 'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', \ - 'GEO', 'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', \ - 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', \ - 'HKG', 'HMD', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', \ - 'IND', 'IOT', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', \ - 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', \ - 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', \ - 'LCA', 'LIE', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC', \ - 'MAF', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', \ - 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MNP', 'MOZ', \ - 'MRT', 'MSR', 'MTQ', 'MUS', 'MWI', 'MYS', 'MYT', 'NAM', \ - 'NCL', 'NER', 'NFK', 'NGA', 'NIC', 'NIU', 'NLD', 'NOR', \ - 'NPL', 'NRU', 'NZL', 'OMN', 'PAK', 'PAN', 'PCN', 'PER', \ - 'PHL', 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', \ - 'PSE', 'PYF', 'QAT', 'REU', 'ROU', 'RUS', 'RWA', 'SAU', \ - 'SDN', 'SEN', 'SGP', 'SGS', 'SHN', 'SJM', 'SLB', 'SLE', \ - 'SLV', 'SMR', 'SOM', 'SPM', 'SRB', 'SSD', 'STP', 'SUR', \ - 'SVK', 'SVN', 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', \ - 'TCD', 'TGO', 'THA', 'TJK', 'TKL', 'TKM', 'TLS', 'TON', \ - 'TTO', 'TUN', 'TUR', 'TUV', 'TWN', 'TZA', 'UGA', 'UKR', \ - 'UMI', 'URY', 'USA', 'UZB', 'VAT', 'VCT', 'VEN', 'VGB', \ - 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'YEM', 'ZAF', 'ZMB', \ - 'ZWE', 'all', 'ALL', 'All'] - -def download(country=None, indicator=None, - start=2003, end=2005,errors='warn'): - """ - Download data series from the World Bank's World Development Indicators - - Parameters - ---------- - - indicator: string or list of strings - taken 
from the ``id`` field in ``WDIsearch()`` - - country: string or list of strings. - ``all`` downloads data for all countries - 2 or 3 character ISO country codes select individual - countries (e.g.``US``,``CA``) or (e.g.``USA``,``CAN``). The codes - can be mixed. - - The two ISO lists of countries, provided by wikipedia, are hardcoded - into pandas as of 11/10/2014. - - start: int - First year of the data series - - end: int - Last year of the data series (inclusive) - - errors: str {'ignore', 'warn', 'raise'}, default 'warn' - Country codes are validated against a hardcoded list. This controls - the outcome of that validation, and attempts to also apply - to the results from world bank. - - errors='raise', will raise a ValueError on a bad country code. - - Returns - ------- - - ``pandas`` DataFrame with columns: country, iso_code, year, - indicator value. - - """ - if country is None: - country = ['MX', 'CA', 'US'] - if indicator is None: - indicator = ['NY.GDP.MKTP.CD', 'NY.GNS.ICTR.ZS'] - - if type(country) == str: - country = [country] - - bad_countries = np.setdiff1d(country, country_codes) - - # Validate the input - if len(bad_countries) > 0: - tmp = ", ".join(bad_countries) - if errors == 'raise': - raise ValueError("Invalid Country Code(s): %s" % tmp) - if errors == 'warn': - warnings.warn('Non-standard ISO country codes: %s' % tmp) - - # Work with a list of indicators - if type(indicator) == str: - indicator = [indicator] - - # Download - data = [] - bad_indicators = {} - for ind in indicator: - one_indicator_data,msg = _get_data(ind, country, start, end) - if msg == "Success": - data.append(one_indicator_data) - else: - bad_indicators[ind] = msg - - if len(bad_indicators.keys()) > 0: - bad_ind_msgs = [i + " : " + m for i,m in bad_indicators.items()] - bad_ind_msgs = "\n\n".join(bad_ind_msgs) - bad_ind_msgs = "\n\nInvalid Indicators:\n\n%s" % bad_ind_msgs - if errors == 'raise': - raise ValueError(bad_ind_msgs) - if errors == 'warn': - 
warnings.warn(bad_ind_msgs) - - # Confirm we actually got some data, and build Dataframe - if len(data) > 0: - out = reduce(lambda x, y: x.merge(y, how='outer'), data) - out = out.drop('iso_code', axis=1) - out = out.set_index(['country', 'year']) - out = out._convert(datetime=True, numeric=True) - return out - else: - msg = "No indicators returned data." - if errors == 'ignore': - msg += " Set errors='warn' for more information." - raise ValueError(msg) - -def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US', - start=2002, end=2005): - - if type(country) == str: - country = [country] - - countries = ';'.join(country) - - # Build URL for api call - url = ("http://api.worldbank.org/countries/" + countries + "/indicators/" + - indicator + "?date=" + str(start) + ":" + str(end) + - "&per_page=25000&format=json") - - # Download - with urlopen(url) as response: - data = response.read() - - # Check to see if there is a possible problem - possible_message = json.loads(data)[0] - if 'message' in possible_message.keys(): - msg = possible_message['message'][0] - try: - msg = msg['key'].split() + ["\n "] + msg['value'].split() - wb_err = ' '.join(msg) - except: - wb_err = "" - if 'key' in msg.keys(): - wb_err = msg['key'] + "\n " - if 'value' in msg.keys(): - wb_err += msg['value'] - error_msg = "Problem with a World Bank Query \n %s" - return None, error_msg % wb_err - - if 'total' in possible_message.keys(): - if possible_message['total'] == 0: - return None, "No results from world bank." 
- - # Parse JSON file - data = json.loads(data)[1] - country = [x['country']['value'] for x in data] - iso_code = [x['country']['id'] for x in data] - year = [x['date'] for x in data] - value = [x['value'] for x in data] - # Prepare output - out = pandas.DataFrame([country, iso_code, year, value]).T - out.columns = ['country', 'iso_code', 'year', indicator] - return out,"Success" - -def get_countries(): - """Query information about countries - """ - url = 'http://api.worldbank.org/countries/?per_page=1000&format=json' - with urlopen(url) as response: - data = response.read() - data = json.loads(data)[1] - data = pandas.DataFrame(data) - data.adminregion = [x['value'] for x in data.adminregion] - data.incomeLevel = [x['value'] for x in data.incomeLevel] - data.lendingType = [x['value'] for x in data.lendingType] - data.region = [x['value'] for x in data.region] - data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'}) - return data - -def get_indicators(): - """Download information about all World Bank data series - """ - url = 'http://api.worldbank.org/indicators?per_page=50000&format=json' - with urlopen(url) as response: - data = response.read() - data = json.loads(data)[1] - data = pandas.DataFrame(data) - # Clean fields - data.source = [x['value'] for x in data.source] - fun = lambda x: x.encode('ascii', 'ignore') - data.sourceOrganization = data.sourceOrganization.apply(fun) - # Clean topic field - - def get_value(x): - try: - return x['value'] - except: - return '' - fun = lambda x: [get_value(y) for y in x] - data.topics = data.topics.apply(fun) - data.topics = data.topics.apply(lambda x: ' ; '.join(x)) - # Clean outpu - data = data.sort(columns='id') - data.index = pandas.Index(lrange(data.shape[0])) - return data - -_cached_series = None - - -def search(string='gdp.*capi', field='name', case=False): - """ - Search available data series from the world bank - - Parameters - ---------- - - string: string - regular expression - field: string - id, 
name, source, sourceNote, sourceOrganization, topics - See notes below - case: bool - case sensitive search? - - Notes - ----- - - The first time this function is run it will download and cache the full - list of available series. Depending on the speed of your network - connection, this can take time. Subsequent searches will use the cached - copy, so they should be much faster. - - id : Data series indicator (for use with the ``indicator`` argument of - ``WDI()``) e.g. NY.GNS.ICTR.GN.ZS" - name: Short description of the data series - source: Data collection project - sourceOrganization: Data collection organization - note: - sourceNote: - topics: - """ - # Create cached list of series if it does not exist - global _cached_series - if type(_cached_series) is not pandas.core.frame.DataFrame: - _cached_series = get_indicators() - data = _cached_series[field] - idx = data.str.contains(string, case=case) - out = _cached_series.ix[idx].dropna() - return out +raise ImportError( + "The pandas.io.wb module is moved to a separate package " + "(pandas-datareader). After installing the pandas-datareader package " + "(https://github.com/pydata/pandas-datareader), you can change " + "the import ``from pandas.io import data, wb`` to " + "``from pandas_datareader import data, wb``.")