diff --git a/pandas_datareader/__init__.py b/pandas_datareader/__init__.py
index b3d0f9b8..eba620ba 100644
--- a/pandas_datareader/__init__.py
+++ b/pandas_datareader/__init__.py
@@ -1,4 +1,4 @@
-__version__ = version = '0.4.0'
+__version__ = version = '0.4.1'
 
 from .data import (get_components_yahoo, get_data_famafrench, get_data_google, get_data_yahoo, get_data_enigma,  # noqa
                    get_data_yahoo_actions, get_quote_google, get_quote_yahoo, DataReader, Options)  # noqa
diff --git a/pandas_datareader/base.py b/pandas_datareader/base.py
index 3f0faf03..6eb421ce 100644
--- a/pandas_datareader/base.py
+++ b/pandas_datareader/base.py
@@ -53,6 +53,7 @@ def __init__(self, symbols, start=None, end=None,
         self.retry_count = retry_count
         self.pause = pause
         self.timeout = timeout
+        self.pause_multiplier = 1
         self.session = _init_session(session, retry_count)
 
     @property
@@ -85,6 +86,10 @@ def _read_url_as_StringIO(self, url, params=None):
         response = self._get_response(url, params=params)
         text = self._sanitize_response(response)
         out = StringIO()
+        if not text:
+            service = self.__class__.__name__
+            raise IOError("{} request returned no data; check URL for invalid "
+                          "inputs: {}".format(service, url))
         if isinstance(text, compat.binary_type):
             out.write(bytes_to_str(text))
         else:
@@ -99,7 +104,7 @@ def _sanitize_response(response):
         """
         return response.content
 
-    def _get_response(self, url, params=None):
+    def _get_response(self, url, params=None, headers=None):
         """ send raw HTTP request to get requests.Response from the specified url
         Parameters
         ----------
@@ -110,17 +115,29 @@ def _get_response(self, url, params=None):
         """
 
         # initial attempt + retry
+        pause = self.pause
         for i in range(self.retry_count + 1):
-            response = self.session.get(url, params=params)
+            response = self.session.get(url, params=params, headers=headers)
             if response.status_code == requests.codes.ok:
                 return response
-            time.sleep(self.pause)
+            time.sleep(pause)
+
+            # Increase time between subsequent requests, per subclass.
+            pause *= self.pause_multiplier
+            # Refresh the crumb in the params dict in case ours was invalidated
+            if isinstance(params, dict) and 'crumb' in params:
+                params['crumb'] = self._get_crumb(self.retry_count)
         if params is not None and len(params) > 0:
             url = url + "?" + urlencode(params)
         raise RemoteDataError('Unable to read URL: {0}'.format(url))
 
+    def _get_crumb(self, *args):
+        """ To be implemented by subclass """
+        raise NotImplementedError("Subclass has not implemented method.")
+
     def _read_lines(self, out):
-        rs = read_csv(out, index_col=0, parse_dates=True, na_values='-')[::-1]
+        rs = read_csv(out, index_col=0, parse_dates=True,
+                      na_values=('-', 'null'))[::-1]
         # Yahoo! Finance sometimes does this awesome thing where they
         # return 2 rows for the most recent business day
         if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
diff --git a/pandas_datareader/data.py b/pandas_datareader/data.py
index da090872..08de52c1 100644
--- a/pandas_datareader/data.py
+++ b/pandas_datareader/data.py
@@ -9,7 +9,7 @@
 
 from pandas_datareader.yahoo.daily import YahooDailyReader
 from pandas_datareader.yahoo.quotes import YahooQuotesReader
-from pandas_datareader.yahoo.actions import YahooActionReader
+from pandas_datareader.yahoo.actions import (YahooActionReader, YahooDivReader)
 from pandas_datareader.yahoo.components import _get_data as get_components_yahoo  # noqa
 from pandas_datareader.yahoo.options import Options as YahooOptions
 from pandas_datareader.google.options import Options as GoogleOptions
@@ -121,10 +121,10 @@ def DataReader(name, data_source=None, start=None, end=None,
                                  retry_count=retry_count, pause=pause,
                                  session=session).read()
     elif data_source == "yahoo-dividends":
-        return YahooDailyReader(symbols=name, start=start, end=end,
-                                adjust_price=False, chunksize=25,
-                                retry_count=retry_count, pause=pause,
-                                session=session, interval='v').read()
+        return YahooDivReader(symbols=name, start=start, end=end,
+                              adjust_price=False, chunksize=25,
+
retry_count=retry_count, pause=pause,
+                              session=session, interval='d').read()
 
     elif data_source == "google":
         return GoogleDailyReader(symbols=name, start=start, end=end,
diff --git a/pandas_datareader/tests/yahoo/test_yahoo.py b/pandas_datareader/tests/yahoo/test_yahoo.py
index 925b5383..e68877bf 100644
--- a/pandas_datareader/tests/yahoo/test_yahoo.py
+++ b/pandas_datareader/tests/yahoo/test_yahoo.py
@@ -10,7 +10,9 @@
 
 import pandas_datareader.data as web
 from pandas_datareader.data import YahooDailyReader
+from pandas_datareader._utils import RemoteDataError
 from pandas_datareader.yahoo.quotes import _yahoo_codes
+from pandas_datareader._testing import skip_on_exception
 
 
 class TestYahoo(object):
@@ -87,18 +89,21 @@ def test_get_components_nasdaq_100(self):  # pragma: no cover
                                  index=['@^NDX'])
         tm.assert_frame_equal(df, expected)
 
+    @skip_on_exception(RemoteDataError)
     def test_get_data_single_symbol(self):
         # single symbol
         # http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d
         # just test that we succeed
         web.get_data_yahoo('GOOG')
 
+    @skip_on_exception(RemoteDataError)
     def test_get_data_adjust_price(self):
         goog = web.get_data_yahoo('GOOG')
         goog_adj = web.get_data_yahoo('GOOG', adjust_price=True)
         assert 'Adj Close' not in goog_adj.columns
         assert (goog['Open'] * goog_adj['Adj_Ratio']).equals(goog_adj['Open'])
 
+    @skip_on_exception(RemoteDataError)
     def test_get_data_interval(self):
         # daily interval data
         pan = web.get_data_yahoo('XOM', '2013-01-01',
@@ -108,43 +113,42 @@ def test_get_data_interval(self):
         # weekly interval data
         pan = web.get_data_yahoo('XOM', '2013-01-01',
                                  '2013-12-31', interval='w')
-        assert len(pan) == 53
+        assert len(pan) == 52
 
-        # montly interval data
-        pan = web.get_data_yahoo('XOM', '2013-01-01',
+        # monthly interval data
+        pan = web.get_data_yahoo('XOM', '2012-12-31',
                                  '2013-12-31', interval='m')
         assert len(pan) == 12
 
-        # dividend data
-        pan = web.get_data_yahoo('XOM', '2013-01-01',
-                                 '2013-12-31', interval='v')
-        assert len(pan) == 4
-
         # test fail on invalid interval
         with pytest.raises(ValueError):
             web.get_data_yahoo('XOM', interval='NOT VALID')
 
+    @skip_on_exception(RemoteDataError)
     def test_get_data_multiple_symbols(self):
         # just test that we succeed
         sl = ['AAPL', 'AMZN', 'GOOG']
         web.get_data_yahoo(sl, '2012')
 
+    @skip_on_exception(RemoteDataError)
     def test_get_data_multiple_symbols_two_dates(self):
         pan = web.get_data_yahoo(['GE', 'MSFT', 'INTC'], 'JAN-01-12',
                                  'JAN-31-12')
-        result = pan.Close.ix['01-18-12']
-        assert len(result) == 3
+        result = pan.Close['01-18-12'].T
+        assert result.size == 3
 
         # sanity checking
-        assert np.issubdtype(result.dtype, np.floating)
+        assert all(np.issubdtype(dt, np.floating) for dt in result.dtypes)
 
         expected = np.array([[18.99, 28.4, 25.18],
                              [18.58, 28.31, 25.13],
                              [19.03, 28.16, 25.52],
                              [18.81, 28.82, 25.87]])
-        result = pan.Open.ix['Jan-15-12':'Jan-20-12']
+        df = pan.Open
+        result = df[(df.index >= 'Jan-15-12') & (df.index <= 'Jan-20-12')]
         assert expected.shape == result.shape
 
+    @skip_on_exception(RemoteDataError)
     def test_get_date_ret_index(self):
         pan = web.get_data_yahoo(['GE', 'INTC', 'IBM'], '1977', '1987',
                                  ret_index=True)
@@ -158,6 +162,7 @@ def test_get_date_ret_index(self):
         # sanity checking
         assert np.issubdtype(pan.values.dtype, np.floating)
 
+    @skip_on_exception(RemoteDataError)
     def test_get_data_yahoo_actions(self):
         start = datetime(1990, 1, 1)
         end = datetime(2000, 4, 5)
@@ -191,6 +196,7 @@ def test_yahoo_reader_class(self):
         r = YahooDailyReader('GOOG', session=session)
         assert r.session is session
 
+    @skip_on_exception(RemoteDataError)
     def test_yahoo_DataReader(self):
         start = datetime(2010, 1, 1)
         end = datetime(2015, 5, 9)
@@ -212,8 +218,14 @@ def test_yahoo_DataReader(self):
                                    0.47, 0.43571, 0.43571, 0.43571,
                                    0.43571, 0.37857, 0.37857, 0.37857]},
                            index=exp_idx)
+        exp.index.name = 'Date'
+
+        exp = exp.sort_index(axis=1)
+        result = result.sort_index(axis=1)
+
         tm.assert_frame_equal(result, exp)
 
+    @skip_on_exception(RemoteDataError)
     def test_yahoo_DataReader_multi(self):
         start =
datetime(2010, 1, 1)
         end = datetime(2015, 5, 9)
diff --git a/pandas_datareader/yahoo/actions.py b/pandas_datareader/yahoo/actions.py
index 9e8b33ce..5965971a 100644
--- a/pandas_datareader/yahoo/actions.py
+++ b/pandas_datareader/yahoo/actions.py
@@ -1,61 +1,61 @@
-import csv
-from pandas import to_datetime, DataFrame
+from pandas import (concat, DataFrame)
+from pandas_datareader.yahoo.daily import YahooDailyReader
 
-from pandas_datareader.base import _DailyBaseReader
-
-
-class YahooActionReader(_DailyBaseReader):
+class YahooActionReader(YahooDailyReader):
 
     """
     Returns DataFrame of historical corporate actions (dividends and stock
     splits) from symbols, over date range, start to end. All dates in the
     resulting DataFrame correspond with dividend and stock split ex-dates.
     """
+    def read(self):
+        """ Read dividends and splits and combine them into one DataFrame """
+        dividends = YahooDivReader(symbols=self.symbols,
+                                   start=self.start,
+                                   end=self.end,
+                                   retry_count=self.retry_count,
+                                   pause=self.pause,
+                                   session=self.session).read()
+        # Add a label column so we can combine our two DFs
+        if isinstance(dividends, DataFrame):
+            dividends["action"] = "DIVIDEND"
+            dividends = dividends.rename(columns={'Dividends': 'value'})
+
+        splits = YahooSplitReader(symbols=self.symbols,
+                                  start=self.start,
+                                  end=self.end,
+                                  retry_count=self.retry_count,
+                                  pause=self.pause,
+                                  session=self.session).read()
+        # Add a label column so we can combine our two DFs
+        if isinstance(splits, DataFrame):
+            splits["action"] = "SPLIT"
+            splits = splits.rename(columns={'Stock Splits': 'value'})
+            # Splits arrive in fractional form (e.g. "2/1"); store the
+            # reciprocal conversion ratio.  The text comes from a remote
+            # service, so it is parsed rather than handed to eval()
+            splits['value'] = splits['value'].apply(self._split_ratio)
+
+        output = concat([dividends, splits]).sort_index(ascending=False)
+
+        return output
+
+    @staticmethod
+    def _split_ratio(text):
+        """ Return the reciprocal of a split written as 'a/b', i.e. b / a """
+        numerator, _, denominator = str(text).partition('/')
+        return float(denominator or 1) / float(numerator)
+
+
+class YahooDivReader(YahooDailyReader):
+
+    @property
+    def service(self):
+        return 'div'
+
+
+class YahooSplitReader(YahooDailyReader):
 
     @property
-    def url(self):
-        return 'http://ichart.finance.yahoo.com/x'
-
-    def _get_params(self, symbols=None):
-        params = {
-            's': self.symbols,
-            'a': self.start.month - 1,
-            'b': self.start.day,
-            'c': self.start.year,
-            'd': self.end.month - 1,
-            'e': self.end.day,
-            'f': self.end.year,
-            'g': 'v'
-        }
-        return params
-
-    def _read_lines(self, out):
-        actions_index = []
-        actions_entries = []
-
-        for line in csv.reader(out.readlines()):
-            # Ignore lines that aren't dividends or splits (Yahoo
-            # add a bunch of irrelevant fields.)
-            if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'):
-                continue
-
-            action, date, value = line
-            if action == 'DIVIDEND':
-                actions_index.append(to_datetime(date))
-                actions_entries.append({
-                    'action': action,
-                    'value': float(value)
-                })
-            elif action == 'SPLIT' and ':' in value:
-                # Convert the split ratio to a fraction. For example a
-                # 4:1 split expressed as a fraction is 1/4 = 0.25.
-                denominator, numerator = value.split(':', 1)
-                split_fraction = float(numerator) / float(denominator)
-
-                actions_index.append(to_datetime(date))
-                actions_entries.append({
-                    'action': action,
-                    'value': split_fraction
-                })
-
-        return DataFrame(actions_entries, index=actions_index)
+    def service(self):
+        return 'split'
diff --git a/pandas_datareader/yahoo/daily.py b/pandas_datareader/yahoo/daily.py
index 9ee6cdcc..7133fc0f 100644
--- a/pandas_datareader/yahoo/daily.py
+++ b/pandas_datareader/yahoo/daily.py
@@ -1,4 +1,10 @@
-from pandas_datareader.base import _DailyBaseReader
+import re
+import time
+import warnings
+import numpy as np
+from pandas import Panel
+from pandas_datareader.base import (_DailyBaseReader, _in_chunks)
+from pandas_datareader._utils import (RemoteDataError, SymbolWarning)
 
 
 class YahooDailyReader(_DailyBaseReader):
@@ -39,36 +45,66 @@ class YahooDailyReader(_DailyBaseReader):
     """
 
     def __init__(self, symbols=None, start=None, end=None, retry_count=3,
-                 pause=0.001, session=None, adjust_price=False,
+                 pause=0.35, session=None, adjust_price=False,
                  ret_index=False, chunksize=25, interval='d'):
         super(YahooDailyReader, self).__init__(symbols=symbols,
                                                start=start, end=end,
                                                retry_count=retry_count,
                                                pause=pause, session=session,
                                                chunksize=chunksize)
+        # Ladder up the wait time between subsequent requests to improve
+        # probability of a successful retry
+        self.pause_multiplier = 2.5
+
+        self.headers = {
+            'Connection': 'keep-alive',
+            'Expires': str(-1),
+            'Upgrade-Insecure-Requests': str(1),
+            # Google Chrome:
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'  # noqa
+        }
+
         self.adjust_price = adjust_price
         self.ret_index = ret_index
-
-        if interval not in ['d', 'w', 'm', 'v']:
-            raise ValueError("Invalid interval: valid values are "
-                             "'d', 'w', 'm' and 'v'")
         self.interval = interval
 
+        if self.interval not in ['d', 'wk', 'mo', 'm', 'w']:
+            raise ValueError("Invalid interval: valid values are 'd', 'wk' and 'mo'. 'm' and 'w' have been implemented for "  # noqa
+                             "backward compatibility. 'v' has been moved to the yahoo-actions or yahoo-dividends APIs.")  # noqa
+        elif self.interval in ['m', 'mo']:
+            self.pdinterval = 'm'
+            self.interval = 'mo'
+        elif self.interval in ['w', 'wk']:
+            self.pdinterval = 'w'
+            self.interval = 'wk'
+
+        self.interval = '1' + self.interval
+        self.crumb = self._get_crumb(retry_count)
+
+    @property
+    def service(self):
+        return 'history'
+
     @property
     def url(self):
-        return 'http://ichart.finance.yahoo.com/table.csv'
+        return 'https://query1.finance.yahoo.com/v7/finance/download/{}'\
+            .format(self.symbols)
+
+    @staticmethod
+    def yurl(symbol):
+        return 'https://query1.finance.yahoo.com/v7/finance/download/{}'\
+            .format(symbol)
 
     def _get_params(self, symbol):
+        unix_start = int(time.mktime(self.start.timetuple()))
+        unix_end = int(time.mktime(self.end.timetuple()))
+
         params = {
-            's': symbol,
-            'a': self.start.month - 1,
-            'b': self.start.day,
-            'c': self.start.year,
-            'd': self.end.month - 1,
-            'e': self.end.day,
-            'f': self.end.year,
-            'g': self.interval,
-            'ignore': '.csv'
+            'period1': unix_start,
+            'period2': unix_end,
+            'interval': self.interval,
+            'events': self.service,
+            'crumb': self.crumb
         }
         return params
 
@@ -79,7 +115,66 @@ def read(self):
             df['Ret_Index'] = _calc_return_index(df['Adj Close'])
         if self.adjust_price:
             df = _adjust_prices(df)
-        return df
+        return df.sort_index()
+
+    def _dl_mult_symbols(self, symbols):
+        """ Download each symbol separately and combine them into a Panel.
+
+        Symbols that fail with IOError are warned about and filled with
+        NaN; RemoteDataError is raised when no symbol could be read.
+        """
+        stocks = {}
+        failed = []
+        passed = []
+        for sym_group in _in_chunks(symbols, self.chunksize):
+            for sym in sym_group:
+                try:
+                    stocks[sym] = self._read_one_data(self.yurl(sym),
+                                                      self._get_params(sym))
+                    passed.append(sym)
+                except IOError:
+                    msg = 'Failed to read symbol: {0!r}, replacing with NaN.'
+                    warnings.warn(msg.format(sym), SymbolWarning)
+                    failed.append(sym)
+
+        if not passed:
+            msg = "No data fetched using {0!r}"
+            raise RemoteDataError(msg.format(self.__class__.__name__))
+        try:
+            if failed:
+                df_na = stocks[passed[0]].copy()
+                df_na[:] = np.nan
+                for sym in failed:
+                    stocks[sym] = df_na
+            return Panel(stocks).swapaxes('items', 'minor')
+        except AttributeError:
+            # cannot construct a panel with just 1D nans indicating no data
+            msg = "No data fetched using {0!r}"
+            raise RemoteDataError(msg.format(self.__class__.__name__))
+
+    def _get_crumb(self, retries):
+        """ Scrape a Yahoo! history page for a crumb to send with requests.
+
+        ``retries`` is accepted for interface compatibility but unused;
+        retrying is handled by ``_get_response``.
+
+        Raises RemoteDataError if the page contains no crumb.
+        """
+        tu = "https://finance.yahoo.com/quote/{}/history".format(self.symbols)
+        response = self._get_response(tu,
+                                      params=self.params, headers=self.headers)
+        out = self._sanitize_response(response)
+        if isinstance(out, bytes):
+            # Decode rather than str(): on Python 3 str(bytes) yields the
+            # repr, which doubles the backslashes of \uXXXX escapes
+            out = out.decode('utf-8', 'replace')
+        # Matches: {"crumb":"AlphaNumeric"}
+        rpat = '"CrumbStore":{"crumb":"([^"]+)"}'
+
+        match = re.search(rpat, out)
+        if match is None:
+            raise RemoteDataError('Unable to find a crumb in {0}'.format(tu))
+        return match.group(1).encode('ascii').decode('unicode-escape')
 
 
 def _adjust_prices(hist_data, price_list=None):