diff --git a/pandas_datareader/commons.py b/pandas_datareader/commons.py
new file mode 100644
index 00000000..a3ab4a41
--- /dev/null
+++ b/pandas_datareader/commons.py
@@ -0,0 +1,116 @@
+import time
+import warnings
+import numpy as np
+import datetime as dt
+
+import pandas.compat as compat
+from pandas.core.datetools import to_datetime
+from pandas.core.common import PandasError
+from pandas import Panel, DataFrame
+from pandas.io.common import urlopen
+from pandas import read_csv
+from pandas.compat import StringIO, bytes_to_str
+from pandas.util.testing import _network_error_classes
+
+
+class SymbolWarning(UserWarning):
+    pass
+
+
+class RemoteDataError(PandasError, IOError):
+    pass
+
+
+def _sanitize_dates(start, end):
+    start = to_datetime(start)
+    end = to_datetime(end)
+    if start is None:
+        start = dt.datetime(2010, 1, 1)
+    if end is None:
+        end = dt.datetime.today()
+    return start, end
+
+
+def _in_chunks(seq, size):
+    """
+    Return a generator yielding consecutive chunks of length size from seq.
+    """
+    return (seq[pos:pos + size] for pos in range(0, len(seq), size))
+
+
+def _retry_read_url(url, retry_count, pause, name):
+    for _ in range(retry_count):
+        time.sleep(pause)
+
+        # kludge to close the socket ASAP
+        try:
+            with urlopen(url) as resp:
+                lines = resp.read()
+        except _network_error_classes:
+            pass
+        else:
+            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
+                          parse_dates=True, na_values='-')[::-1]
+            # Yahoo! Finance sometimes returns two rows for the most
+            # recent business day; drop the duplicate.
+            if len(rs) > 2 and rs.index[-1] == rs.index[-2]:  # pragma: no cover
+                rs = rs[:-1]
+
+            # Get rid of unicode characters in the index name.
+            try:
+                rs.index.name = rs.index.name.decode('unicode_escape').encode('ascii', 'ignore')
+            except AttributeError:
+                # Python 3 str has no decode method.
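+                # The encode('ascii', 'ignore')/decode() round-trip below
+                # strips any non-ASCII characters from the index name.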
+ rs.index.name = rs.index.name.encode('ascii', 'ignore').decode() + + return rs + + raise IOError("after %d tries, %s did not " + "return a 200 for url %r" % (retry_count, name, url)) + + +def _dl_mult_symbols(symbols, start, end, interval, chunksize, retry_count, pause, + method): + stocks = {} + failed = [] + passed = [] + for sym_group in _in_chunks(symbols, chunksize): + for sym in sym_group: + try: + stocks[sym] = method(sym, start, end, interval, retry_count, pause) + passed.append(sym) + except IOError: + warnings.warn('Failed to read symbol: {0!r}, replacing with ' + 'NaN.'.format(sym), SymbolWarning) + failed.append(sym) + + if len(passed) == 0: + raise RemoteDataError("No data fetched using " + "{0!r}".format(method.__name__)) + try: + if len(stocks) > 0 and len(failed) > 0 and len(passed) > 0: + df_na = stocks[passed[0]].copy() + df_na[:] = np.nan + for sym in failed: + stocks[sym] = df_na + return Panel(stocks).swapaxes('items', 'minor') + except AttributeError: + # cannot construct a panel with just 1D nans indicating no data + raise RemoteDataError("No data fetched using " + "{0!r}".format(method.__name__)) + +def _get_data_from(symbols, start, end, interval, retry_count, pause, + chunksize, src_fn): + + # If a single symbol, (e.g., 'GOOG') + if isinstance(symbols, (compat.string_types, int)): + hist_data = src_fn(symbols, start, end, interval, retry_count, pause) + # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT']) + elif isinstance(symbols, DataFrame): + hist_data = _dl_mult_symbols(symbols.index, start, end, interval, chunksize, + retry_count, pause, src_fn) + else: + hist_data = _dl_mult_symbols(symbols, start, end, interval, chunksize, + retry_count, pause, src_fn) + + return hist_data diff --git a/pandas_datareader/data.py b/pandas_datareader/data.py index 1f322d41..090f879e 100644 --- a/pandas_datareader/data.py +++ b/pandas_datareader/data.py @@ -18,19 +18,26 @@ ) import pandas.compat as compat from pandas import Panel, DataFrame, Series, read_csv, concat, to_datetime, DatetimeIndex, DateOffset -from pandas.core.common import is_list_like, PandasError +from pandas.core.common import is_list_like from pandas.io.common import urlopen, ZipFile, urlencode from pandas.tseries.offsets import MonthEnd from pandas.util.testing import _network_error_classes from pandas.io.html import read_html -class SymbolWarning(UserWarning): - pass +from pandas_datareader.commons import _in_chunks, _sanitize_dates, \ + _retry_read_url, _dl_mult_symbols, _get_data_from +from pandas_datareader.google.hist import get_data_google +from pandas_datareader.google.quotes import get_quote_google -class RemoteDataError(PandasError, IOError): - pass +from pandas_datareader.yahoo.actions import get_data_yahoo_actions +from pandas_datareader.yahoo.components import get_components_yahoo +from pandas_datareader.yahoo.daily import get_data_yahoo +#from pandas_datareader.yahoo.options import Options as YahooOptions +from pandas_datareader.yahoo.quotes import get_quote_yahoo +from pandas_datareader.famafrench import get_data_famafrench +from pandas_datareader.fred import get_data_fred def DataReader(name, data_source=None, start=None, end=None, retry_count=3, pause=0.001): @@ -84,7 +91,7 @@ def DataReader(name, data_source=None, start=None, end=None, retry_count=retry_count, pause=pause) elif data_source == "google": return get_data_google(symbols=name, start=start, end=end, - adjust_price=False, chunksize=25, + chunksize=25, retry_count=retry_count, pause=pause) elif data_source == "fred": return 
get_data_fred(name, start, end) @@ -94,533 +101,6 @@ def DataReader(name, data_source=None, start=None, end=None, raise NotImplementedError( "data_source=%r is not implemented" % data_source) -def _sanitize_dates(start, end): - from pandas.core.datetools import to_datetime - start = to_datetime(start) - end = to_datetime(end) - if start is None: - start = dt.datetime(2010, 1, 1) - if end is None: - end = dt.datetime.today() - return start, end - - -def _in_chunks(seq, size): - """ - Return sequence in 'chunks' of size defined by size - """ - return (seq[pos:pos + size] for pos in range(0, len(seq), size)) - - -_yahoo_codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', - 'time': 't1', 'short_ratio': 's7'} - - -_YAHOO_QUOTE_URL = 'http://finance.yahoo.com/d/quotes.csv?' - - -def get_quote_yahoo(symbols): - """ - Get current yahoo quote - - Returns a DataFrame - """ - if isinstance(symbols, compat.string_types): - sym_list = symbols - else: - sym_list = '+'.join(symbols) - - # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm - request = ''.join(compat.itervalues(_yahoo_codes)) # code request string - header = list(_yahoo_codes.keys()) - - data = defaultdict(list) - - url_str = _YAHOO_QUOTE_URL + 's=%s&f=%s' % (sym_list, request) - - with urlopen(url_str) as url: - lines = url.readlines() - - for line in lines: - fields = line.decode('utf-8').strip().split(',') - for i, field in enumerate(fields): - if field[-2:] == '%"': - v = float(field.strip('"%')) - elif field[0] == '"': - v = field.strip('"') - else: - try: - v = float(field) - except ValueError: - v = field - data[header[i]].append(v) - - idx = data.pop('symbol') - return DataFrame(data, index=idx) - - -def get_quote_google(symbols): - raise NotImplementedError("Google Finance doesn't have this functionality") - - -def _retry_read_url(url, retry_count, pause, name): - for _ in range(retry_count): - time.sleep(pause) - - # kludge to close the socket ASAP - try: - with urlopen(url) as resp: - lines = resp.read() - except _network_error_classes: - pass - else: - rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0, - parse_dates=True, na_values='-')[::-1] - # Yahoo! Finance sometimes does this awesome thing where they - # return 2 rows for the most recent business day - if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover - rs = rs[:-1] - - #Get rid of unicode characters in index name. - try: - rs.index.name = rs.index.name.decode('unicode_escape').encode('ascii', 'ignore') - except AttributeError: - #Python 3 string has no decode method. - rs.index.name = rs.index.name.encode('ascii', 'ignore').decode() - - return rs - - raise IOError("after %d tries, %s did not " - "return a 200 for url %r" % (retry_count, name, url)) - - -_HISTORICAL_YAHOO_URL = 'http://ichart.finance.yahoo.com/table.csv?' - - -def _get_hist_yahoo(sym, start, end, interval, retry_count, pause): - """ - Get historical data for the given name from yahoo. - Date format is datetime - - Returns a DataFrame. - """ - start, end = _sanitize_dates(start, end) - url = (_HISTORICAL_YAHOO_URL + 's=%s' % sym + - '&a=%s' % (start.month - 1) + - '&b=%s' % start.day + - '&c=%s' % start.year + - '&d=%s' % (end.month - 1) + - '&e=%s' % end.day + - '&f=%s' % end.year + - '&g=%s' % interval + - '&ignore=.csv') - return _retry_read_url(url, retry_count, pause, 'Yahoo!') - - -_HISTORICAL_GOOGLE_URL = 'http://www.google.com/finance/historical?' 
- - -def _get_hist_google(sym, start, end, interval, retry_count, pause): - """ - Get historical data for the given name from google. - Date format is datetime - - Returns a DataFrame. - """ - start, end = _sanitize_dates(start, end) - - # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv - url = "%s%s" % (_HISTORICAL_GOOGLE_URL, - urlencode({"q": sym, - "startdate": start.strftime('%b %d, ' '%Y'), - "enddate": end.strftime('%b %d, %Y'), - "output": "csv"})) - return _retry_read_url(url, retry_count, pause, 'Google') - - -def _adjust_prices(hist_data, price_list=None): - """ - Return modifed DataFrame or Panel with adjusted prices based on - 'Adj Close' price. Adds 'Adj_Ratio' column. - """ - if price_list is None: - price_list = 'Open', 'High', 'Low', 'Close' - adj_ratio = hist_data['Adj Close'] / hist_data['Close'] - - data = hist_data.copy() - for item in price_list: - data[item] = hist_data[item] * adj_ratio - data['Adj_Ratio'] = adj_ratio - del data['Adj Close'] - return data - - -def _calc_return_index(price_df): - """ - Return a returns index from a input price df or series. Initial value - (typically NaN) is set to 1. - """ - df = price_df.pct_change().add(1).cumprod() - mask = df.ix[1].notnull() & df.ix[0].isnull() - df.ix[0][mask] = 1 - - # Check for first stock listings after starting date of index in ret_index - # If True, find first_valid_index and set previous entry to 1. - if (~mask).any(): - for sym in mask.index[~mask]: - tstamp = df[sym].first_valid_index() - t_idx = df.index.get_loc(tstamp) - 1 - df[sym].ix[t_idx] = 1 - - return df - - -_YAHOO_COMPONENTS_URL = 'http://download.finance.yahoo.com/d/quotes.csv?' - - -def get_components_yahoo(idx_sym): - """ - Returns DataFrame containing list of component information for - index represented in idx_sym from yahoo. Includes component symbol - (ticker), exchange, and name. 
- - Parameters - ---------- - idx_sym : str - Stock index symbol - Examples: - '^DJI' (Dow Jones Industrial Average) - '^NYA' (NYSE Composite) - '^IXIC' (NASDAQ Composite) - - See: http://finance.yahoo.com/indices for other index symbols - - Returns - ------- - idx_df : DataFrame - """ - stats = 'snx' - # URL of form: - # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv - url = _YAHOO_COMPONENTS_URL + 's={0}&f={1}&e=.csv&h={2}' - - idx_mod = idx_sym.replace('^', '@%5E') - url_str = url.format(idx_mod, stats, 1) - - idx_df = DataFrame() - mask = [True] - comp_idx = 1 - - # LOOP across component index structure, - # break when no new components are found - while True in mask: - url_str = url.format(idx_mod, stats, comp_idx) - with urlopen(url_str) as resp: - raw = resp.read() - lines = raw.decode('utf-8').strip().strip('"').split('"\r\n"') - lines = [line.strip().split('","') for line in lines] - - temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange']) - temp_df = temp_df.drop_duplicates() - temp_df = temp_df.set_index('ticker') - mask = ~temp_df.index.isin(idx_df.index) - - comp_idx = comp_idx + 50 - idx_df = idx_df.append(temp_df[mask]) - - return idx_df - - -def _dl_mult_symbols(symbols, start, end, interval, chunksize, retry_count, pause, - method): - stocks = {} - failed = [] - passed = [] - for sym_group in _in_chunks(symbols, chunksize): - for sym in sym_group: - try: - stocks[sym] = method(sym, start, end, interval, retry_count, pause) - passed.append(sym) - except IOError: - warnings.warn('Failed to read symbol: {0!r}, replacing with ' - 'NaN.'.format(sym), SymbolWarning) - failed.append(sym) - - if len(passed) == 0: - raise RemoteDataError("No data fetched using " - "{0!r}".format(method.__name__)) - try: - if len(stocks) > 0 and len(failed) > 0 and len(passed) > 0: - df_na = stocks[passed[0]].copy() - df_na[:] = np.nan - for sym in failed: - stocks[sym] = df_na - return Panel(stocks).swapaxes('items', 'minor') - except AttributeError: - # cannot construct a panel with just 1D nans indicating no data - raise RemoteDataError("No data fetched using " - "{0!r}".format(method.__name__)) - -_source_functions = {'google': _get_hist_google, 'yahoo': _get_hist_yahoo} - - -def _get_data_from(symbols, start, end, interval, retry_count, pause, adjust_price, - ret_index, chunksize, source): - - src_fn = _source_functions[source] - - # If a single symbol, (e.g., 'GOOG') - if isinstance(symbols, (compat.string_types, int)): - hist_data = src_fn(symbols, start, end, interval, retry_count, pause) - # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT']) - elif isinstance(symbols, DataFrame): - hist_data = _dl_mult_symbols(symbols.index, start, end, interval, chunksize, - retry_count, pause, src_fn) - else: - hist_data = _dl_mult_symbols(symbols, start, end, interval, chunksize, - retry_count, pause, src_fn) - if source.lower() == 'yahoo': - if ret_index: - hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) - if adjust_price: - hist_data = _adjust_prices(hist_data) - - return hist_data - - -def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, - pause=0.001, adjust_price=False, ret_index=False, - chunksize=25, interval='d'): - """ - Returns DataFrame/Panel of historical stock prices from symbols, over date - range, start to end. To avoid being penalized by Yahoo! Finance servers, - pauses between downloading 'chunks' of symbols can be specified. 
- - Parameters - ---------- - symbols : string, array-like object (list, tuple, Series), or DataFrame - Single stock symbol (ticker), array-like object of symbols or - DataFrame with index containing stock symbols. - start : string, (defaults to '1/1/2010') - Starting date, timestamp. Parses many different kind of date - representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') - end : string, (defaults to today) - Ending date, timestamp. Same format as starting date. - retry_count : int, default 3 - Number of times to retry query request. - pause : int, default 0 - Time, in seconds, to pause between consecutive queries of chunks. If - single value given for symbol, represents the pause between retries. - adjust_price : bool, default False - If True, adjusts all prices in hist_data ('Open', 'High', 'Low', - 'Close') based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops - 'Adj Close'. - ret_index : bool, default False - If True, includes a simple return index 'Ret_Index' in hist_data. - chunksize : int, default 25 - Number of symbols to download consecutively before intiating pause. - interval : string, default 'd' - Time interval code, valid values are 'd' for daily, 'w' for weekly, - 'm' for monthly and 'v' for dividend. - - Returns - ------- - hist_data : DataFrame (str) or Panel (array-like object, DataFrame) - """ - if interval not in ['d', 'w', 'm', 'v']: - raise ValueError("Invalid interval: valid values are 'd', 'w', 'm' and 'v'") - return _get_data_from(symbols, start, end, interval, retry_count, pause, - adjust_price, ret_index, chunksize, 'yahoo') - -_HISTORICAL_YAHOO_ACTIONS_URL = 'http://ichart.finance.yahoo.com/x?' - -def get_data_yahoo_actions(symbol, start=None, end=None, retry_count=3, - pause=0.001): - """ - Returns DataFrame of historical corporate actions (dividends and stock - splits) from symbols, over date range, start to end. All dates in the - resulting DataFrame correspond with dividend and stock split ex-dates. - - Parameters - ---------- - sym : string with a single Single stock symbol (ticker). - start : string, (defaults to '1/1/2010') - Starting date, timestamp. Parses many different kind of date - representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') - end : string, (defaults to today) - Ending date, timestamp. Same format as starting date. - retry_count : int, default 3 - Number of times to retry query request. - pause : int, default 0 - Time, in seconds, of the pause between retries. - """ - - start, end = _sanitize_dates(start, end) - url = (_HISTORICAL_YAHOO_ACTIONS_URL + 's=%s' % symbol + - '&a=%s' % (start.month - 1) + - '&b=%s' % start.day + - '&c=%s' % start.year + - '&d=%s' % (end.month - 1) + - '&e=%s' % end.day + - '&f=%s' % end.year + - '&g=v') - - for _ in range(retry_count): - time.sleep(pause) - - try: - with urlopen(url) as resp: - lines = resp.read() - except _network_error_classes: - pass - else: - actions_index = [] - actions_entries = [] - - for line in csv.reader(StringIO(bytes_to_str(lines))): - # Ignore lines that aren't dividends or splits (Yahoo - # add a bunch of irrelevant fields.) - if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'): - continue - - action, date, value = line - if action == 'DIVIDEND': - actions_index.append(to_datetime(date)) - actions_entries.append({ - 'action': action, - 'value': float(value) - }) - elif action == 'SPLIT' and ':' in value: - # Convert the split ratio to a fraction. For example a - # 4:1 split expressed as a fraction is 1/4 = 0.25. 
- denominator, numerator = value.split(':', 1) - split_fraction = float(numerator) / float(denominator) - - actions_index.append(to_datetime(date)) - actions_entries.append({ - 'action': action, - 'value': split_fraction - }) - - return DataFrame(actions_entries, index=actions_index) - - raise IOError("after %d tries, Yahoo! did not " - "return a 200 for url %r" % (retry_count, url)) - - -def get_data_google(symbols=None, start=None, end=None, retry_count=3, - pause=0.001, adjust_price=False, ret_index=False, - chunksize=25): - """ - Returns DataFrame/Panel of historical stock prices from symbols, over date - range, start to end. To avoid being penalized by Google Finance servers, - pauses between downloading 'chunks' of symbols can be specified. - - Parameters - ---------- - symbols : string, array-like object (list, tuple, Series), or DataFrame - Single stock symbol (ticker), array-like object of symbols or - DataFrame with index containing stock symbols. - start : string, (defaults to '1/1/2010') - Starting date, timestamp. Parses many different kind of date - representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') - end : string, (defaults to today) - Ending date, timestamp. Same format as starting date. - retry_count : int, default 3 - Number of times to retry query request. - pause : int, default 0 - Time, in seconds, to pause between consecutive queries of chunks. If - single value given for symbol, represents the pause between retries. - chunksize : int, default 25 - Number of symbols to download consecutively before intiating pause. - - Returns - ------- - hist_data : DataFrame (str) or Panel (array-like object, DataFrame) - """ - return _get_data_from(symbols, start, end, None, retry_count, pause, - adjust_price, ret_index, chunksize, 'google') - - -_FRED_URL = "http://research.stlouisfed.org/fred2/series/" - - -def get_data_fred(name, start=dt.datetime(2010, 1, 1), - end=dt.datetime.today()): - """ - Get data for the given name from the St. Louis FED (FRED). - Date format is datetime - - Returns a DataFrame. - - If multiple names are passed for "series" then the index of the - DataFrame is the outer join of the indicies of each series. - """ - start, end = _sanitize_dates(start, end) - - if not is_list_like(name): - names = [name] - else: - names = name - - urls = [_FRED_URL + '%s' % n + '/downloaddata/%s' % n + '.csv' for - n in names] - - def fetch_data(url, name): - with urlopen(url) as resp: - data = read_csv(resp, index_col=0, parse_dates=True, - header=None, skiprows=1, names=["DATE", name], - na_values='.') - try: - return data.truncate(start, end) - except KeyError: - if data.ix[3].name[7:12] == 'Error': - raise IOError("Failed to get the data. 
Check that {0!r} is " - "a valid FRED series.".format(name)) - raise - df = concat([fetch_data(url, n) for url, n in zip(urls, names)], - axis=1, join='outer') - return df - - -_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp' - - -def get_data_famafrench(name): - # path of zip files - zip_file_path = '{0}/{1}_TXT.zip'.format(_FAMAFRENCH_URL, name) - - with urlopen(zip_file_path) as url: - raw = url.read() - - with tempfile.TemporaryFile() as tmpf: - tmpf.write(raw) - - with ZipFile(tmpf, 'r') as zf: - data = zf.open(zf.namelist()[0]).readlines() - - line_lengths = np.array(lmap(len, data)) - file_edges = np.where(line_lengths == 2)[0] - - datasets = {} - edges = zip(file_edges + 1, file_edges[1:]) - for i, (left_edge, right_edge) in enumerate(edges): - dataset = [d.split() for d in data[left_edge:right_edge]] - if len(dataset) > 10: - ncol_raw = np.array(lmap(len, dataset)) - ncol = np.median(ncol_raw) - header_index = np.where(ncol_raw == ncol - 1)[0][-1] - header = dataset[header_index] - ds_header = dataset[header_index + 1:] - # to ensure the header is unique - header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header, - start=1)] - index = np.array([d[0] for d in ds_header], dtype=int) - dataset = np.array([d[1:] for d in ds_header], dtype=float) - datasets[i] = DataFrame(dataset, index, columns=header) - - return datasets - # Items needed for options class CUR_MONTH = dt.datetime.now().month diff --git a/pandas_datareader/famafrench.py b/pandas_datareader/famafrench.py new file mode 100644 index 00000000..b360124f --- /dev/null +++ b/pandas_datareader/famafrench.py @@ -0,0 +1,44 @@ +import tempfile +import numpy as np +from pandas.io.common import urlopen, ZipFile +from pandas.compat import lmap +from pandas import DataFrame + + +_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp' + + +def get_data_famafrench(name): + # path of zip files + zip_file_path = '{0}/{1}_TXT.zip'.format(_FAMAFRENCH_URL, name) + + with urlopen(zip_file_path) as url: + raw = url.read() + + with tempfile.TemporaryFile() as tmpf: + tmpf.write(raw) + + with ZipFile(tmpf, 'r') as zf: + data = zf.open(zf.namelist()[0]).readlines() + + line_lengths = np.array(lmap(len, data)) + file_edges = np.where(line_lengths == 2)[0] + + datasets = {} + edges = zip(file_edges + 1, file_edges[1:]) + for i, (left_edge, right_edge) in enumerate(edges): + dataset = [d.split() for d in data[left_edge:right_edge]] + if len(dataset) > 10: + ncol_raw = np.array(lmap(len, dataset)) + ncol = np.median(ncol_raw) + header_index = np.where(ncol_raw == ncol - 1)[0][-1] + header = dataset[header_index] + ds_header = dataset[header_index + 1:] + # to ensure the header is unique + header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header, + start=1)] + index = np.array([d[0] for d in ds_header], dtype=int) + dataset = np.array([d[1:] for d in ds_header], dtype=float) + datasets[i] = DataFrame(dataset, index, columns=header) + + return datasets diff --git a/pandas_datareader/fred.py b/pandas_datareader/fred.py new file mode 100644 index 00000000..0d055398 --- /dev/null +++ b/pandas_datareader/fred.py @@ -0,0 +1,47 @@ +import datetime as dt +from pandas.core.common import is_list_like +from pandas.io.common import urlopen +from pandas import concat, read_csv + +from pandas_datareader.commons import _sanitize_dates + + +_FRED_URL = "http://research.stlouisfed.org/fred2/series/" + + +def get_data_fred(name, start=dt.datetime(2010, 1, 1), + end=dt.datetime.today()): + """ + Get 
data for the given name from the St. Louis FED (FRED).
+    Date format is datetime
+
+    Returns a DataFrame.
+
+    If multiple names are passed for "series" then the index of the
+    DataFrame is the outer join of the indices of each series.
+    """
+    start, end = _sanitize_dates(start, end)
+
+    if not is_list_like(name):
+        names = [name]
+    else:
+        names = name
+
+    urls = [_FRED_URL + '%s' % n + '/downloaddata/%s' % n + '.csv' for
+            n in names]
+
+    def fetch_data(url, name):
+        with urlopen(url) as resp:
+            data = read_csv(resp, index_col=0, parse_dates=True,
+                            header=None, skiprows=1, names=["DATE", name],
+                            na_values='.')
+        try:
+            return data.truncate(start, end)
+        except KeyError:
+            if data.ix[3].name[7:12] == 'Error':
+                raise IOError("Failed to get the data. Check that {0!r} is "
+                              "a valid FRED series.".format(name))
+            raise
+    df = concat([fetch_data(url, n) for url, n in zip(urls, names)],
+                axis=1, join='outer')
+    return df
diff --git a/pandas_datareader/google/__init__.py b/pandas_datareader/google/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pandas_datareader/google/hist.py b/pandas_datareader/google/hist.py
new file mode 100644
index 00000000..0cccb905
--- /dev/null
+++ b/pandas_datareader/google/hist.py
@@ -0,0 +1,58 @@
+from pandas.io.common import urlencode
+from pandas_datareader.commons import _retry_read_url
+from pandas_datareader.commons import _sanitize_dates
+from pandas_datareader.commons import _get_data_from
+
+
+_HISTORICAL_GOOGLE_URL = 'http://www.google.com/finance/historical?'
+
+
+def get_data_google(symbols=None, start=None, end=None, retry_count=3,
+                    pause=0.001, chunksize=25):
+    """
+    Returns DataFrame/Panel of historical stock prices from symbols, over date
+    range, start to end. To avoid being penalized by Google Finance servers,
+    pauses between downloading 'chunks' of symbols can be specified.
+
+    Parameters
+    ----------
+    symbols : string, array-like object (list, tuple, Series), or DataFrame
+        Single stock symbol (ticker), array-like object of symbols or
+        DataFrame with index containing stock symbols.
+    start : string, (defaults to '1/1/2010')
+        Starting date, timestamp. Parses many different kinds of date
+        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
+    end : string, (defaults to today)
+        Ending date, timestamp. Same format as starting date.
+    retry_count : int, default 3
+        Number of times to retry query request.
+    pause : float, default 0.001
+        Time, in seconds, to pause between consecutive queries of chunks. If
+        a single symbol is given, this is the pause between retries.
+    chunksize : int, default 25
+        Number of symbols to download consecutively before initiating pause.
+
+    Returns
+    -------
+    hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
+    """
+    return _get_data_from(symbols, start, end, None, retry_count, pause,
+                          chunksize, _get_hist_google)
+
+
+def _get_hist_google(sym, start, end, interval, retry_count, pause):
+    """
+    Get historical data for the given name from Google.
+    Date format is datetime
+
+    Returns a DataFrame.
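+
+    Note: the interval argument is accepted only for call-signature
+    compatibility with the Yahoo! fetcher; get_data_google passes None
+    for it and the value is never sent to Google.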
+    """
+    start, end = _sanitize_dates(start, end)
+
+    # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv
+    url = "%s%s" % (_HISTORICAL_GOOGLE_URL,
+                    urlencode({"q": sym,
+                               "startdate": start.strftime('%b %d, %Y'),
+                               "enddate": end.strftime('%b %d, %Y'),
+                               "output": "csv"}))
+    return _retry_read_url(url, retry_count, pause, 'Google')
diff --git a/pandas_datareader/google/quotes.py b/pandas_datareader/google/quotes.py
new file mode 100644
index 00000000..5258f793
--- /dev/null
+++ b/pandas_datareader/google/quotes.py
@@ -0,0 +1,2 @@
+def get_quote_google(symbols):
+    raise NotImplementedError("Google Finance doesn't have this functionality")
diff --git a/pandas_datareader/tests/test_data.py b/pandas_datareader/tests/test_data.py
index cbf2a365..8eb7996a 100644
--- a/pandas_datareader/tests/test_data.py
+++ b/pandas_datareader/tests/test_data.py
@@ -24,8 +24,9 @@
 )
 from urllib2 import HTTPError
 import pandas_datareader.data as web
-from pandas_datareader.data import (
-    DataReader, SymbolWarning, RemoteDataError, _yahoo_codes)
+from pandas_datareader.data import DataReader
+from pandas_datareader.commons import SymbolWarning, RemoteDataError
+from pandas_datareader.yahoo.quotes import _yahoo_codes
 
 def _skip_if_no_lxml():
     try:
diff --git a/pandas_datareader/yahoo/__init__.py b/pandas_datareader/yahoo/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pandas_datareader/yahoo/actions.py b/pandas_datareader/yahoo/actions.py
new file mode 100644
index 00000000..69e88c58
--- /dev/null
+++ b/pandas_datareader/yahoo/actions.py
@@ -0,0 +1,84 @@
+import time
+import csv
+from pandas import to_datetime, DataFrame
+from pandas.io.common import urlopen
+from pandas.util.testing import _network_error_classes
+from pandas.compat import StringIO, bytes_to_str
+from pandas_datareader.commons import _sanitize_dates
+
+
+_HISTORICAL_YAHOO_ACTIONS_URL = 'http://ichart.finance.yahoo.com/x?'
+
+
+def get_data_yahoo_actions(symbol, start=None, end=None, retry_count=3,
+                           pause=0.001):
+    """
+    Returns DataFrame of historical corporate actions (dividends and stock
+    splits) from symbols, over date range, start to end. All dates in the
+    resulting DataFrame correspond with dividend and stock split ex-dates.
+
+    Parameters
+    ----------
+    symbol : string with a single stock symbol (ticker).
+    start : string, (defaults to '1/1/2010')
+        Starting date, timestamp. Parses many different kinds of date
+        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
+    end : string, (defaults to today)
+        Ending date, timestamp. Same format as starting date.
+    retry_count : int, default 3
+        Number of times to retry query request.
+    pause : float, default 0.001
+        Time, in seconds, of the pause between retries.
+    """
+
+    start, end = _sanitize_dates(start, end)
+    url = (_HISTORICAL_YAHOO_ACTIONS_URL + 's=%s' % symbol +
+           '&a=%s' % (start.month - 1) +
+           '&b=%s' % start.day +
+           '&c=%s' % start.year +
+           '&d=%s' % (end.month - 1) +
+           '&e=%s' % end.day +
+           '&f=%s' % end.year +
+           '&g=v')
+
+    for _ in range(retry_count):
+        time.sleep(pause)
+
+        try:
+            with urlopen(url) as resp:
+                lines = resp.read()
+        except _network_error_classes:
+            pass
+        else:
+            actions_index = []
+            actions_entries = []
+
+            for line in csv.reader(StringIO(bytes_to_str(lines))):
+                # Ignore lines that aren't dividends or splits (Yahoo
+                # adds a bunch of irrelevant fields).
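+                # A valid action row has exactly three fields:
+                # the action type, its ex-date and its value.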
+ if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'): + continue + + action, date, value = line + if action == 'DIVIDEND': + actions_index.append(to_datetime(date)) + actions_entries.append({ + 'action': action, + 'value': float(value) + }) + elif action == 'SPLIT' and ':' in value: + # Convert the split ratio to a fraction. For example a + # 4:1 split expressed as a fraction is 1/4 = 0.25. + denominator, numerator = value.split(':', 1) + split_fraction = float(numerator) / float(denominator) + + actions_index.append(to_datetime(date)) + actions_entries.append({ + 'action': action, + 'value': split_fraction + }) + + return DataFrame(actions_entries, index=actions_index) + + raise IOError("after %d tries, Yahoo! did not " + "return a 200 for url %r" % (retry_count, url)) diff --git a/pandas_datareader/yahoo/components.py b/pandas_datareader/yahoo/components.py new file mode 100644 index 00000000..79019caf --- /dev/null +++ b/pandas_datareader/yahoo/components.py @@ -0,0 +1,58 @@ +from pandas import DataFrame +from pandas.io.common import urlopen + + +_YAHOO_COMPONENTS_URL = 'http://download.finance.yahoo.com/d/quotes.csv?' + + +def get_components_yahoo(idx_sym): + """ + Returns DataFrame containing list of component information for + index represented in idx_sym from yahoo. Includes component symbol + (ticker), exchange, and name. + + Parameters + ---------- + idx_sym : str + Stock index symbol + Examples: + '^DJI' (Dow Jones Industrial Average) + '^NYA' (NYSE Composite) + '^IXIC' (NASDAQ Composite) + + See: http://finance.yahoo.com/indices for other index symbols + + Returns + ------- + idx_df : DataFrame + """ + stats = 'snx' + # URL of form: + # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv + url = _YAHOO_COMPONENTS_URL + 's={0}&f={1}&e=.csv&h={2}' + + idx_mod = idx_sym.replace('^', '@%5E') + url_str = url.format(idx_mod, stats, 1) + + idx_df = DataFrame() + mask = [True] + comp_idx = 1 + + # LOOP across component index structure, + # break when no new components are found + while True in mask: + url_str = url.format(idx_mod, stats, comp_idx) + with urlopen(url_str) as resp: + raw = resp.read() + lines = raw.decode('utf-8').strip().strip('"').split('"\r\n"') + lines = [line.strip().split('","') for line in lines] + + temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange']) + temp_df = temp_df.drop_duplicates() + temp_df = temp_df.set_index('ticker') + mask = ~temp_df.index.isin(idx_df.index) + + comp_idx = comp_idx + 50 + idx_df = idx_df.append(temp_df[mask]) + + return idx_df diff --git a/pandas_datareader/yahoo/daily.py b/pandas_datareader/yahoo/daily.py new file mode 100644 index 00000000..d6a016ce --- /dev/null +++ b/pandas_datareader/yahoo/daily.py @@ -0,0 +1,110 @@ +from pandas_datareader.commons import _retry_read_url, _sanitize_dates, _get_data_from + +_HISTORICAL_YAHOO_URL = 'http://ichart.finance.yahoo.com/table.csv?' + + +def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, + pause=0.001, adjust_price=False, ret_index=False, + chunksize=25, interval='d'): + """ + Returns DataFrame/Panel of historical stock prices from symbols, over date + range, start to end. To avoid being penalized by Yahoo! Finance servers, + pauses between downloading 'chunks' of symbols can be specified. + + Parameters + ---------- + symbols : string, array-like object (list, tuple, Series), or DataFrame + Single stock symbol (ticker), array-like object of symbols or + DataFrame with index containing stock symbols. 
+    start : string, (defaults to '1/1/2010')
+        Starting date, timestamp. Parses many different kinds of date
+        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
+    end : string, (defaults to today)
+        Ending date, timestamp. Same format as starting date.
+    retry_count : int, default 3
+        Number of times to retry query request.
+    pause : float, default 0.001
+        Time, in seconds, to pause between consecutive queries of chunks. If
+        a single symbol is given, this is the pause between retries.
+    adjust_price : bool, default False
+        If True, adjusts all prices in hist_data ('Open', 'High', 'Low',
+        'Close') based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops
+        'Adj Close'.
+    ret_index : bool, default False
+        If True, includes a simple return index 'Ret_Index' in hist_data.
+    chunksize : int, default 25
+        Number of symbols to download consecutively before initiating pause.
+    interval : string, default 'd'
+        Time interval code, valid values are 'd' for daily, 'w' for weekly,
+        'm' for monthly and 'v' for dividend.
+
+    Returns
+    -------
+    hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
+    """
+    if interval not in ['d', 'w', 'm', 'v']:
+        raise ValueError("Invalid interval: valid values are 'd', 'w', 'm' and 'v'")
+    hist_data = _get_data_from(symbols, start, end, interval, retry_count, pause,
+                               chunksize, _get_hist_yahoo)
+    if ret_index:
+        hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close'])
+    if adjust_price:
+        hist_data = _adjust_prices(hist_data)
+    return hist_data
+
+
+def _get_hist_yahoo(sym, start, end, interval, retry_count, pause):
+    """
+    Get historical data for the given name from Yahoo!.
+    Date format is datetime
+
+    Returns a DataFrame.
+    """
+    start, end = _sanitize_dates(start, end)
+    url = (_HISTORICAL_YAHOO_URL + 's=%s' % sym +
+           '&a=%s' % (start.month - 1) +
+           '&b=%s' % start.day +
+           '&c=%s' % start.year +
+           '&d=%s' % (end.month - 1) +
+           '&e=%s' % end.day +
+           '&f=%s' % end.year +
+           '&g=%s' % interval +
+           '&ignore=.csv')
+    return _retry_read_url(url, retry_count, pause, 'Yahoo!')
+
+
+def _adjust_prices(hist_data, price_list=None):
+    """
+    Return modified DataFrame or Panel with adjusted prices based on
+    'Adj Close' price. Adds 'Adj_Ratio' column.
+    """
+    if price_list is None:
+        price_list = 'Open', 'High', 'Low', 'Close'
+    adj_ratio = hist_data['Adj Close'] / hist_data['Close']
+
+    data = hist_data.copy()
+    for item in price_list:
+        data[item] = hist_data[item] * adj_ratio
+    data['Adj_Ratio'] = adj_ratio
+    del data['Adj Close']
+    return data
+
+
+def _calc_return_index(price_df):
+    """
+    Return a returns index from an input price DataFrame or Series. The
+    initial value (typically NaN) is set to 1.
+    """
+    df = price_df.pct_change().add(1).cumprod()
+    mask = df.ix[1].notnull() & df.ix[0].isnull()
+    df.ix[0][mask] = 1
+
+    # Check for first stock listings after starting date of index in ret_index
+    # If True, find first_valid_index and set previous entry to 1.
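+    # (Setting the entry just before the first valid price to 1 makes the
+    # cumulative-product series start at 1 for late listings as well.)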
+ if (~mask).any(): + for sym in mask.index[~mask]: + tstamp = df[sym].first_valid_index() + t_idx = df.index.get_loc(tstamp) - 1 + df[sym].ix[t_idx] = 1 + + return df diff --git a/pandas_datareader/yahoo/options.py b/pandas_datareader/yahoo/options.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas_datareader/yahoo/quotes.py b/pandas_datareader/yahoo/quotes.py new file mode 100644 index 00000000..106b2575 --- /dev/null +++ b/pandas_datareader/yahoo/quotes.py @@ -0,0 +1,51 @@ +from collections import defaultdict +import pandas.compat as compat +from pandas.io.common import urlopen +from pandas import DataFrame + + +_yahoo_codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', + 'time': 't1', 'short_ratio': 's7'} + + +_YAHOO_QUOTE_URL = 'http://finance.yahoo.com/d/quotes.csv?' + + +def get_quote_yahoo(symbols): + """ + Get current yahoo quote + + Returns a DataFrame + """ + if isinstance(symbols, compat.string_types): + sym_list = symbols + else: + sym_list = '+'.join(symbols) + + # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm + request = ''.join(compat.itervalues(_yahoo_codes)) # code request string + header = list(_yahoo_codes.keys()) + + data = defaultdict(list) + + url_str = _YAHOO_QUOTE_URL + 's=%s&f=%s' % (sym_list, request) + + with urlopen(url_str) as url: + lines = url.readlines() + + for line in lines: + fields = line.decode('utf-8').strip().split(',') + for i, field in enumerate(fields): + if field[-2:] == '%"': + v = float(field.strip('"%')) + elif field[0] == '"': + v = field.strip('"') + else: + try: + v = float(field) + except ValueError: + v = field + data[header[i]].append(v) + + idx = data.pop('symbol') + return DataFrame(data, index=idx)
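
A quick usage sketch of the refactored entry points (illustrative only; it assumes network access and that the Yahoo!/Google/FRED CSV endpoints targeted above are still live, which is outside this package's control):

    import pandas_datareader.data as web

    # Single symbol -> DataFrame of daily OHLCV rows.
    aapl = web.DataReader('AAPL', data_source='yahoo',
                          start='2014-01-01', end='2014-06-30')

    # List of symbols -> Panel; adjust_price rescales OHLC by 'Adj Close'.
    panel = web.get_data_yahoo(['AAPL', 'MSFT'], start='2014-01-01',
                               end='2014-06-30', adjust_price=True)

    # The split-out modules also work standalone, e.g. a FRED series:
    from pandas_datareader.fred import get_data_fred
    gdp = get_data_fred('GDP', start='2010-01-01')

The behavior of each call is unchanged by this refactor; only the import locations move, with data.py re-exporting the public functions for backward compatibility.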