diff --git a/pandas_datareader/data.py b/pandas_datareader/data.py
index 2f8a99c8..e2b15613 100644
--- a/pandas_datareader/data.py
+++ b/pandas_datareader/data.py
@@ -14,7 +14,7 @@
 from pandas_datareader.yahoo.options import Options as YahooOptions
 
 from pandas_datareader.fred import FredReader
-from pandas_datareader.famafrench import _get_data as get_data_famafrench
+from pandas_datareader.famafrench import FamaFrenchReader
 from pandas_datareader.oecd import OECDReader
 
 
@@ -22,6 +22,9 @@ def get_data_fred(*args, **kwargs):
     return FredReader(*args, **kwargs).read()
 
 
+def get_data_famafrench(*args, **kwargs):
+    return FamaFrenchReader(*args, **kwargs).read()
+
 
 def get_data_google(*args, **kwargs):
     return GoogleDailyReader(*args, **kwargs).read()
@@ -88,21 +91,28 @@ def DataReader(name, data_source=None, start=None, end=None,
                                 adjust_price=False, chunksize=25,
                                 retry_count=retry_count, pause=pause,
                                 session=session).read()
+
     elif data_source == "yahoo-actions":
         return YahooActionReader(symbol=name, start=start, end=end,
                                  retry_count=retry_count, pause=pause,
                                  session=session).read()
+
     elif data_source == "google":
         return GoogleDailyReader(symbols=name, start=start, end=end,
                                  chunksize=25,
                                  retry_count=retry_count, pause=pause,
                                  session=session).read()
+
     elif data_source == "fred":
         return FredReader(symbols=name, start=start, end=end,
                           retry_count=retry_count, pause=pause,
                           session=session).read()
+
     elif data_source == "famafrench":
-        return get_data_famafrench(name)
+        return FamaFrenchReader(symbols=name, start=start, end=end,
+                                retry_count=retry_count, pause=pause,
+                                session=session).read()
+
     elif data_source == "oecd":
         return OECDReader(symbols=name, start=start, end=end,
                           retry_count=retry_count, pause=pause,
diff --git a/pandas_datareader/famafrench.py b/pandas_datareader/famafrench.py
index f0736579..5dc1e4b5 100644
--- a/pandas_datareader/famafrench.py
+++ b/pandas_datareader/famafrench.py
@@ -1,10 +1,11 @@
 import tempfile
 import re
 import datetime as dt
 
-from pandas.io.common import urlopen, ZipFile
+from pandas.io.common import ZipFile
 from pandas.compat import lmap, StringIO
 from pandas import read_csv, to_datetime
+from pandas_datareader.base import _BaseReader
 
 _URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/'
 _URL_PREFIX = 'ftp/'
@@ -33,28 +34,17 @@ def get_available_datasets():
     return lmap(lambda x: x[len(_URL_PREFIX):-len(_URL_SUFFIX)], l)
 
 
-def _download_data_famafrench(name):
-    url = ''.join([_URL, _URL_PREFIX, name, _URL_SUFFIX])
-    with urlopen(url) as socket:
-        raw = socket.read()
-
-    with tempfile.TemporaryFile() as tmpf:
-        tmpf.write(raw)
-
-        with ZipFile(tmpf, 'r') as zf:
-            data = zf.open(zf.namelist()[0]).read().decode()
-
-    return data
-
-
 def _parse_date_famafrench(x):
     x = x.strip()
-    try: return dt.datetime.strptime(x, '%Y%m')
-    except: pass
+    try:
+        return dt.datetime.strptime(x, '%Y%m')
+    except:
+        pass
     return to_datetime(x)
 
 
-def _get_data(name):
+class FamaFrenchReader(_BaseReader):
+
     """
     Get data for the given name from the Fama/French data library.
 
@@ -66,43 +56,75 @@ def _get_data(name):
     df : a dictionary of pandas.DataFrame. Tables are accessed by integer keys.
         See df['DESCR'] for a description of the dataset
     """
-    params = {'index_col': 0,
-              'parse_dates': [0],
-              'date_parser': _parse_date_famafrench}
-
-    # headers in these files are not valid
-    if name.endswith('_Breakpoints'):
-        c = ['<=0', '>0'] if name.find('-') > -1 else ['Count']
-        r = list(range(0, 105, 5))
-        params['names'] = ['Date'] + c + list(zip(r, r[1:]))
-        params['skiprows'] = 1 if name != 'Prior_2-12_Breakpoints' else 3
-
-    doc_chunks, tables = [], []
-    data = _download_data_famafrench(name)
-    for chunk in data.split(2 * '\r\n'):
-        if len(chunk) < 800:
-            doc_chunks.append(chunk.replace('\r\n', ' ').strip())
-        else:
-            tables.append(chunk)
-
-    datasets, table_desc = {}, []
-    for i, src in enumerate(tables):
-        match = re.search('^\s*,', src, re.M)  # the table starts there
-        start = 0 if not match else match.start()
-
-        df = read_csv(StringIO('Date' + src[start:]), **params)
-        try: df = df.to_period(df.index.inferred_freq[:1])
-        except: pass
-        datasets[i] = df
-
-        title = src[:start].replace('\r\n', ' ').strip()
-        shape = '({0} rows x {1} cols)'.format(*df.shape)
-        table_desc.append('{0} {1}'.format(title, shape).strip())
-
-    descr = '{0}\n{1}\n\n'.format(name.replace('_', ' '), len(name) * '-')
-    if doc_chunks: descr += ' '.join(doc_chunks).replace(2 * ' ', ' ') + '\n\n'
-
-    table_descr = map(lambda x: '{0:3} : {1}'.format(*x), enumerate(table_desc))
-
-    datasets['DESCR'] = descr + '\n'.join(table_descr)
-    return datasets
+
+    @property
+    def url(self):
+        return ''.join([_URL, _URL_PREFIX, self.symbols, _URL_SUFFIX])
+
+    def _read_zipfile(self, url):
+        raw = self._get_response(url).content
+
+        with tempfile.TemporaryFile() as tmpf:
+            tmpf.write(raw)
+
+            with ZipFile(tmpf, 'r') as zf:
+                data = zf.open(zf.namelist()[0]).read().decode()
+
+        return data
+
+    def _read_one_data(self, url, params):
+
+        params = {'index_col': 0,
+                  'parse_dates': [0],
+                  'date_parser': _parse_date_famafrench}
+
+        # headers in these files are not valid
+        if self.symbols.endswith('_Breakpoints'):
+
+            if self.symbols.find('-') > -1:
+                c = ['<=0', '>0']
+            else:
+                c = ['Count']
+            r = list(range(0, 105, 5))
+            params['names'] = ['Date'] + c + list(zip(r, r[1:]))
+
+            if self.symbols != 'Prior_2-12_Breakpoints':
+                params['skiprows'] = 1
+            else:
+                params['skiprows'] = 3
+
+        doc_chunks, tables = [], []
+        data = self._read_zipfile(url)
+
+        for chunk in data.split(2 * '\r\n'):
+            if len(chunk) < 800:
+                doc_chunks.append(chunk.replace('\r\n', ' ').strip())
+            else:
+                tables.append(chunk)
+
+        datasets, table_desc = {}, []
+        for i, src in enumerate(tables):
+            match = re.search('^\s*,', src, re.M)  # the table starts there
+            start = 0 if not match else match.start()
+
+            df = read_csv(StringIO('Date' + src[start:]), **params)
+            try:
+                idx_name = df.index.name  # hack for pandas 0.16.2
+                df = df.to_period(df.index.inferred_freq[:1])
+                df.index.name = idx_name
+            except:
+                pass
+            df = df.truncate(self.start, self.end)
+            datasets[i] = df
+
+            title = src[:start].replace('\r\n', ' ').strip()
+            shape = '({0} rows x {1} cols)'.format(*df.shape)
+            table_desc.append('{0} {1}'.format(title, shape).strip())
+
+        descr = '{0}\n{1}\n\n'.format(self.symbols.replace('_', ' '), len(self.symbols) * '-')
+        if doc_chunks:
+            descr += ' '.join(doc_chunks).replace(2 * ' ', ' ') + '\n\n'
+        table_descr = map(lambda x: '{0:3} : {1}'.format(*x), enumerate(table_desc))
+        datasets['DESCR'] = descr + '\n'.join(table_descr)
+
+        return datasets
diff --git a/pandas_datareader/oecd.py b/pandas_datareader/oecd.py
index 6b692115..f654e611 100644
--- a/pandas_datareader/oecd.py
+++ b/pandas_datareader/oecd.py
@@ -6,8 +6,6 @@ from pandas import concat, read_csv
 
 from pandas_datareader.io import read_jsdmx
 
-
-
 from pandas_datareader.base import _BaseReader
 
 
@@ -18,6 +16,7 @@ class OECDReader(_BaseReader):
     @property
     def url(self):
         url = 'http://stats.oecd.org/SDMX-JSON/data'
+
         if not isinstance(self.symbols, compat.string_types):
             raise ValueError('data name must be string')
diff --git a/pandas_datareader/tests/test_famafrench.py b/pandas_datareader/tests/test_famafrench.py
index d20eb105..9af2c694 100644
--- a/pandas_datareader/tests/test_famafrench.py
+++ b/pandas_datareader/tests/test_famafrench.py
@@ -1,4 +1,5 @@
 import nose
+import pandas as pd
 import pandas.util.testing as tm
 
 import pandas_datareader.data as web
@@ -27,6 +28,55 @@ def test_index(self):
         assert ff[0].index.freq == 'M'
         assert ff[1].index.freq == 'A-DEC'
 
+    def test_f_f_research(self):
+        results = web.DataReader("F-F_Research_Data_Factors", "famafrench",
+                                 start='2010-01-01', end='2010-12-01')
+        self.assertTrue(isinstance(results, dict))
+        self.assertEqual(len(results), 3)
+
+        exp = pd.DataFrame({'Mkt-RF': [-3.36, 3.4, 6.31, 2., -7.89, -5.56,
+                                       6.93, -4.77, 9.54, 3.88, 0.6, 6.82],
+                            'SMB': [0.2, 1.44, 1.57, 4.92, -0.09, -2.15,
+                                    0.24, -3.03, 3.84, 1.01, 3.69, 0.85],
+                            'HML': [0.61, 2.74, 2.01, 3.12, -2.32, -4.27,
+                                    0.04, -1.51, -2.94, -2.23, -0.58, 3.47],
+                            'RF': [0., 0., 0.01, 0.01, 0.01, 0.01, 0.01,
+                                   0.01, 0.01, 0.01, 0.01, 0.01]},
+                           index=pd.period_range('2010-01-01', '2010-12-01', freq='M', name='Date'),
+                           columns=['Mkt-RF', 'SMB', 'HML', 'RF'])
+        tm.assert_frame_equal(results[0], exp)
+
+    def test_me_breakpoints(self):
+        results = web.DataReader("ME_Breakpoints", "famafrench",
+                                 start='2010-01-01', end='2010-12-01')
+        self.assertTrue(isinstance(results, dict))
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0].shape, (12, 21))
+
+        exp_columns = pd.Index(['Count', (0, 5), (5, 10), (10, 15), (15, 20), (20, 25),
+                                (25, 30), (30, 35), (35, 40), (40, 45), (45, 50), (50, 55),
+                                (55, 60), (60, 65), (65, 70), (70, 75), (75, 80), (80, 85),
+                                (85, 90), (90, 95), (95, 100)], dtype='object')
+        tm.assert_index_equal(results[0].columns, exp_columns)
+
+        exp_index = pd.period_range('2010-01-01', '2010-12-01', freq='M', name='Date')
+        tm.assert_index_equal(results[0].index, exp_index)
+
+    def test_prior_2_12_breakpoints(self):
+        results = web.DataReader("Prior_2-12_Breakpoints", "famafrench",
+                                 start='2010-01-01', end='2010-12-01')
+        self.assertTrue(isinstance(results, dict))
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0].shape, (12, 22))
+
+        exp_columns = pd.Index(['<=0', '>0', (0, 5), (5, 10), (10, 15), (15, 20), (20, 25),
+                                (25, 30), (30, 35), (35, 40), (40, 45), (45, 50), (50, 55),
+                                (55, 60), (60, 65), (65, 70), (70, 75), (75, 80), (80, 85),
+                                (85, 90), (90, 95), (95, 100)], dtype='object')
+        tm.assert_index_equal(results[0].columns, exp_columns)
+
+        exp_index = pd.period_range('2010-01-01', '2010-12-01', freq='M', name='Date')
+        tm.assert_index_equal(results[0].index, exp_index)
 
 
 if __name__ == '__main__':
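
For reference, a minimal usage sketch of the new reader, mirroring the call made in test_f_f_research above (the dataset name and date range are just the values used in that test):

    import pandas_datareader.data as web

    # Returns a dict of DataFrames keyed by integer table number,
    # plus a 'DESCR' string describing the dataset.
    ds = web.DataReader("F-F_Research_Data_Factors", "famafrench",
                        start='2010-01-01', end='2010-12-01')
    print(ds['DESCR'])   # dataset description and table shapes
    factors = ds[0]      # first table; PeriodIndex named 'Date'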