From d3391eb761956b88a9627ea88351925a48c29cc9 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 17 Jul 2013 21:08:12 -0400 Subject: [PATCH] BUG: fix data.py regression --- doc/source/release.rst | 2 ++ doc/source/v0.12.0.txt | 2 ++ pandas/io/data.py | 19 ++++++++++--------- pandas/io/tests/test_data.py | 3 ++- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index f3029cfe41349..76a84d40400d0 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -342,6 +342,8 @@ pandas 0.12 - Fixed bug in initializing ``DatetimeIndex`` with an array of strings in a certain time zone (:issue:`4229`) - Fixed bug where html5lib wasn't being properly skipped (:issue:`4265`) + - Fixed bug where get_data_famafrench wasn't using the correct file edges + (:issue:`4281`) pandas 0.11.0 ============= diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt index 76fd81b882e84..3424915aadddf 100644 --- a/doc/source/v0.12.0.txt +++ b/doc/source/v0.12.0.txt @@ -475,6 +475,8 @@ Bug Fixes - Fixed bug in initializing ``DatetimeIndex`` with an array of strings in a certain time zone (:issue:`4229`) - Fixed bug where html5lib wasn't being properly skipped (:issue:`4265`) + - Fixed bug where get_data_famafrench wasn't using the correct file edges + (:issue:`4281`) See the :ref:`full release notes <release>` or issue tracker diff --git a/pandas/io/data.py b/pandas/io/data.py index e3b0af542bb41..1b51ae5ec8a02 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -453,8 +453,8 @@ def get_data_fred(name, start=dt.datetime(2010, 1, 1), def get_data_famafrench(name): # path of zip files zip_file_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/' - 'ken.french/ftp/') - zip_file_path = '{0}{1}.zip'.format(zip_file_url, name) + 'ken.french/ftp') + zip_file_path = '{0}/{1}.zip'.format(zip_file_url, name) with urlopen(zip_file_path) as url: raw = url.read() @@ -463,13 +463,13 @@ def get_data_famafrench(name): tmpf.write(raw) 
with ZipFile(tmpf, 'r') as zf: - data = zf.read(name + '.txt').splitlines() + data = zf.open(name + '.txt').readlines() line_lengths = np.array(map(len, data)) - file_edges = np.where(line_lengths)[0] + file_edges = np.where(line_lengths == 2)[0] datasets = {} - edges = itertools.izip(file_edges[:-1], file_edges[1:]) + edges = itertools.izip(file_edges + 1, file_edges[1:]) for i, (left_edge, right_edge) in enumerate(edges): dataset = [d.split() for d in data[left_edge:right_edge]] if len(dataset) > 10: @@ -479,14 +479,15 @@ def get_data_famafrench(name): header = dataset[header_index] ds_header = dataset[header_index + 1:] # to ensure the header is unique - header = ['{0} {1}'.format(*items) for items in enumerate(header, - start=1)] - index = np.fromiter((d[0] for d in ds_header), dtype=int) - dataset = np.fromiter((d[1:] for d in ds_header), dtype=float) + header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header, + start=1)] + index = np.array([d[0] for d in ds_header], dtype=int) + dataset = np.array([d[1:] for d in ds_header], dtype=float) datasets[i] = DataFrame(dataset, index, columns=header) return datasets + # Items needed for options class CUR_MONTH = dt.datetime.now().month CUR_YEAR = dt.datetime.now().year diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index df1b292d9ba5f..849f79afe3855 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -10,7 +10,7 @@ from pandas.io import data as web from pandas.io.data import DataReader, SymbolWarning from pandas.util.testing import (assert_series_equal, assert_produces_warning, - assert_frame_equal, network) + network) from numpy.testing import assert_array_equal @@ -343,6 +343,7 @@ def test_read_famafrench(self): "F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3", "F-F_ST_Reversal_Factor"): ff = DataReader(name, "famafrench") + assert ff assert isinstance(ff, dict)