BUG: fix data.py regression

cpcloud · cpcloud · commit d3391eb76195 · 2013-07-17T21:54:25.000-04:00
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -342,6 +342,8 @@ pandas 0.12
   - Fixed bug in initializing ``DatetimeIndex`` with an array of strings
     in a certain time zone (:issue:`4229`)
   - Fixed bug where html5lib wasn't being properly skipped (:issue:`4265`)
+  - Fixed bug where ``get_data_famafrench`` wasn't using the correct file edges
+    (:issue:`4281`)
 
 pandas 0.11.0
 =============
diff --git a/doc/source/v0.12.0.txt b/doc/source/v0.12.0.txt
@@ -475,6 +475,8 @@ Bug Fixes
   - Fixed bug in initializing ``DatetimeIndex`` with an array of strings
     in a certain time zone (:issue:`4229`)
   - Fixed bug where html5lib wasn't being properly skipped (:issue:`4265`)
+  - Fixed bug where ``get_data_famafrench`` wasn't using the correct file edges
+    (:issue:`4281`)
 
 See the :ref:`full release notes
 <release>` or issue tracker
diff --git a/pandas/io/data.py b/pandas/io/data.py
@@ -453,8 +453,8 @@ def get_data_fred(name, start=dt.datetime(2010, 1, 1),
 def get_data_famafrench(name):
     # path of zip files
     zip_file_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/'
-                    'ken.french/ftp/')
-    zip_file_path = '{0}{1}.zip'.format(zip_file_url, name)
+                    'ken.french/ftp')
+    zip_file_path = '{0}/{1}.zip'.format(zip_file_url, name)
 
     with urlopen(zip_file_path) as url:
         raw = url.read()
@@ -463,13 +463,13 @@ def get_data_famafrench(name):
         tmpf.write(raw)
 
         with ZipFile(tmpf, 'r') as zf:
-            data = zf.read(name + '.txt').splitlines()
+            data = zf.open(name + '.txt').readlines()
 
     line_lengths = np.array(map(len, data))
-    file_edges = np.where(line_lengths)[0]
+    file_edges = np.where(line_lengths == 2)[0]
 
     datasets = {}
-    edges = itertools.izip(file_edges[:-1], file_edges[1:])
+    edges = itertools.izip(file_edges + 1, file_edges[1:])
     for i, (left_edge, right_edge) in enumerate(edges):
         dataset = [d.split() for d in data[left_edge:right_edge]]
         if len(dataset) > 10:
@@ -479,14 +479,15 @@ def get_data_famafrench(name):
             header = dataset[header_index]
             ds_header = dataset[header_index + 1:]
             # to ensure the header is unique
-            header = ['{0} {1}'.format(*items) for items in enumerate(header,
-                                                                      start=1)]
-            index = np.fromiter((d[0] for d in ds_header), dtype=int)
-            dataset = np.fromiter((d[1:] for d in ds_header), dtype=float)
+            header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
+                                                                     start=1)]
+            index = np.array([d[0] for d in ds_header], dtype=int)
+            dataset = np.array([d[1:] for d in ds_header], dtype=float)
             datasets[i] = DataFrame(dataset, index, columns=header)
 
     return datasets
 
+
 # Items needed for options class
 CUR_MONTH = dt.datetime.now().month
 CUR_YEAR = dt.datetime.now().year
diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py
@@ -10,7 +10,7 @@
 from pandas.io import data as web
 from pandas.io.data import DataReader, SymbolWarning
 from pandas.util.testing import (assert_series_equal, assert_produces_warning,
-                                 assert_frame_equal, network)
+                                 network)
 from numpy.testing import assert_array_equal
 
 
@@ -343,6 +343,7 @@ def test_read_famafrench(self):
                      "F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3",
                      "F-F_ST_Reversal_Factor"):
             ff = DataReader(name, "famafrench")
+            assert ff
             assert isinstance(ff, dict)