Skip to content

Commit c882161

Browse files
Merge pull request #159 from jtkiley/master
Added retrieval of EDGAR daily indices.
2 parents 823e6fc + 047f7cc commit c882161

File tree

7 files changed

+258
-18
lines changed

7 files changed

+258
-18
lines changed

docs/source/remote_data.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,8 +404,21 @@ EDGAR Index
404404
405405
Company filing index from EDGAR (SEC).
406406
407+
The daily indices get large quickly (i.e. the set of daily indices from 1994
408+
to 2015 is 1.5GB), and the FTP server will close the connection past some
409+
downloading threshold. In testing, pulling one year at a time works well.
410+
If the FTP server starts refusing your connections, you should be able to
411+
reconnect after waiting a few minutes.
412+
413+
407414
.. ipython:: python
408415
409416
import pandas_datareader.data as web
410417
ed = web.DataReader('full', 'edgar-index')
411418
ed[:5]
419+
420+
.. ipython:: python
421+
422+
import pandas_datareader.data as web
423+
ed = web.DataReader('daily', 'edgar-index', '1998-05-18', '1998-05-18')
424+
ed[:5]

docs/source/whatsnew/v0.2.2.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ New features
2020

2121
- ``DataReader`` now supports dividend only pulls from Yahoo! Finance, see :ref:`here<remote_data.yahoo>` (:issue:`138`).
2222
- ``DataReader`` now supports SEC EDGAR full (current quarter) index retrieval, see :ref:`here<remote_data.edgar>` (:issue:`143`).
23+
- ``DataReader`` now supports SEC EDGAR daily (back to 7/1/1994) index retrieval, see :ref:`here<remote_data.edgar>` (:issue:`147`).
2324

2425
.. _whatsnew_022.api_breaking:
2526

pandas_datareader/compat/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from io import BytesIO

pandas_datareader/data.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ def DataReader(name, data_source=None, start=None, end=None,
9494
ff = DataReader("F-F_ST_Reversal_Factor", "famafrench")
9595
9696
# Data from EDGAR index
97-
ed = DataReader("master", "edgar-index")
97+
ed = DataReader("full", "edgar-index")
98+
ed2 = DataReader("daily", "edgar-index")
9899
"""
99100
if data_source == "yahoo":
100101
return YahooDailyReader(symbols=name, start=start, end=end,

pandas_datareader/edgar.py

Lines changed: 212 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,39 @@
1+
import re
2+
import datetime as dt
3+
from ftplib import FTP
4+
import gzip
5+
16
from pandas import read_csv
7+
from pandas import DataFrame
8+
from pandas import to_datetime
29
from pandas.io.common import ZipFile
310
from pandas.compat import StringIO
4-
from pandas.compat import BytesIO
11+
from pandas.core.common import is_number
512

613
from pandas_datareader.base import _BaseReader
14+
from pandas_datareader.compat import BytesIO
15+
from pandas_datareader._utils import RemoteDataError
16+
717

18+
_URL_FULL = 'edgar/full-index/master.zip'
19+
_URL_DAILY = 'ftp://ftp.sec.gov/'
20+
_SEC_FTP = 'ftp.sec.gov'
821

9-
_URL_FULL = 'ftp://ftp.sec.gov/edgar/full-index/master.zip'
1022
_COLUMNS = ['cik', 'company_name', 'form_type', 'date_filed', 'filename']
23+
_DIVIDER = re.compile('--------------')
24+
_EDGAR = 'edgar/'
25+
_EDGAR_DAILY = 'edgar/daily-index'
26+
_EDGAR_RE = re.compile(_EDGAR)
27+
_EDGAR_MIN_DATE = dt.datetime(1994, 7, 1)
28+
_ZIP_RE = re.compile('\.zip$')
29+
_GZ_RE = re.compile('\.gz$')
30+
31+
_MLSD_VALUES_RE = re.compile('modify=(?P<modify>.*?);.*'
32+
'type=(?P<type>.*?);.*'
33+
'; (?P<name>.*)$')
34+
_FILENAME_DATE_RE = re.compile('\w*?\.(\d*)\.idx')
35+
_FILENAME_MASTER_RE = re.compile('master\.\d*\.idx')
36+
_EDGAR_MAX_6_DIGIT_DATE = dt.datetime(1998, 5, 15)
1137

1238

1339
class EdgarIndexReader(_BaseReader):
@@ -17,27 +43,202 @@ class EdgarIndexReader(_BaseReader):
1743
Returns
1844
-------
1945
edgar_index : pandas.DataFrame.
20-
DataFrame of EDGAR master index.
46+
DataFrame of EDGAR index.
2147
"""
2248

2349
@property
2450
def url(self):
25-
return _URL_FULL
26-
27-
def _read_zipfile(self, url):
51+
if self.symbols == 'full':
52+
return _URL_FULL
53+
elif self.symbols == 'daily':
54+
return _URL_DAILY
55+
else:
56+
return _URL_FULL # Should probably raise or use full unless daily.
2857

29-
zipf = BytesIO(self._get_response(url).content)
58+
def _read_zipfile(self, ftppath):
3059

60+
zipf = BytesIO()
61+
try:
62+
self._sec_ftp_session.retrbinary('RETR ' + ftppath, zipf.write)
63+
except EOFError:
64+
raise RemoteDataError('FTP server has closed the connection.')
65+
zipf.seek(0)
3166
with ZipFile(zipf, 'r') as zf:
3267
data = zf.open(zf.namelist()[0]).read().decode()
3368

34-
return data
69+
return StringIO(data)
70+
71+
def _read_gzfile(self, ftppath):
72+
73+
zipf = BytesIO()
74+
try:
75+
self._sec_ftp_session.retrbinary('RETR ' + ftppath, zipf.write)
76+
except EOFError:
77+
raise RemoteDataError('FTP server has closed the connection.')
78+
zipf.seek(0)
79+
zf = gzip.GzipFile(fileobj=zipf, mode='rb')
80+
try:
81+
data = zf.read().decode('iso-8859-1')
82+
finally:
83+
zf.close()
84+
85+
return StringIO(data)
3586

36-
def _read_one_data(self, url, params):
87+
def _read_one_data(self, ftppath, params):
3788

38-
index_file = StringIO(self._read_zipfile(url))
89+
if re.search(_ZIP_RE, ftppath) is not None:
90+
index_file = self._read_zipfile(ftppath)
91+
elif re.search(_GZ_RE, ftppath) is not None:
92+
index_file = self._read_gzfile(ftppath)
93+
else:
94+
index_file = StringIO()
95+
index_list = []
96+
try:
97+
self._sec_ftp_session.retrlines('RETR ' + ftppath,
98+
index_list.append)
99+
except EOFError:
100+
raise RemoteDataError('FTP server has closed the connection.')
39101

102+
for line in index_list:
103+
index_file.write(line + '\n')
104+
index_file.seek(0)
105+
106+
index_file = self._remove_header(index_file)
40107
index = read_csv(index_file, delimiter='|', header=None,
41-
index_col=False, skiprows=10, names=_COLUMNS,
108+
index_col=False, names=_COLUMNS,
42109
low_memory=False)
110+
index['filename'] = index['filename'].map(self._fix_old_file_paths)
43111
return index
112+
113+
def _read_daily_data(self, url, params):
114+
doc_index = DataFrame()
115+
file_index = self._get_dir_lists()
116+
for idx_entry in file_index:
117+
if self._check_idx(idx_entry):
118+
daily_idx_path = (idx_entry['path'] + '/' + idx_entry['name'])
119+
daily_idx = self._read_one_data(daily_idx_path, params)
120+
doc_index = doc_index.append(daily_idx)
121+
return doc_index
122+
123+
def _check_idx(self, idx_entry):
124+
if re.match(_FILENAME_MASTER_RE, idx_entry['name']):
125+
if idx_entry['date'] is not None:
126+
if (self.start <= idx_entry['date'] <= self.end):
127+
return True
128+
else:
129+
return False
130+
131+
def _remove_header(self, data):
132+
header = True
133+
cleaned_datafile = StringIO()
134+
for line in data:
135+
if header is False:
136+
cleaned_datafile.write(line + '\n')
137+
elif re.search(_DIVIDER, line) is not None:
138+
header = False
139+
140+
cleaned_datafile.seek(0)
141+
return cleaned_datafile
142+
143+
def _fix_old_file_paths(self, path):
144+
if type(path) == float: # pd.read_csv turns blank into np.nan
145+
return path
146+
if re.match(_EDGAR_RE, path) is None:
147+
path = _EDGAR + path
148+
return path
149+
150+
def read(self):
151+
try:
152+
self._sec_ftp_session = FTP(_SEC_FTP)
153+
self._sec_ftp_session.login()
154+
except EOFError:
155+
raise RemoteDataError('FTP server has closed the connection.')
156+
try:
157+
if self.symbols == 'full':
158+
return self._read_one_data(self.url, self.params)
159+
160+
elif self.symbols == 'daily':
161+
return self._read_daily_data(self.url, self.params)
162+
finally:
163+
self._sec_ftp_session.quit()
164+
165+
def _sanitize_dates(self, start, end):
166+
if is_number(start):
167+
start = dt.datetime(start, 1, 1)
168+
start = to_datetime(start)
169+
170+
if is_number(end):
171+
end = dt.datetime(end, 1, 1)
172+
end = to_datetime(end)
173+
174+
if start is None:
175+
start = dt.datetime(2015, 1, 1)
176+
if end is None:
177+
end = dt.datetime(2015, 1, 3)
178+
if start < _EDGAR_MIN_DATE:
179+
start = _EDGAR_MIN_DATE
180+
181+
return start, end
182+
183+
def _get_dir_lists(self):
184+
mlsd_tree = self._get_mlsd_tree(_EDGAR_DAILY)
185+
return mlsd_tree
186+
187+
def _get_mlsd_tree(self, dir, top=True):
188+
initial_mlsd = self._get_mlsd(dir)
189+
mlsd = initial_mlsd[:]
190+
for entry in initial_mlsd:
191+
if entry['type'] == 'dir':
192+
if top is True:
193+
if self._check_mlsd_year(entry) is not True:
194+
continue
195+
subdir = dir + '/' + entry['name']
196+
mlsd.extend(self._get_mlsd_tree(subdir, False))
197+
return mlsd
198+
199+
def _get_mlsd(self, dir):
200+
dir_list = []
201+
try:
202+
self._sec_ftp_session.retrlines('MLSD' + ' ' + dir,
203+
dir_list.append)
204+
except EOFError:
205+
raise RemoteDataError('FTP server has closed the connection.')
206+
207+
dict_list = []
208+
for line in dir_list:
209+
entry = self._process_mlsd_line(line)
210+
entry['path'] = dir
211+
dict_list.append(entry)
212+
213+
return dict_list
214+
215+
def _process_mlsd_line(self, line):
216+
line_dict = re.match(_MLSD_VALUES_RE, line).groupdict()
217+
line_dict['date'] = self._get_index_date(line_dict['name'])
218+
return line_dict
219+
220+
def _get_index_date(self, filename):
221+
try:
222+
idx_date = re.search(_FILENAME_DATE_RE, filename).group(1)
223+
if len(idx_date) == 6:
224+
if idx_date[-2:] == '94':
225+
filedate = dt.datetime.strptime(idx_date, '%m%d%y')
226+
else:
227+
filedate = dt.datetime.strptime(idx_date, '%y%m%d')
228+
if filedate > _EDGAR_MAX_6_DIGIT_DATE:
229+
filedate = None
230+
elif len(idx_date) == 8:
231+
filedate = dt.datetime.strptime(idx_date, '%Y%m%d')
232+
except AttributeError:
233+
filedate = None
234+
235+
return filedate
236+
237+
def _check_mlsd_year(self, entry):
238+
try:
239+
if (self.start.year <= int(entry['name']) <= self.end.year):
240+
return True
241+
else:
242+
return False
243+
except TypeError:
244+
return False

pandas_datareader/tests/test_data.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -528,10 +528,6 @@ def test_read_fred(self):
528528
vix = DataReader("VIXCLS", "fred")
529529
assert isinstance(vix, DataFrame)
530530

531-
def test_read_edgar_index(self):
532-
ed = DataReader("full", "edgar-index")
533-
assert isinstance(ed, DataFrame)
534-
535531
def test_not_implemented(self):
536532
self.assertRaises(NotImplementedError, DataReader, "NA", "NA")
537533

pandas_datareader/tests/test_edgar.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,40 @@
22
import pandas.util.testing as tm
33

44
import pandas_datareader.data as web
5+
from pandas_datareader._utils import RemoteDataError
56

67

78
class TestEdgarIndex(tm.TestCase):
8-
def test_get_index(self):
9-
ed = web.DataReader('full', 'edgar-index')
9+
def test_get_full_index(self):
10+
try:
11+
ed = web.DataReader('full', 'edgar-index')
12+
except RemoteDataError as e:
13+
raise nose.SkipTest(e)
1014
assert len(ed > 1000)
1115

16+
def test_get_nonzip_index_and_low_date(self):
17+
try:
18+
ed = web.DataReader('daily', 'edgar-index', '1994-06-30',
19+
'1994-07-02')
20+
except RemoteDataError as e:
21+
raise nose.SkipTest(e)
22+
assert len(ed > 200)
23+
24+
def test_get_gz_index_and_no_date(self):
25+
try:
26+
ed = web.DataReader('daily', 'edgar-index')
27+
except RemoteDataError as e:
28+
raise nose.SkipTest(e)
29+
assert len(ed > 2000)
30+
31+
def test_6_digit_date(self):
32+
try:
33+
ed = web.DataReader('daily', 'edgar-index', '1998-05-18',
34+
'1998-05-18')
35+
except RemoteDataError as e:
36+
raise nose.SkipTest(e)
37+
assert len(ed < 1200)
38+
1239
if __name__ == '__main__':
1340
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
1441
exit=False)

0 commit comments

Comments
 (0)