1
import datetime as dt
import gzip
import re
from ftplib import FTP

from pandas import DataFrame
from pandas import concat
from pandas import read_csv
from pandas import to_datetime
from pandas.compat import StringIO
from pandas.core.common import is_number
from pandas.io.common import ZipFile

from pandas_datareader._utils import RemoteDataError
from pandas_datareader.base import _BaseReader
from pandas_datareader.compat import BytesIO
16
+
7
17
18
# FTP locations of the full master index and the daily index tree.
_URL_FULL = 'edgar/full-index/master.zip'
_URL_DAILY = 'ftp://ftp.sec.gov/'
_SEC_FTP = 'ftp.sec.gov'

# Column names of the pipe-delimited EDGAR index files.
_COLUMNS = ['cik', 'company_name', 'form_type', 'date_filed', 'filename']
# Divider row separating the human-readable banner from the data rows.
_DIVIDER = re.compile('--------------')
_EDGAR = 'edgar/'
_EDGAR_DAILY = 'edgar/daily-index'
_EDGAR_RE = re.compile(_EDGAR)
# Earliest electronic EDGAR filing date; start dates are clamped to this.
_EDGAR_MIN_DATE = dt.datetime(1994, 7, 1)
# Raw strings: '\.' etc. are regex escapes, not string escapes (avoids
# invalid-escape DeprecationWarnings on Python 3.6+; values unchanged).
_ZIP_RE = re.compile(r'\.zip$')
_GZ_RE = re.compile(r'\.gz$')

# Parses an FTP MLSD fact line, e.g.
# 'modify=20150618000000;perm=fle;type=dir;unique=...; 2015'
_MLSD_VALUES_RE = re.compile('modify=(?P<modify>.*?);.*'
                             'type=(?P<type>.*?);.*'
                             '; (?P<name>.*)$')
# Extracts the date digits from an index file name, e.g. 'master.20150618.idx'.
_FILENAME_DATE_RE = re.compile(r'\w*?\.(\d*)\.idx')
_FILENAME_MASTER_RE = re.compile(r'master\.\d*\.idx')
# Daily index names switched from 6-digit to 8-digit dates after this day.
_EDGAR_MAX_6_DIGIT_DATE = dt.datetime(1998, 5, 15)
11
37
12
38
13
39
class EdgarIndexReader (_BaseReader ):
@@ -17,27 +43,202 @@ class EdgarIndexReader(_BaseReader):
17
43
Returns
18
44
-------
19
45
edgar_index : pandas.DataFrame.
20
- DataFrame of EDGAR master index.
46
+ DataFrame of EDGAR index.
21
47
"""
22
48
23
49
@property
24
50
def url (self ):
25
- return _URL_FULL
26
-
27
- def _read_zipfile (self , url ):
51
+ if self .symbols == 'full' :
52
+ return _URL_FULL
53
+ elif self .symbols == 'daily' :
54
+ return _URL_DAILY
55
+ else :
56
+ return _URL_FULL # Should probably raise or use full unless daily.
28
57
29
- zipf = BytesIO (self . _get_response ( url ). content )
58
+ def _read_zipfile (self , ftppath ):
30
59
60
+ zipf = BytesIO ()
61
+ try :
62
+ self ._sec_ftp_session .retrbinary ('RETR ' + ftppath , zipf .write )
63
+ except EOFError :
64
+ raise RemoteDataError ('FTP server has closed the connection.' )
65
+ zipf .seek (0 )
31
66
with ZipFile (zipf , 'r' ) as zf :
32
67
data = zf .open (zf .namelist ()[0 ]).read ().decode ()
33
68
34
- return data
69
+ return StringIO (data )
70
+
71
+ def _read_gzfile (self , ftppath ):
72
+
73
+ zipf = BytesIO ()
74
+ try :
75
+ self ._sec_ftp_session .retrbinary ('RETR ' + ftppath , zipf .write )
76
+ except EOFError :
77
+ raise RemoteDataError ('FTP server has closed the connection.' )
78
+ zipf .seek (0 )
79
+ zf = gzip .GzipFile (fileobj = zipf , mode = 'rb' )
80
+ try :
81
+ data = zf .read ().decode ('iso-8859-1' )
82
+ finally :
83
+ zf .close ()
84
+
85
+ return StringIO (data )
35
86
36
- def _read_one_data (self , url , params ):
87
+ def _read_one_data (self , ftppath , params ):
37
88
38
- index_file = StringIO (self ._read_zipfile (url ))
89
+ if re .search (_ZIP_RE , ftppath ) is not None :
90
+ index_file = self ._read_zipfile (ftppath )
91
+ elif re .search (_GZ_RE , ftppath ) is not None :
92
+ index_file = self ._read_gzfile (ftppath )
93
+ else :
94
+ index_file = StringIO ()
95
+ index_list = []
96
+ try :
97
+ self ._sec_ftp_session .retrlines ('RETR ' + ftppath ,
98
+ index_list .append )
99
+ except EOFError :
100
+ raise RemoteDataError ('FTP server has closed the connection.' )
39
101
102
+ for line in index_list :
103
+ index_file .write (line + '\n ' )
104
+ index_file .seek (0 )
105
+
106
+ index_file = self ._remove_header (index_file )
40
107
index = read_csv (index_file , delimiter = '|' , header = None ,
41
- index_col = False , skiprows = 10 , names = _COLUMNS ,
108
+ index_col = False , names = _COLUMNS ,
42
109
low_memory = False )
110
+ index ['filename' ] = index ['filename' ].map (self ._fix_old_file_paths )
43
111
return index
112
+
113
+ def _read_daily_data (self , url , params ):
114
+ doc_index = DataFrame ()
115
+ file_index = self ._get_dir_lists ()
116
+ for idx_entry in file_index :
117
+ if self ._check_idx (idx_entry ):
118
+ daily_idx_path = (idx_entry ['path' ] + '/' + idx_entry ['name' ])
119
+ daily_idx = self ._read_one_data (daily_idx_path , params )
120
+ doc_index = doc_index .append (daily_idx )
121
+ return doc_index
122
+
123
+ def _check_idx (self , idx_entry ):
124
+ if re .match (_FILENAME_MASTER_RE , idx_entry ['name' ]):
125
+ if idx_entry ['date' ] is not None :
126
+ if (self .start <= idx_entry ['date' ] <= self .end ):
127
+ return True
128
+ else :
129
+ return False
130
+
131
+ def _remove_header (self , data ):
132
+ header = True
133
+ cleaned_datafile = StringIO ()
134
+ for line in data :
135
+ if header is False :
136
+ cleaned_datafile .write (line + '\n ' )
137
+ elif re .search (_DIVIDER , line ) is not None :
138
+ header = False
139
+
140
+ cleaned_datafile .seek (0 )
141
+ return cleaned_datafile
142
+
143
+ def _fix_old_file_paths (self , path ):
144
+ if type (path ) == float : # pd.read_csv turns blank into np.nan
145
+ return path
146
+ if re .match (_EDGAR_RE , path ) is None :
147
+ path = _EDGAR + path
148
+ return path
149
+
150
+ def read (self ):
151
+ try :
152
+ self ._sec_ftp_session = FTP (_SEC_FTP )
153
+ self ._sec_ftp_session .login ()
154
+ except EOFError :
155
+ raise RemoteDataError ('FTP server has closed the connection.' )
156
+ try :
157
+ if self .symbols == 'full' :
158
+ return self ._read_one_data (self .url , self .params )
159
+
160
+ elif self .symbols == 'daily' :
161
+ return self ._read_daily_data (self .url , self .params )
162
+ finally :
163
+ self ._sec_ftp_session .quit ()
164
+
165
+ def _sanitize_dates (self , start , end ):
166
+ if is_number (start ):
167
+ start = dt .datetime (start , 1 , 1 )
168
+ start = to_datetime (start )
169
+
170
+ if is_number (end ):
171
+ end = dt .datetime (end , 1 , 1 )
172
+ end = to_datetime (end )
173
+
174
+ if start is None :
175
+ start = dt .datetime (2015 , 1 , 1 )
176
+ if end is None :
177
+ end = dt .datetime (2015 , 1 , 3 )
178
+ if start < _EDGAR_MIN_DATE :
179
+ start = _EDGAR_MIN_DATE
180
+
181
+ return start , end
182
+
183
+ def _get_dir_lists (self ):
184
+ mlsd_tree = self ._get_mlsd_tree (_EDGAR_DAILY )
185
+ return mlsd_tree
186
+
187
+ def _get_mlsd_tree (self , dir , top = True ):
188
+ initial_mlsd = self ._get_mlsd (dir )
189
+ mlsd = initial_mlsd [:]
190
+ for entry in initial_mlsd :
191
+ if entry ['type' ] == 'dir' :
192
+ if top is True :
193
+ if self ._check_mlsd_year (entry ) is not True :
194
+ continue
195
+ subdir = dir + '/' + entry ['name' ]
196
+ mlsd .extend (self ._get_mlsd_tree (subdir , False ))
197
+ return mlsd
198
+
199
+ def _get_mlsd (self , dir ):
200
+ dir_list = []
201
+ try :
202
+ self ._sec_ftp_session .retrlines ('MLSD' + ' ' + dir ,
203
+ dir_list .append )
204
+ except EOFError :
205
+ raise RemoteDataError ('FTP server has closed the connection.' )
206
+
207
+ dict_list = []
208
+ for line in dir_list :
209
+ entry = self ._process_mlsd_line (line )
210
+ entry ['path' ] = dir
211
+ dict_list .append (entry )
212
+
213
+ return dict_list
214
+
215
+ def _process_mlsd_line (self , line ):
216
+ line_dict = re .match (_MLSD_VALUES_RE , line ).groupdict ()
217
+ line_dict ['date' ] = self ._get_index_date (line_dict ['name' ])
218
+ return line_dict
219
+
220
+ def _get_index_date (self , filename ):
221
+ try :
222
+ idx_date = re .search (_FILENAME_DATE_RE , filename ).group (1 )
223
+ if len (idx_date ) == 6 :
224
+ if idx_date [- 2 :] == '94' :
225
+ filedate = dt .datetime .strptime (idx_date , '%m%d%y' )
226
+ else :
227
+ filedate = dt .datetime .strptime (idx_date , '%y%m%d' )
228
+ if filedate > _EDGAR_MAX_6_DIGIT_DATE :
229
+ filedate = None
230
+ elif len (idx_date ) == 8 :
231
+ filedate = dt .datetime .strptime (idx_date , '%Y%m%d' )
232
+ except AttributeError :
233
+ filedate = None
234
+
235
+ return filedate
236
+
237
+ def _check_mlsd_year (self , entry ):
238
+ try :
239
+ if (self .start .year <= int (entry ['name' ]) <= self .end .year ):
240
+ return True
241
+ else :
242
+ return False
243
+ except TypeError :
244
+ return False
0 commit comments