13
13
14
14
import sys
15
15
import struct
16
+ from dateutil .relativedelta import relativedelta
16
17
from pandas .core .base import StringMixin
17
18
from pandas .core .frame import DataFrame
18
19
from pandas .core .series import Series
19
20
from pandas .core .categorical import Categorical
20
21
import datetime
21
- from pandas import compat
22
+ from pandas import compat , to_timedelta , to_datetime
22
23
from pandas .compat import lrange , lmap , lzip , text_type , string_types , range , \
23
24
zip
24
- from pandas import isnull
25
25
from pandas .io .common import get_filepath_or_buffer
26
26
from pandas .lib import max_len_string_array , is_string_array
27
- from pandas .tslib import NaT
27
+ from pandas .tslib import NaT , Timestamp
28
28
29
29
def read_stata (filepath_or_buffer , convert_dates = True ,
30
30
convert_categoricals = True , encoding = None , index = None ,
@@ -62,6 +62,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
62
62
_date_formats = ["%tc" , "%tC" , "%td" , "%d" , "%tw" , "%tm" , "%tq" , "%th" , "%ty" ]
63
63
64
64
65
+ stata_epoch = datetime .datetime (1960 , 1 , 1 )
65
66
def _stata_elapsed_date_to_datetime (date , fmt ):
66
67
"""
67
68
Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime
@@ -111,9 +112,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
111
112
#TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly
112
113
if np .isnan (date ):
113
114
return NaT
114
-
115
115
date = int (date )
116
- stata_epoch = datetime .datetime (1960 , 1 , 1 )
117
116
if fmt in ["%tc" , "tc" ]:
118
117
from dateutil .relativedelta import relativedelta
119
118
return stata_epoch + relativedelta (microseconds = date * 1000 )
@@ -148,6 +147,158 @@ def _stata_elapsed_date_to_datetime(date, fmt):
148
147
raise ValueError ("Date fmt %s not understood" % fmt )
149
148
150
149
150
def _stata_elapsed_date_to_datetime_vec(dates, fmt):
    """
    Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime

    Parameters
    ----------
    dates : array-like
        The Stata Internal Format date(s) to convert to datetime according
        to fmt.
    fmt : str
        The format to convert from. Can be tc, td, tw, tm, tq, th, ty,
        with or without a leading '%'.

    Returns
    -------
    converted : Series
        The converted dates.

    Notes
    -----
    datetime/c - tc
        milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
    datetime/C - tC - NOT IMPLEMENTED
        milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
    date - td
        days since 01jan1960 (01jan1960 = 0)
    weekly date - tw
        weeks since 1960w1
        This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
        The datetime value is the start of the week in terms of days in the
        year, not ISO calendar weeks.
    monthly date - tm
        months since 1960m1
    quarterly date - tq
        quarters since 1960q1
    half-yearly date - th
        half-years since 1960h1
    yearly date - ty
        years since 0000

    If you don't have pandas with datetime support, then you can't do
    milliseconds accurately.
    """
    # Bounds within which the vectorized datetime64[ns] machinery is safe;
    # outside them we fall back to slower per-element datetime arithmetic.
    MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
    MAX_DAY_DELTA = (Timestamp.max - stata_epoch).days
    MIN_DAY_DELTA = (Timestamp.min - stata_epoch).days
    MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
    MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000

    def convert_year_month_safe(year, month):
        """
        Convert year and month to datetimes (day fixed at 1), using the
        pandas vectorized version when the date range falls within the range
        supported by pandas.  Otherwise fall back to a slower but more robust
        method using datetime.
        """
        if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
            return to_datetime(100 * year + month, format='%Y%m')
        return Series([datetime.datetime(y, m, 1)
                       for y, m in zip(year, month)])

    def convert_year_days_safe(year, days):
        """
        Convert a year (e.g. 1999) plus days since the start of that year to
        a datetime or datetime64 Series.
        """
        if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
            return to_datetime(year, format='%Y') + to_timedelta(days,
                                                                 unit='d')
        # timedelta (not relativedelta) suffices: only whole days are added.
        return Series([datetime.datetime(y, 1, 1) +
                       datetime.timedelta(days=int(d))
                       for y, d in zip(year, days)])

    def convert_delta_safe(base, deltas, unit):
        """
        Add integer deltas (unit 'd' for days or 'ms' for milliseconds) to a
        base date, using the pandas vectorized version when the deltas fit in
        datetime64[ns], and per-element datetime arithmetic otherwise.
        """
        if unit == 'd':
            if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
                return Series([base + datetime.timedelta(days=int(d))
                               for d in deltas])
        elif unit == 'ms':
            if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
                return Series(
                    [base + datetime.timedelta(microseconds=int(d) * 1000)
                     for d in deltas])
        else:
            raise ValueError('format not understood')
        return to_datetime(base) + to_timedelta(deltas, unit=unit)

    # TODO: If/when pandas supports more than datetime64[ns], this should be
    # improved to use the correct range, e.g. datetime[Y] for yearly
    bad_locs = np.isnan(dates)
    has_bad_values = bool(bad_locs.any())
    if has_bad_values:
        # Swap NaN for a harmless placeholder so the integer cast below is
        # well defined; the placeholder positions are restored to NaT at the
        # end.  Rebind explicitly rather than relying on Series(dates)
        # sharing memory with the input array.
        dates = Series(dates)
        dates[bad_locs] = 1.0
    dates = dates.astype(np.int64)

    if fmt in ["%tc", "tc"]:                 # Delta ms relative to base
        conv_dates = convert_delta_safe(stata_epoch, dates, 'ms')
    elif fmt in ["%tC", "tC"]:
        from warnings import warn

        warn("Encountered %tC format. Leaving in Stata Internal Format.")
        # builtin ``object``: np.object was removed from recent numpy
        conv_dates = Series(dates, dtype=object)
        if has_bad_values:
            conv_dates[bad_locs] = np.nan
        return conv_dates
    elif fmt in ["%td", "td", "%d", "d"]:    # Delta days relative to base
        conv_dates = convert_delta_safe(stata_epoch, dates, 'd')
    elif fmt in ["%tw", "tw"]:  # does not count leap days - 7 days is a week
        year = stata_epoch.year + dates // 52
        days = (dates % 52) * 7
        conv_dates = convert_year_days_safe(year, days)
    elif fmt in ["%tm", "tm"]:               # Delta months relative to base
        year = stata_epoch.year + dates // 12
        month = (dates % 12) + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt in ["%tq", "tq"]:               # Delta quarters relative to base
        year = stata_epoch.year + dates // 4
        month = (dates % 4) * 3 + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt in ["%th", "th"]:               # Delta half-years relative to base
        year = stata_epoch.year + dates // 2
        month = (dates % 2) * 6 + 1
        conv_dates = convert_year_month_safe(year, month)
    elif fmt in ["%ty", "ty"]:               # Years -- not delta
        # TODO: Check about negative years, here, and raise or warn if needed
        year = dates
        month = np.ones_like(dates)
        conv_dates = convert_year_month_safe(year, month)
    else:
        raise ValueError("Date fmt %s not understood" % fmt)

    if has_bad_values:                       # Restore NaT for bad values
        conv_dates[bad_locs] = NaT
    return conv_dates
301
+
151
302
def _datetime_to_stata_elapsed (date , fmt ):
152
303
"""
153
304
Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime
@@ -477,6 +628,14 @@ def __init__(self, encoding):
477
628
'f' : np .float32 (struct .unpack ('<f' , b'\x00 \x00 \x00 \x7f ' )[0 ]),
478
629
'd' : np .float64 (struct .unpack ('<d' , b'\x00 \x00 \x00 \x00 \x00 \x00 \xe0 \x7f ' )[0 ])
479
630
}
631
+ self .NUMPY_TYPE_MAP = \
632
+ {
633
+ 'b' : 'i1' ,
634
+ 'h' : 'i2' ,
635
+ 'l' : 'i4' ,
636
+ 'f' : 'f4' ,
637
+ 'd' : 'f8'
638
+ }
480
639
481
640
# Reserved words cannot be used as variable names
482
641
self .RESERVED_WORDS = ('aggregate' , 'array' , 'boolean' , 'break' ,
@@ -759,15 +918,6 @@ def _calcsize(self, fmt):
759
918
return (type (fmt ) is int and fmt
760
919
or struct .calcsize (self .byteorder + fmt ))
761
920
762
- def _col_size (self , k = None ):
763
- if k is None :
764
- return self .col_sizes
765
- else :
766
- return self .col_sizes [k ]
767
-
768
- def _unpack (self , fmt , byt ):
769
- return struct .unpack (self .byteorder + fmt , byt )[0 ]
770
-
771
921
def _null_terminate (self , s ):
772
922
if compat .PY3 or self ._encoding is not None : # have bytes not strings,
773
923
# so must decode
@@ -784,55 +934,6 @@ def _null_terminate(self, s):
784
934
except :
785
935
return s
786
936
787
- def _next (self ):
788
- typlist = self .typlist
789
- if self .has_string_data :
790
- data = [None ] * self .nvar
791
- for i in range (len (data )):
792
- if type (typlist [i ]) is int :
793
- data [i ] = self ._null_terminate (
794
- self .path_or_buf .read (typlist [i ])
795
- )
796
- else :
797
- data [i ] = self ._unpack (
798
- typlist [i ], self .path_or_buf .read (self ._col_size (i ))
799
- )
800
- return data
801
- else :
802
- return lmap (
803
- lambda i : self ._unpack (typlist [i ],
804
- self .path_or_buf .read (
805
- self ._col_size (i )
806
- )),
807
- range (self .nvar )
808
- )
809
-
810
-
811
- def _dataset (self ):
812
- """
813
- Returns a Python generator object for iterating over the dataset.
814
-
815
-
816
- Parameters
817
- ----------
818
-
819
- Returns
820
- -------
821
- Generator object for iterating over the dataset. Yields each row of
822
- observations as a list by default.
823
-
824
- Notes
825
- -----
826
- If missing_values is True during instantiation of StataReader then
827
- observations with _StataMissingValue(s) are not filtered and should
828
- be handled by your applcation.
829
- """
830
-
831
- self .path_or_buf .seek (self .data_location )
832
-
833
- for i in range (self .nobs ):
834
- yield self ._next ()
835
-
836
937
def _read_value_labels (self ):
837
938
if self .format_version >= 117 :
838
939
self .path_or_buf .seek (self .seek_value_labels )
@@ -932,27 +1033,32 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
932
1033
if self .format_version >= 117 :
933
1034
self ._read_strls ()
934
1035
935
- stata_dta = self ._dataset ()
936
-
937
- data = []
938
- for rownum , line in enumerate (stata_dta ):
939
- # doesn't handle missing value objects, just casts
940
- # None will only work without missing value object.
941
- for i , val in enumerate (line ):
942
- #NOTE: This will only be scalar types because missing strings
943
- # are empty not None in Stata
944
- if val is None :
945
- line [i ] = np .nan
946
- data .append (tuple (line ))
1036
+ # Read data
1037
+ count = self .nobs
1038
+ dtype = [] # Convert struct data types to numpy data type
1039
+ for i , typ in enumerate (self .typlist ):
1040
+ if typ in self .NUMPY_TYPE_MAP :
1041
+ dtype .append (('s' + str (i ), self .NUMPY_TYPE_MAP [typ ]))
1042
+ else :
1043
+ dtype .append (('s' + str (i ), 'S' + str (typ )))
1044
+ dtype = np .dtype (dtype )
1045
+ read_len = count * dtype .itemsize
1046
+ self .path_or_buf .seek (self .data_location )
1047
+ data = np .frombuffer (self .path_or_buf .read (read_len ),dtype = dtype ,count = count )
1048
+ self ._data_read = True
947
1049
948
1050
if convert_categoricals :
949
1051
self ._read_value_labels ()
950
1052
951
- # TODO: Refactor to use a dictionary constructor and the correct dtype from the start?
952
1053
if len (data )== 0 :
953
1054
data = DataFrame (columns = self .varlist , index = index )
954
1055
else :
955
- data = DataFrame (data , columns = self .varlist , index = index )
1056
+ data = DataFrame .from_records (data , index = index )
1057
+ data .columns = self .varlist
1058
+
1059
+ for col , typ in zip (data , self .typlist ):
1060
+ if type (typ ) is int :
1061
+ data [col ] = data [col ].apply (self ._null_terminate , convert_dtype = True ,)
956
1062
957
1063
cols_ = np .where (self .dtyplist )[0 ]
958
1064
@@ -1010,8 +1116,7 @@ def data(self, convert_dates=True, convert_categoricals=True, index=None,
1010
1116
self .fmtlist ))[0 ]
1011
1117
for i in cols :
1012
1118
col = data .columns [i ]
1013
- data [col ] = data [col ].apply (_stata_elapsed_date_to_datetime ,
1014
- args = (self .fmtlist [i ],))
1119
+ data [col ] = _stata_elapsed_date_to_datetime_vec (data [col ], self .fmtlist [i ])
1015
1120
1016
1121
if convert_categoricals :
1017
1122
cols = np .where (
0 commit comments