36
36
from pandas .util ._decorators import Appender
37
37
from pandas .util ._decorators import deprecate_kwarg
38
38
39
- VALID_ENCODINGS = ('ascii' , 'us-ascii' , 'latin-1' , 'latin_1' , 'iso-8859-1' ,
39
+ # Encodings accepted for Stata dta files; the first entry is the preferred one.
40
+ VALID_ENCODINGS = ('latin-1' , 'latin_1' , 'ascii' , 'us-ascii' , 'iso-8859-1' ,
40
41
'iso8859-1' , '8859' , 'cp819' , 'latin' , 'latin1' , 'L1' )
41
42
43
+ VALID_ENCODINGS_118 = ('utf8' , 'utf-8' )
44
+
42
45
_version_error = ("Version of given Stata file is not 104, 105, 108, "
43
46
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
44
47
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" )
169
172
170
173
171
174
@Appender (_read_stata_doc )
175
+ @deprecate_kwarg (old_arg_name = 'encoding' , new_arg_name = None )
172
176
@deprecate_kwarg (old_arg_name = 'index' , new_arg_name = 'index_col' )
173
177
def read_stata (filepath_or_buffer , convert_dates = True ,
174
178
convert_categoricals = True , encoding = None , index_col = None ,
@@ -182,7 +186,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
182
186
preserve_dtypes = preserve_dtypes ,
183
187
columns = columns ,
184
188
order_categoricals = order_categoricals ,
185
- chunksize = chunksize , encoding = encoding )
189
+ chunksize = chunksize )
186
190
187
191
if iterator or chunksize :
188
192
data = reader
@@ -399,16 +403,19 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
399
403
elif infer_dtype (dates ) == 'datetime' :
400
404
if delta :
401
405
delta = dates .values - stata_epoch
402
- f = lambda x : \
403
- US_PER_DAY * x .days + 1000000 * x .seconds + x .microseconds
406
+
407
+ def f (x ):
408
+ return US_PER_DAY * x .days + \
409
+ 1000000 * x .seconds + x .microseconds
404
410
v = np .vectorize (f )
405
411
d ['delta' ] = v (delta )
406
412
if year :
407
413
year_month = dates .apply (lambda x : 100 * x .year + x .month )
408
414
d ['year' ] = year_month .values // 100
409
415
d ['month' ] = (year_month .values - d ['year' ] * 100 )
410
416
if days :
411
- f = lambda x : (x - datetime .datetime (x .year , 1 , 1 )).days
417
+ def f (x ):
418
+ return (x - datetime .datetime (x .year , 1 , 1 )).days
412
419
v = np .vectorize (f )
413
420
d ['days' ] = v (dates )
414
421
else :
@@ -838,7 +845,6 @@ def get_base_missing_value(cls, dtype):
838
845
839
846
840
847
class StataParser (object ):
841
- _default_encoding = 'latin-1'
842
848
843
849
def __init__ (self , encoding ):
844
850
if encoding is not None :
@@ -959,12 +965,13 @@ def __init__(self, encoding):
959
965
class StataReader (StataParser , BaseIterator ):
960
966
__doc__ = _stata_reader_doc
961
967
968
+ @deprecate_kwarg (old_arg_name = 'encoding' , new_arg_name = None )
962
969
@deprecate_kwarg (old_arg_name = 'index' , new_arg_name = 'index_col' )
963
970
def __init__ (self , path_or_buf , convert_dates = True ,
964
971
convert_categoricals = True , index_col = None ,
965
972
convert_missing = False , preserve_dtypes = True ,
966
973
columns = None , order_categoricals = True ,
967
- encoding = 'latin-1' , chunksize = None ):
974
+ encoding = None , chunksize = None ):
968
975
super (StataReader , self ).__init__ (encoding )
969
976
self .col_sizes = ()
970
977
@@ -977,10 +984,6 @@ def __init__(self, path_or_buf, convert_dates=True,
977
984
self ._preserve_dtypes = preserve_dtypes
978
985
self ._columns = columns
979
986
self ._order_categoricals = order_categoricals
980
- if encoding is not None :
981
- if encoding not in VALID_ENCODINGS :
982
- raise ValueError ('Unknown encoding. Only latin-1 and ascii '
983
- 'supported.' )
984
987
self ._encoding = encoding
985
988
self ._chunksize = chunksize
986
989
@@ -998,18 +1001,13 @@ def __init__(self, path_or_buf, convert_dates=True,
998
1001
path_or_buf = _stringify_path (path_or_buf )
999
1002
if isinstance (path_or_buf , str ):
1000
1003
path_or_buf , encoding , _ , should_close = get_filepath_or_buffer (
1001
- path_or_buf , encoding = self ._default_encoding
1002
- )
1004
+ path_or_buf )
1003
1005
1004
1006
if isinstance (path_or_buf , (str , text_type , bytes )):
1005
1007
self .path_or_buf = open (path_or_buf , 'rb' )
1006
1008
else :
1007
1009
# Copy to BytesIO, and ensure no encoding
1008
1010
contents = path_or_buf .read ()
1009
- try :
1010
- contents = contents .encode (self ._default_encoding )
1011
- except :
1012
- pass
1013
1011
self .path_or_buf = BytesIO (contents )
1014
1012
1015
1013
self ._read_header ()
@@ -1030,6 +1028,15 @@ def close(self):
1030
1028
except IOError :
1031
1029
pass
1032
1030
1031
+ def _set_encoding (self ):
1032
+ """
1033
+ Set self._encoding from the dta format version: 'latin-1' below 118, 'utf-8' otherwise
1034
+ """
1035
+ if self .format_version < 118 :
1036
+ self ._encoding = 'latin-1'
1037
+ else :
1038
+ self ._encoding = 'utf-8'
1039
+
1033
1040
def _read_header (self ):
1034
1041
first_char = self .path_or_buf .read (1 )
1035
1042
if struct .unpack ('c' , first_char )[0 ] == b'<' :
@@ -1049,6 +1056,7 @@ def _read_new_header(self, first_char):
1049
1056
self .format_version = int (self .path_or_buf .read (3 ))
1050
1057
if self .format_version not in [117 , 118 ]:
1051
1058
raise ValueError (_version_error )
1059
+ self ._set_encoding ()
1052
1060
self .path_or_buf .read (21 ) # </release><byteorder>
1053
1061
self .byteorder = self .path_or_buf .read (3 ) == b'MSF' and '>' or '<'
1054
1062
self .path_or_buf .read (15 ) # </byteorder><K>
@@ -1235,6 +1243,7 @@ def _read_old_header(self, first_char):
1235
1243
self .format_version = struct .unpack ('b' , first_char )[0 ]
1236
1244
if self .format_version not in [104 , 105 , 108 , 111 , 113 , 114 , 115 ]:
1237
1245
raise ValueError (_version_error )
1246
+ self ._set_encoding ()
1238
1247
self .byteorder = struct .unpack ('b' , self .path_or_buf .read (1 ))[
1239
1248
0 ] == 0x1 and '>' or '<'
1240
1249
self .filetype = struct .unpack ('b' , self .path_or_buf .read (1 ))[0 ]
@@ -1338,16 +1347,9 @@ def _decode(self, s):
1338
1347
return s .decode ('utf-8' )
1339
1348
1340
1349
def _null_terminate (self , s ):
1341
- if compat .PY3 or self ._encoding is not None :
1342
- # have bytes not strings, so must decode
1343
- s = s .partition (b"\0 " )[0 ]
1344
- return s .decode (self ._encoding or self ._default_encoding )
1345
- else :
1346
- null_byte = "\0 "
1347
- try :
1348
- return s .lstrip (null_byte )[:s .index (null_byte )]
1349
- except :
1350
- return s
1350
+ # s is bytes: keep only what precedes the first NUL byte, then decode it with self._encoding
1351
+ s = s .partition (b"\0 " )[0 ]
1352
+ return s .decode (self ._encoding )
1351
1353
1352
1354
def _read_value_labels (self ):
1353
1355
if self ._value_labels_read :
@@ -1433,10 +1435,7 @@ def _read_strls(self):
1433
1435
self .path_or_buf .read (4 ))[0 ]
1434
1436
va = self .path_or_buf .read (length )
1435
1437
if typ == 130 :
1436
- encoding = 'utf-8'
1437
- if self .format_version == 117 :
1438
- encoding = self ._encoding or self ._default_encoding
1439
- va = va [0 :- 1 ].decode (encoding )
1438
+ va = va [0 :- 1 ].decode (self ._encoding )
1440
1439
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
1441
1440
self .GSO [str (v_o )] = va
1442
1441
0 commit comments