169
169
170
170
171
171
@Appender (_read_stata_doc )
172
+ @deprecate_kwarg (old_arg_name = 'encoding' , new_arg_name = None )
172
173
@deprecate_kwarg (old_arg_name = 'index' , new_arg_name = 'index_col' )
173
174
def read_stata (filepath_or_buffer , convert_dates = True ,
174
175
convert_categoricals = True , encoding = None , index_col = None ,
@@ -182,7 +183,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
182
183
preserve_dtypes = preserve_dtypes ,
183
184
columns = columns ,
184
185
order_categoricals = order_categoricals ,
185
- chunksize = chunksize , encoding = encoding )
186
+ chunksize = chunksize )
186
187
187
188
if iterator or chunksize :
188
189
data = reader
@@ -838,15 +839,8 @@ def get_base_missing_value(cls, dtype):
838
839
839
840
840
841
class StataParser (object ):
841
- _default_encoding = 'latin-1'
842
842
843
- def __init__ (self , encoding ):
844
- if encoding is not None :
845
- if encoding not in VALID_ENCODINGS :
846
- raise ValueError ('Unknown encoding. Only latin-1 and ascii '
847
- 'supported.' )
848
-
849
- self ._encoding = encoding
843
+ def __init__ (self ):
850
844
851
845
# type code.
852
846
# --------------------
@@ -959,13 +953,14 @@ def __init__(self, encoding):
959
953
class StataReader (StataParser , BaseIterator ):
960
954
__doc__ = _stata_reader_doc
961
955
956
+ @deprecate_kwarg (old_arg_name = 'encoding' , new_arg_name = None )
962
957
@deprecate_kwarg (old_arg_name = 'index' , new_arg_name = 'index_col' )
963
958
def __init__ (self , path_or_buf , convert_dates = True ,
964
959
convert_categoricals = True , index_col = None ,
965
960
convert_missing = False , preserve_dtypes = True ,
966
961
columns = None , order_categoricals = True ,
967
- encoding = 'latin-1' , chunksize = None ):
968
- super (StataReader , self ).__init__ (encoding )
962
+ encoding = None , chunksize = None ):
963
+ super (StataReader , self ).__init__ ()
969
964
self .col_sizes = ()
970
965
971
966
# Arguments to the reader (can be temporarily overridden in
@@ -977,10 +972,6 @@ def __init__(self, path_or_buf, convert_dates=True,
977
972
self ._preserve_dtypes = preserve_dtypes
978
973
self ._columns = columns
979
974
self ._order_categoricals = order_categoricals
980
- if encoding is not None :
981
- if encoding not in VALID_ENCODINGS :
982
- raise ValueError ('Unknown encoding. Only latin-1 and ascii '
983
- 'supported.' )
984
975
self ._encoding = encoding
985
976
self ._chunksize = chunksize
986
977
@@ -998,18 +989,13 @@ def __init__(self, path_or_buf, convert_dates=True,
998
989
path_or_buf = _stringify_path (path_or_buf )
999
990
if isinstance (path_or_buf , str ):
1000
991
path_or_buf , encoding , _ , should_close = get_filepath_or_buffer (
1001
- path_or_buf , encoding = self ._default_encoding
1002
- )
992
+ path_or_buf )
1003
993
1004
994
if isinstance (path_or_buf , (str , text_type , bytes )):
1005
995
self .path_or_buf = open (path_or_buf , 'rb' )
1006
996
else :
1007
997
# Copy to BytesIO, and ensure no encoding
1008
998
contents = path_or_buf .read ()
1009
- try :
1010
- contents = contents .encode (self ._default_encoding )
1011
- except :
1012
- pass
1013
999
self .path_or_buf = BytesIO (contents )
1014
1000
1015
1001
self ._read_header ()
@@ -1030,6 +1016,15 @@ def close(self):
1030
1016
except IOError :
1031
1017
pass
1032
1018
1019
+ def _set_encoding (self ):
1020
+ """
1021
+ Set string encoding which depends on file version
1022
+ """
1023
+ if self .format_version < 118 :
1024
+ self ._encoding = 'latin-1'
1025
+ else :
1026
+ self ._encoding = 'utf-8'
1027
+
1033
1028
def _read_header (self ):
1034
1029
first_char = self .path_or_buf .read (1 )
1035
1030
if struct .unpack ('c' , first_char )[0 ] == b'<' :
@@ -1049,6 +1044,7 @@ def _read_new_header(self, first_char):
1049
1044
self .format_version = int (self .path_or_buf .read (3 ))
1050
1045
if self .format_version not in [117 , 118 ]:
1051
1046
raise ValueError (_version_error )
1047
+ self ._set_encoding ()
1052
1048
self .path_or_buf .read (21 ) # </release><byteorder>
1053
1049
self .byteorder = self .path_or_buf .read (3 ) == b'MSF' and '>' or '<'
1054
1050
self .path_or_buf .read (15 ) # </byteorder><K>
@@ -1235,6 +1231,7 @@ def _read_old_header(self, first_char):
1235
1231
self .format_version = struct .unpack ('b' , first_char )[0 ]
1236
1232
if self .format_version not in [104 , 105 , 108 , 111 , 113 , 114 , 115 ]:
1237
1233
raise ValueError (_version_error )
1234
+ self ._set_encoding ()
1238
1235
self .byteorder = struct .unpack ('b' , self .path_or_buf .read (1 ))[
1239
1236
0 ] == 0x1 and '>' or '<'
1240
1237
self .filetype = struct .unpack ('b' , self .path_or_buf .read (1 ))[0 ]
@@ -1338,16 +1335,9 @@ def _decode(self, s):
1338
1335
return s .decode ('utf-8' )
1339
1336
1340
1337
def _null_terminate (self , s ):
1341
- if compat .PY3 or self ._encoding is not None :
1342
- # have bytes not strings, so must decode
1343
- s = s .partition (b"\0 " )[0 ]
1344
- return s .decode (self ._encoding or self ._default_encoding )
1345
- else :
1346
- null_byte = "\0 "
1347
- try :
1348
- return s .lstrip (null_byte )[:s .index (null_byte )]
1349
- except :
1350
- return s
1338
+ # have bytes not strings, so must decode
1339
+ s = s .partition (b"\0 " )[0 ]
1340
+ return s .decode (self ._encoding )
1351
1341
1352
1342
def _read_value_labels (self ):
1353
1343
if self ._value_labels_read :
@@ -1433,10 +1423,7 @@ def _read_strls(self):
1433
1423
self .path_or_buf .read (4 ))[0 ]
1434
1424
va = self .path_or_buf .read (length )
1435
1425
if typ == 130 :
1436
- encoding = 'utf-8'
1437
- if self .format_version == 117 :
1438
- encoding = self ._encoding or self ._default_encoding
1439
- va = va [0 :- 1 ].decode (encoding )
1426
+ va = va [0 :- 1 ].decode (self ._encoding )
1440
1427
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
1441
1428
self .GSO [str (v_o )] = va
1442
1429
@@ -1980,9 +1967,14 @@ class StataWriter(StataParser):
1980
1967
def __init__ (self , fname , data , convert_dates = None , write_index = True ,
1981
1968
encoding = "latin-1" , byteorder = None , time_stamp = None ,
1982
1969
data_label = None , variable_labels = None ):
1983
- super (StataWriter , self ).__init__ (encoding )
1970
+ super (StataWriter , self ).__init__ ()
1984
1971
self ._convert_dates = {} if convert_dates is None else convert_dates
1985
1972
self ._write_index = write_index
1973
+ if encoding is not None :
1974
+ if encoding not in VALID_ENCODINGS :
1975
+ raise ValueError ('Unknown encoding. Only latin-1 and ascii '
1976
+ 'supported.' )
1977
+ self ._encoding = encoding
1986
1978
self ._time_stamp = time_stamp
1987
1979
self ._data_label = data_label
1988
1980
self ._variable_labels = variable_labels
0 commit comments