33
33
from pandas ._libs .lib import max_len_string_array , infer_dtype
34
34
from pandas ._libs .tslib import NaT , Timestamp
35
35
36
+ VALID_ENCODINGS = ('ascii' , 'us-ascii' , 'latin-1' , 'latin_1' , 'iso-8859-1' ,
37
+ 'iso8859-1' , '8859' , 'cp819' , 'latin' , 'latin1' , 'L1' )
38
+
36
39
_version_error = ("Version of given Stata file is not 104, 105, 108, "
37
40
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
38
41
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" )
45
48
46
49
_encoding_params = """\
47
50
encoding : string, None or encoding
48
- Encoding used to parse the files. None defaults to iso-8859-1."""
51
+ Encoding used to parse the files. None defaults to latin-1."""
49
52
50
53
_statafile_processing_params2 = """\
51
54
index : identifier of index column
@@ -816,9 +819,14 @@ def get_base_missing_value(cls, dtype):
816
819
817
820
818
821
class StataParser (object ):
819
- _default_encoding = 'iso-8859-1'
822
+ _default_encoding = 'latin-1'
820
823
821
824
def __init__ (self , encoding ):
825
+ if encoding is not None :
826
+ if encoding not in VALID_ENCODINGS :
827
+ raise ValueError ('Unknown encoding. Only latin-1 and ascii '
828
+ 'supported.' )
829
+
822
830
self ._encoding = encoding
823
831
824
832
# type code.
@@ -936,7 +944,7 @@ def __init__(self, path_or_buf, convert_dates=True,
936
944
convert_categoricals = True , index = None ,
937
945
convert_missing = False , preserve_dtypes = True ,
938
946
columns = None , order_categoricals = True ,
939
- encoding = 'iso-8859-1' , chunksize = None ):
947
+ encoding = 'latin-1' , chunksize = None ):
940
948
super (StataReader , self ).__init__ (encoding )
941
949
self .col_sizes = ()
942
950
@@ -949,6 +957,10 @@ def __init__(self, path_or_buf, convert_dates=True,
949
957
self ._preserve_dtypes = preserve_dtypes
950
958
self ._columns = columns
951
959
self ._order_categoricals = order_categoricals
960
+ if encoding is not None :
961
+ if encoding not in VALID_ENCODINGS :
962
+ raise ValueError ('Unknown encoding. Only latin-1 and ascii '
963
+ 'supported.' )
952
964
self ._encoding = encoding
953
965
self ._chunksize = chunksize
954
966
@@ -1362,7 +1374,8 @@ def _read_value_labels(self):
1362
1374
1363
1375
def _read_strls (self ):
1364
1376
self .path_or_buf .seek (self .seek_strls )
1365
- self .GSO = {0 : '' }
1377
+ # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
1378
+ self .GSO = {'0' : '' }
1366
1379
while True :
1367
1380
if self .path_or_buf .read (3 ) != b'GSO' :
1368
1381
break
@@ -1387,7 +1400,8 @@ def _read_strls(self):
1387
1400
if self .format_version == 117 :
1388
1401
encoding = self ._encoding or self ._default_encoding
1389
1402
va = va [0 :- 1 ].decode (encoding )
1390
- self .GSO [v_o ] = va
1403
+ # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
1404
+ self .GSO [str (v_o )] = va
1391
1405
1392
1406
# legacy
1393
1407
@Appender ('DEPRECATED: ' + _data_method_doc )
@@ -1623,7 +1637,8 @@ def _insert_strls(self, data):
1623
1637
for i , typ in enumerate (self .typlist ):
1624
1638
if typ != 'Q' :
1625
1639
continue
1626
- data .iloc [:, i ] = [self .GSO [k ] for k in data .iloc [:, i ]]
1640
+ # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
1641
+ data .iloc [:, i ] = [self .GSO [str (k )] for k in data .iloc [:, i ]]
1627
1642
return data
1628
1643
1629
1644
def _do_select_columns (self , data , columns ):
@@ -1855,7 +1870,7 @@ class StataWriter(StataParser):
1855
1870
write_index : bool
1856
1871
Write the index to Stata dataset.
1857
1872
encoding : str
1858
- Default is latin-1. Unicode is not supported
1873
+ Default is latin-1. Only latin-1 and ascii are supported.
1859
1874
byteorder : str
1860
1875
Can be ">", "<", "little", or "big". default is `sys.byteorder`
1861
1876
time_stamp : datetime
0 commit comments