33
33
from pandas ._libs .lib import max_len_string_array , infer_dtype
34
34
from pandas ._libs .tslib import NaT , Timestamp
35
35
36
+ VALID_ENCODINGS = ('ascii' , 'us-ascii' , 'latin-1' , 'latin_1' , 'iso-8859-1' ,
37
+ 'iso8859-1' , '8859' , 'cp819' , 'latin' , 'latin1' , 'L1' )
38
+
36
39
_version_error = ("Version of given Stata file is not 104, 105, 108, "
37
40
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
38
41
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" )
45
48
46
49
_encoding_params = """\
47
50
encoding : string, None or encoding
48
- Encoding used to parse the files. None defaults to iso-8859-1."""
51
+ Encoding used to parse the files. None defaults to latin-1."""
49
52
50
53
_statafile_processing_params2 = """\
51
54
index : identifier of index column
153
156
154
157
@Appender (_read_stata_doc )
155
158
def read_stata (filepath_or_buffer , convert_dates = True ,
156
- convert_categoricals = True , encoding = None , index = None ,
159
+ convert_categoricals = True , encoding = 'latin-1' , index = None ,
157
160
convert_missing = False , preserve_dtypes = True , columns = None ,
158
161
order_categoricals = True , chunksize = None , iterator = False ):
159
162
@@ -816,9 +819,14 @@ def get_base_missing_value(cls, dtype):
816
819
817
820
818
821
class StataParser (object ):
819
- _default_encoding = 'iso-8859-1'
822
+ _default_encoding = 'latin-1'
823
+
824
+ def __init__ (self , encoding = 'latin-1' ):
825
+
826
+ if encoding not in VALID_ENCODINGS :
827
+ raise ValueError ('Unknown encoding. Only latin-1 and ascii '
828
+ 'supported.' )
820
829
821
- def __init__ (self , encoding ):
822
830
self ._encoding = encoding
823
831
824
832
# type code.
@@ -936,7 +944,7 @@ def __init__(self, path_or_buf, convert_dates=True,
936
944
convert_categoricals = True , index = None ,
937
945
convert_missing = False , preserve_dtypes = True ,
938
946
columns = None , order_categoricals = True ,
939
- encoding = 'iso-8859-1' , chunksize = None ):
947
+ encoding = 'latin-1' , chunksize = None ):
940
948
super (StataReader , self ).__init__ (encoding )
941
949
self .col_sizes = ()
942
950
@@ -949,6 +957,9 @@ def __init__(self, path_or_buf, convert_dates=True,
949
957
self ._preserve_dtypes = preserve_dtypes
950
958
self ._columns = columns
951
959
self ._order_categoricals = order_categoricals
960
+ if encoding not in VALID_ENCODINGS :
961
+ raise ValueError ('Unknown encoding. Only latin-1 and ascii '
962
+ 'supported.' )
952
963
self ._encoding = encoding
953
964
self ._chunksize = chunksize
954
965
@@ -1855,7 +1866,7 @@ class StataWriter(StataParser):
1855
1866
write_index : bool
1856
1867
Write the index to Stata dataset.
1857
1868
encoding : str
1858
- Default is latin-1. Unicode is not supported
1869
+ Default is latin-1. Only latin-1 and ascii are supported.
1859
1870
byteorder : str
1860
1871
Can be ">", "<", "little", or "big". default is `sys.byteorder`
1861
1872
time_stamp : datetime
0 commit comments