@@ -1136,7 +1136,7 @@ def _get_varlist(self):
1136
1136
elif self .format_version == 118 :
1137
1137
b = 129
1138
1138
1139
- return [self ._null_terminate (self .path_or_buf .read (b ))
1139
+ return [self ._decode (self .path_or_buf .read (b ))
1140
1140
for i in range (self .nvar )]
1141
1141
1142
1142
# Returns the format list
@@ -1150,7 +1150,7 @@ def _get_fmtlist(self):
1150
1150
else :
1151
1151
b = 7
1152
1152
1153
- return [self ._null_terminate (self .path_or_buf .read (b ))
1153
+ return [self ._decode (self .path_or_buf .read (b ))
1154
1154
for i in range (self .nvar )]
1155
1155
1156
1156
# Returns the label list
@@ -1161,18 +1161,18 @@ def _get_lbllist(self):
1161
1161
b = 33
1162
1162
else :
1163
1163
b = 9
1164
- return [self ._null_terminate (self .path_or_buf .read (b ))
1164
+ return [self ._decode (self .path_or_buf .read (b ))
1165
1165
for i in range (self .nvar )]
1166
1166
1167
1167
def _get_variable_labels (self ):
1168
1168
if self .format_version == 118 :
1169
1169
vlblist = [self ._decode (self .path_or_buf .read (321 ))
1170
1170
for i in range (self .nvar )]
1171
1171
elif self .format_version > 105 :
1172
- vlblist = [self ._null_terminate (self .path_or_buf .read (81 ))
1172
+ vlblist = [self ._decode (self .path_or_buf .read (81 ))
1173
1173
for i in range (self .nvar )]
1174
1174
else :
1175
- vlblist = [self ._null_terminate (self .path_or_buf .read (32 ))
1175
+ vlblist = [self ._decode (self .path_or_buf .read (32 ))
1176
1176
for i in range (self .nvar )]
1177
1177
return vlblist
1178
1178
@@ -1191,21 +1191,21 @@ def _get_data_label(self):
1191
1191
return self ._decode (self .path_or_buf .read (strlen ))
1192
1192
elif self .format_version == 117 :
1193
1193
strlen = struct .unpack ('b' , self .path_or_buf .read (1 ))[0 ]
1194
- return self ._null_terminate (self .path_or_buf .read (strlen ))
1194
+ return self ._decode (self .path_or_buf .read (strlen ))
1195
1195
elif self .format_version > 105 :
1196
- return self ._null_terminate (self .path_or_buf .read (81 ))
1196
+ return self ._decode (self .path_or_buf .read (81 ))
1197
1197
else :
1198
- return self ._null_terminate (self .path_or_buf .read (32 ))
1198
+ return self ._decode (self .path_or_buf .read (32 ))
1199
1199
1200
1200
def _get_time_stamp (self ):
1201
1201
if self .format_version == 118 :
1202
1202
strlen = struct .unpack ('b' , self .path_or_buf .read (1 ))[0 ]
1203
1203
return self .path_or_buf .read (strlen ).decode ("utf-8" )
1204
1204
elif self .format_version == 117 :
1205
1205
strlen = struct .unpack ('b' , self .path_or_buf .read (1 ))[0 ]
1206
- return self ._null_terminate (self .path_or_buf .read (strlen ))
1206
+ return self ._decode (self .path_or_buf .read (strlen ))
1207
1207
elif self .format_version > 104 :
1208
- return self ._null_terminate (self .path_or_buf .read (18 ))
1208
+ return self ._decode (self .path_or_buf .read (18 ))
1209
1209
else :
1210
1210
raise ValueError ()
1211
1211
@@ -1266,10 +1266,10 @@ def _read_old_header(self, first_char):
1266
1266
.format (',' .join (str (x ) for x in typlist )))
1267
1267
1268
1268
if self .format_version > 108 :
1269
- self .varlist = [self ._null_terminate (self .path_or_buf .read (33 ))
1269
+ self .varlist = [self ._decode (self .path_or_buf .read (33 ))
1270
1270
for i in range (self .nvar )]
1271
1271
else :
1272
- self .varlist = [self ._null_terminate (self .path_or_buf .read (9 ))
1272
+ self .varlist = [self ._decode (self .path_or_buf .read (9 ))
1273
1273
for i in range (self .nvar )]
1274
1274
self .srtlist = struct .unpack (
1275
1275
self .byteorder + ('h' * (self .nvar + 1 )),
@@ -1326,20 +1326,19 @@ def _calcsize(self, fmt):
1326
1326
struct .calcsize (self .byteorder + fmt ))
1327
1327
1328
1328
def _decode (self , s ):
1329
- s = s .partition (b"\0 " )[0 ]
1330
- try :
1331
- return s .decode ('utf-8' )
1332
- except UnicodeDecodeError :
1333
- # GH 25960
1334
- return s .decode ('latin-1' )
1335
-
1336
- def _null_terminate (self , s ):
1337
1329
# have bytes not strings, so must decode
1338
1330
s = s .partition (b"\0 " )[0 ]
1339
1331
try :
1340
1332
return s .decode (self ._encoding )
1341
1333
except UnicodeDecodeError :
1342
- # GH 25960
1334
+ # GH 25960, fallback to handle incorrect format produced when 117
1335
+ # files are converted to 118 files in Stata
1336
+ msg = """
1337
+ One or more strings in the dta file could not be decoded using {encoding}, and
1338
+ so the fallback encoding of latin-1 is being used. This can happen when a file
1339
+ has been incorrectly encoded by Stata or some other software. You should verify
1340
+ the string values returned are correct."""
1341
+ warnings .warn (msg .format (encoding = self ._encoding ), UnicodeWarning )
1343
1342
return s .decode ('latin-1' )
1344
1343
1345
1344
def _read_value_labels (self ):
@@ -1370,7 +1369,7 @@ def _read_value_labels(self):
1370
1369
if not slength :
1371
1370
break # end of value label table (format < 117)
1372
1371
if self .format_version <= 117 :
1373
- labname = self ._null_terminate (self .path_or_buf .read (33 ))
1372
+ labname = self ._decode (self .path_or_buf .read (33 ))
1374
1373
else :
1375
1374
labname = self ._decode (self .path_or_buf .read (129 ))
1376
1375
self .path_or_buf .read (3 ) # padding
@@ -1392,12 +1391,8 @@ def _read_value_labels(self):
1392
1391
self .value_label_dict [labname ] = dict ()
1393
1392
for i in range (n ):
1394
1393
end = off [i + 1 ] if i < n - 1 else txtlen
1395
- if self .format_version <= 117 :
1396
- self .value_label_dict [labname ][val [i ]] = (
1397
- self ._null_terminate (txt [off [i ]:end ]))
1398
- else :
1399
- self .value_label_dict [labname ][val [i ]] = (
1400
- self ._decode (txt [off [i ]:end ]))
1394
+ self .value_label_dict [labname ][val [i ]] = \
1395
+ self ._decode (txt [off [i ]:end ])
1401
1396
if self .format_version >= 117 :
1402
1397
self .path_or_buf .read (6 ) # </lbl>
1403
1398
self ._value_labels_read = True
@@ -1552,7 +1547,7 @@ def read(self, nrows=None, convert_dates=None,
1552
1547
for col , typ in zip (data , self .typlist ):
1553
1548
if type (typ ) is int :
1554
1549
data [col ] = data [col ].apply (
1555
- self ._null_terminate , convert_dtype = True )
1550
+ self ._decode , convert_dtype = True )
1556
1551
1557
1552
data = self ._insert_strls (data )
1558
1553
0 commit comments