@@ -1137,7 +1137,7 @@ def _get_varlist(self):
1137
1137
elif self .format_version == 118 :
1138
1138
b = 129
1139
1139
1140
- return [self ._null_terminate (self .path_or_buf .read (b ))
1140
+ return [self ._decode (self .path_or_buf .read (b ))
1141
1141
for i in range (self .nvar )]
1142
1142
1143
1143
# Returns the format list
@@ -1151,7 +1151,7 @@ def _get_fmtlist(self):
1151
1151
else :
1152
1152
b = 7
1153
1153
1154
- return [self ._null_terminate (self .path_or_buf .read (b ))
1154
+ return [self ._decode (self .path_or_buf .read (b ))
1155
1155
for i in range (self .nvar )]
1156
1156
1157
1157
# Returns the label list
@@ -1162,18 +1162,18 @@ def _get_lbllist(self):
1162
1162
b = 33
1163
1163
else :
1164
1164
b = 9
1165
- return [self ._null_terminate (self .path_or_buf .read (b ))
1165
+ return [self ._decode (self .path_or_buf .read (b ))
1166
1166
for i in range (self .nvar )]
1167
1167
1168
1168
def _get_variable_labels(self):
    """
    Read the variable labels from the current position of the file buffer.

    One fixed-width field is consumed per variable (``self.nvar`` fields
    in total) and each field is passed through ``self._decode``.

    Returns
    -------
    list
        The decoded labels, in the order they were read.
    """
    # Only the width of a label field (in bytes) depends on the format
    # version; reading and decoding are the same for every version.
    if self.format_version == 118:
        width = 321
    elif self.format_version > 105:
        width = 81
    else:
        width = 32

    return [self._decode(self.path_or_buf.read(width))
            for _ in range(self.nvar)]
1179
1179
@@ -1192,21 +1192,21 @@ def _get_data_label(self):
1192
1192
return self ._decode (self .path_or_buf .read (strlen ))
1193
1193
elif self .format_version == 117 :
1194
1194
strlen = struct .unpack ('b' , self .path_or_buf .read (1 ))[0 ]
1195
- return self ._null_terminate (self .path_or_buf .read (strlen ))
1195
+ return self ._decode (self .path_or_buf .read (strlen ))
1196
1196
elif self .format_version > 105 :
1197
- return self ._null_terminate (self .path_or_buf .read (81 ))
1197
+ return self ._decode (self .path_or_buf .read (81 ))
1198
1198
else :
1199
- return self ._null_terminate (self .path_or_buf .read (32 ))
1199
+ return self ._decode (self .path_or_buf .read (32 ))
1200
1200
1201
1201
def _get_time_stamp(self):
    """
    Read the time stamp from the current position of the file buffer.

    Format versions 117 and 118 store a one-byte length followed by that
    many bytes of text; versions above 104 (and below 117) store a fixed
    18-byte field.

    Returns
    -------
    str
        The decoded time stamp.

    Raises
    ------
    ValueError
        If ``self.format_version`` is 104 or lower.
    """
    version = self.format_version
    if version in (117, 118):
        # The length prefix is unpacked as an *unsigned* byte: a length is
        # never negative, and a negative size passed to ``read`` would
        # consume the remainder of the stream instead of a short field.
        strlen = struct.unpack('B', self.path_or_buf.read(1))[0]
        raw = self.path_or_buf.read(strlen)
        if version == 118:
            return raw.decode("utf-8")
        return self._decode(raw)
    elif version > 104:
        return self._decode(self.path_or_buf.read(18))
    raise ValueError(
        "Cannot read a time stamp for format version {version}"
        .format(version=version))
1212
1212
@@ -1267,10 +1267,10 @@ def _read_old_header(self, first_char):
1267
1267
.format (',' .join (str (x ) for x in typlist )))
1268
1268
1269
1269
if self .format_version > 108 :
1270
- self .varlist = [self ._null_terminate (self .path_or_buf .read (33 ))
1270
+ self .varlist = [self ._decode (self .path_or_buf .read (33 ))
1271
1271
for i in range (self .nvar )]
1272
1272
else :
1273
- self .varlist = [self ._null_terminate (self .path_or_buf .read (9 ))
1273
+ self .varlist = [self ._decode (self .path_or_buf .read (9 ))
1274
1274
for i in range (self .nvar )]
1275
1275
self .srtlist = struct .unpack (
1276
1276
self .byteorder + ('h' * (self .nvar + 1 )),
@@ -1327,13 +1327,20 @@ def _calcsize(self, fmt):
1327
1327
struct .calcsize (self .byteorder + fmt ))
1328
1328
1329
1329
def _decode(self, s):
    """
    Convert a NUL-padded byte string read from the file into text.

    Parameters
    ----------
    s : bytes
        Raw field contents; everything from the first NUL byte onwards is
        discarded before decoding.

    Returns
    -------
    str
        ``s`` decoded with ``self._encoding``, or with latin-1 when that
        encoding cannot decode the bytes.

    Warns
    -----
    UnicodeWarning
        Emitted whenever the latin-1 fallback is used.
    """
    # have bytes not strings, so must decode
    raw = s.split(b"\0", 1)[0]
    try:
        return raw.decode(self._encoding)
    except UnicodeDecodeError:
        # GH 25960, fallback to handle incorrect format produced when 117
        # files are converted to 118 files in Stata
        msg = """
One or more strings in the dta file could not be decoded using {encoding}, and
so the fallback encoding of latin-1 is being used. This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct."""
        warnings.warn(msg.format(encoding=self._encoding), UnicodeWarning)
        return raw.decode('latin-1')
1337
1344
1338
1345
def _read_value_labels (self ):
1339
1346
if self ._value_labels_read :
@@ -1363,7 +1370,7 @@ def _read_value_labels(self):
1363
1370
if not slength :
1364
1371
break # end of value label table (format < 117)
1365
1372
if self .format_version <= 117 :
1366
- labname = self ._null_terminate (self .path_or_buf .read (33 ))
1373
+ labname = self ._decode (self .path_or_buf .read (33 ))
1367
1374
else :
1368
1375
labname = self ._decode (self .path_or_buf .read (129 ))
1369
1376
self .path_or_buf .read (3 ) # padding
@@ -1385,12 +1392,8 @@ def _read_value_labels(self):
1385
1392
self .value_label_dict [labname ] = dict ()
1386
1393
for i in range (n ):
1387
1394
end = off [i + 1 ] if i < n - 1 else txtlen
1388
- if self .format_version <= 117 :
1389
- self .value_label_dict [labname ][val [i ]] = (
1390
- self ._null_terminate (txt [off [i ]:end ]))
1391
- else :
1392
- self .value_label_dict [labname ][val [i ]] = (
1393
- self ._decode (txt [off [i ]:end ]))
1395
+ self .value_label_dict [labname ][val [i ]] = \
1396
+ self ._decode (txt [off [i ]:end ])
1394
1397
if self .format_version >= 117 :
1395
1398
self .path_or_buf .read (6 ) # </lbl>
1396
1399
self ._value_labels_read = True
@@ -1545,7 +1548,7 @@ def read(self, nrows=None, convert_dates=None,
1545
1548
for col , typ in zip (data , self .typlist ):
1546
1549
if type (typ ) is int :
1547
1550
data [col ] = data [col ].apply (
1548
- self ._null_terminate , convert_dtype = True )
1551
+ self ._decode , convert_dtype = True )
1549
1552
1550
1553
data = self ._insert_strls (data )
1551
1554
0 commit comments