Skip to content

Commit 435e2b5

Browse files
bashtage authored and jreback committed
ENH: Allow poorly formatted stata files to be read (#25967)
* ENH: Allow poorly formatted stata files to be read. Add a fallback decode path that allows improperly formatted Stata files, written in 118 format but using latin-1 encoded strings, to be read. Closes #25960.
* MAINT: Refactor decode. Refactor decode and null terminate to use the file encoding.
1 parent caad3b5 commit 435e2b5

File tree

4 files changed

+46
-25
lines changed

4 files changed

+46
-25
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,7 @@ I/O
355355
- Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
356356
- Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`)
357357
- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
358+
- Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`)
358359

359360
Plotting
360361
^^^^^^^^

pandas/io/stata.py

+28-25
Original file line numberDiff line numberDiff line change
@@ -1137,7 +1137,7 @@ def _get_varlist(self):
11371137
elif self.format_version == 118:
11381138
b = 129
11391139

1140-
return [self._null_terminate(self.path_or_buf.read(b))
1140+
return [self._decode(self.path_or_buf.read(b))
11411141
for i in range(self.nvar)]
11421142

11431143
# Returns the format list
@@ -1151,7 +1151,7 @@ def _get_fmtlist(self):
11511151
else:
11521152
b = 7
11531153

1154-
return [self._null_terminate(self.path_or_buf.read(b))
1154+
return [self._decode(self.path_or_buf.read(b))
11551155
for i in range(self.nvar)]
11561156

11571157
# Returns the label list
@@ -1162,18 +1162,18 @@ def _get_lbllist(self):
11621162
b = 33
11631163
else:
11641164
b = 9
1165-
return [self._null_terminate(self.path_or_buf.read(b))
1165+
return [self._decode(self.path_or_buf.read(b))
11661166
for i in range(self.nvar)]
11671167

11681168
def _get_variable_labels(self):
11691169
if self.format_version == 118:
11701170
vlblist = [self._decode(self.path_or_buf.read(321))
11711171
for i in range(self.nvar)]
11721172
elif self.format_version > 105:
1173-
vlblist = [self._null_terminate(self.path_or_buf.read(81))
1173+
vlblist = [self._decode(self.path_or_buf.read(81))
11741174
for i in range(self.nvar)]
11751175
else:
1176-
vlblist = [self._null_terminate(self.path_or_buf.read(32))
1176+
vlblist = [self._decode(self.path_or_buf.read(32))
11771177
for i in range(self.nvar)]
11781178
return vlblist
11791179

@@ -1192,21 +1192,21 @@ def _get_data_label(self):
11921192
return self._decode(self.path_or_buf.read(strlen))
11931193
elif self.format_version == 117:
11941194
strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
1195-
return self._null_terminate(self.path_or_buf.read(strlen))
1195+
return self._decode(self.path_or_buf.read(strlen))
11961196
elif self.format_version > 105:
1197-
return self._null_terminate(self.path_or_buf.read(81))
1197+
return self._decode(self.path_or_buf.read(81))
11981198
else:
1199-
return self._null_terminate(self.path_or_buf.read(32))
1199+
return self._decode(self.path_or_buf.read(32))
12001200

12011201
def _get_time_stamp(self):
12021202
if self.format_version == 118:
12031203
strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
12041204
return self.path_or_buf.read(strlen).decode("utf-8")
12051205
elif self.format_version == 117:
12061206
strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
1207-
return self._null_terminate(self.path_or_buf.read(strlen))
1207+
return self._decode(self.path_or_buf.read(strlen))
12081208
elif self.format_version > 104:
1209-
return self._null_terminate(self.path_or_buf.read(18))
1209+
return self._decode(self.path_or_buf.read(18))
12101210
else:
12111211
raise ValueError()
12121212

@@ -1267,10 +1267,10 @@ def _read_old_header(self, first_char):
12671267
.format(','.join(str(x) for x in typlist)))
12681268

12691269
if self.format_version > 108:
1270-
self.varlist = [self._null_terminate(self.path_or_buf.read(33))
1270+
self.varlist = [self._decode(self.path_or_buf.read(33))
12711271
for i in range(self.nvar)]
12721272
else:
1273-
self.varlist = [self._null_terminate(self.path_or_buf.read(9))
1273+
self.varlist = [self._decode(self.path_or_buf.read(9))
12741274
for i in range(self.nvar)]
12751275
self.srtlist = struct.unpack(
12761276
self.byteorder + ('h' * (self.nvar + 1)),
@@ -1327,13 +1327,20 @@ def _calcsize(self, fmt):
13271327
struct.calcsize(self.byteorder + fmt))
13281328

13291329
def _decode(self, s):
1330-
s = s.partition(b"\0")[0]
1331-
return s.decode('utf-8')
1332-
1333-
def _null_terminate(self, s):
13341330
# have bytes not strings, so must decode
13351331
s = s.partition(b"\0")[0]
1336-
return s.decode(self._encoding)
1332+
try:
1333+
return s.decode(self._encoding)
1334+
except UnicodeDecodeError:
1335+
# GH 25960, fallback to handle incorrect format produced when 117
1336+
# files are converted to 118 files in Stata
1337+
msg = """
1338+
One or more strings in the dta file could not be decoded using {encoding}, and
1339+
so the fallback encoding of latin-1 is being used. This can happen when a file
1340+
has been incorrectly encoded by Stata or some other software. You should verify
1341+
the string values returned are correct."""
1342+
warnings.warn(msg.format(encoding=self._encoding), UnicodeWarning)
1343+
return s.decode('latin-1')
13371344

13381345
def _read_value_labels(self):
13391346
if self._value_labels_read:
@@ -1363,7 +1370,7 @@ def _read_value_labels(self):
13631370
if not slength:
13641371
break # end of value label table (format < 117)
13651372
if self.format_version <= 117:
1366-
labname = self._null_terminate(self.path_or_buf.read(33))
1373+
labname = self._decode(self.path_or_buf.read(33))
13671374
else:
13681375
labname = self._decode(self.path_or_buf.read(129))
13691376
self.path_or_buf.read(3) # padding
@@ -1385,12 +1392,8 @@ def _read_value_labels(self):
13851392
self.value_label_dict[labname] = dict()
13861393
for i in range(n):
13871394
end = off[i + 1] if i < n - 1 else txtlen
1388-
if self.format_version <= 117:
1389-
self.value_label_dict[labname][val[i]] = (
1390-
self._null_terminate(txt[off[i]:end]))
1391-
else:
1392-
self.value_label_dict[labname][val[i]] = (
1393-
self._decode(txt[off[i]:end]))
1395+
self.value_label_dict[labname][val[i]] = \
1396+
self._decode(txt[off[i]:end])
13941397
if self.format_version >= 117:
13951398
self.path_or_buf.read(6) # </lbl>
13961399
self._value_labels_read = True
@@ -1545,7 +1548,7 @@ def read(self, nrows=None, convert_dates=None,
15451548
for col, typ in zip(data, self.typlist):
15461549
if type(typ) is int:
15471550
data[col] = data[col].apply(
1548-
self._null_terminate, convert_dtype=True)
1551+
self._decode, convert_dtype=True)
15491552

15501553
data = self._insert_strls(data)
15511554

5.46 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+17
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ def setup_method(self, datapath):
6666
self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta')
6767

6868
self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
69+
self.dta_encoding_118 = os.path.join(self.dirpath,
70+
'stata1_encoding_118.dta')
6971

7072
self.csv14 = os.path.join(self.dirpath, 'stata5.csv')
7173
self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
@@ -1608,3 +1610,18 @@ def test_strl_latin1(self):
16081610
val = gso.split(b'\x00')[-2]
16091611
size = gso[gso.find(b'\x82') + 1]
16101612
assert len(val) == size - 1
1613+
1614+
def test_encoding_latin1_118(self):
1615+
# GH 25960
1616+
msg = """
1617+
One or more strings in the dta file could not be decoded using utf-8, and
1618+
so the fallback encoding of latin-1 is being used. This can happen when a file
1619+
has been incorrectly encoded by Stata or some other software. You should verify
1620+
the string values returned are correct."""
1621+
with tm.assert_produces_warning(UnicodeWarning) as w:
1622+
encoded = read_stata(self.dta_encoding_118)
1623+
assert len(w) == 151
1624+
assert w[0].message.args[0] == msg
1625+
1626+
expected = pd.DataFrame([['Düsseldorf']] * 151, columns=['kreis1849'])
1627+
tm.assert_frame_equal(encoded, expected)

0 commit comments

Comments
 (0)