Skip to content

Commit 2aff757

Browse files
committed
ENH: Allow poorly formatted Stata files to be read
Add a fallback decode path that allows improperly formatted Stata files, written in 118 format but using latin-1 encoded strings, to be read. Closes pandas-dev#25960.
1 parent 4814a28 commit 2aff757

File tree

4 files changed

+19
-2
lines changed

4 files changed

+19
-2
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,7 @@ I/O
355355
- Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
356356
- Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`)
357357
- Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
358+
- Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`)
358359

359360
Plotting
360361
^^^^^^^^

pandas/io/stata.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -1327,12 +1327,20 @@ def _calcsize(self, fmt):
13271327

13281328
def _decode(self, s):
13291329
s = s.partition(b"\0")[0]
1330-
return s.decode('utf-8')
1330+
try:
1331+
return s.decode('utf-8')
1332+
except UnicodeDecodeError:
1333+
# GH 25960
1334+
return s.decode('latin-1')
13311335

13321336
def _null_terminate(self, s):
13331337
# have bytes not strings, so must decode
13341338
s = s.partition(b"\0")[0]
1335-
return s.decode(self._encoding)
1339+
try:
1340+
return s.decode(self._encoding)
1341+
except UnicodeDecodeError:
1342+
# GH 25960
1343+
return s.decode('latin-1')
13361344

13371345
def _read_value_labels(self):
13381346
if self._value_labels_read:
5.46 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+8
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ def setup_method(self, datapath):
6666
self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta')
6767

6868
self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
69+
self.dta_encoding_118 = os.path.join(self.dirpath,
70+
'stata1_encoding_118.dta')
6971

7072
self.csv14 = os.path.join(self.dirpath, 'stata5.csv')
7173
self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
@@ -1608,3 +1610,9 @@ def test_strl_latin1(self):
16081610
val = gso.split(b'\x00')[-2]
16091611
size = gso[gso.find(b'\x82') + 1]
16101612
assert len(val) == size - 1
1613+
1614+
def test_encoding_latin1_118(self):
1615+
# GH 25960
1616+
encoded = read_stata(self.dta_encoding_118)
1617+
expected = pd.DataFrame([['Düsseldorf']] * 151, columns=['kreis1849'])
1618+
tm.assert_frame_equal(encoded, expected)

0 commit comments

Comments
 (0)