ENH: Allow poorly formatted stata files to be read (#25967)

bashtage · jreback · commit 435e2b58c827 · 2019-04-04T08:51:41.000-04:00
* ENH: Allow poorly formatted stata files to be read. Add a fallback decode path that allows improperly formatted Stata files, written in 118 format but using latin-1 encoded strings, to be read. Closes #25960. * MAINT: Refactor decode. Refactor decode and null terminate to use the file encoding.
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -355,6 +355,7 @@ I/O
 - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
 - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`)
 - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
+- Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -1137,7 +1137,7 @@ def _get_varlist(self):
         elif self.format_version == 118:
             b = 129
 
-        return [self._null_terminate(self.path_or_buf.read(b))
+        return [self._decode(self.path_or_buf.read(b))
                 for i in range(self.nvar)]
 
     # Returns the format list
@@ -1151,7 +1151,7 @@ def _get_fmtlist(self):
         else:
             b = 7
 
-        return [self._null_terminate(self.path_or_buf.read(b))
+        return [self._decode(self.path_or_buf.read(b))
                 for i in range(self.nvar)]
 
     # Returns the label list
@@ -1162,18 +1162,18 @@ def _get_lbllist(self):
             b = 33
         else:
             b = 9
-        return [self._null_terminate(self.path_or_buf.read(b))
+        return [self._decode(self.path_or_buf.read(b))
                 for i in range(self.nvar)]
 
     def _get_variable_labels(self):
         if self.format_version == 118:
             vlblist = [self._decode(self.path_or_buf.read(321))
                        for i in range(self.nvar)]
         elif self.format_version > 105:
-            vlblist = [self._null_terminate(self.path_or_buf.read(81))
+            vlblist = [self._decode(self.path_or_buf.read(81))
                        for i in range(self.nvar)]
         else:
-            vlblist = [self._null_terminate(self.path_or_buf.read(32))
+            vlblist = [self._decode(self.path_or_buf.read(32))
                        for i in range(self.nvar)]
         return vlblist
 
@@ -1192,21 +1192,21 @@ def _get_data_label(self):
             return self._decode(self.path_or_buf.read(strlen))
         elif self.format_version == 117:
             strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
-            return self._null_terminate(self.path_or_buf.read(strlen))
+            return self._decode(self.path_or_buf.read(strlen))
         elif self.format_version > 105:
-            return self._null_terminate(self.path_or_buf.read(81))
+            return self._decode(self.path_or_buf.read(81))
         else:
-            return self._null_terminate(self.path_or_buf.read(32))
+            return self._decode(self.path_or_buf.read(32))
 
     def _get_time_stamp(self):
         if self.format_version == 118:
             strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
             return self.path_or_buf.read(strlen).decode("utf-8")
         elif self.format_version == 117:
             strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
-            return self._null_terminate(self.path_or_buf.read(strlen))
+            return self._decode(self.path_or_buf.read(strlen))
         elif self.format_version > 104:
-            return self._null_terminate(self.path_or_buf.read(18))
+            return self._decode(self.path_or_buf.read(18))
         else:
             raise ValueError()
 
@@ -1267,10 +1267,10 @@ def _read_old_header(self, first_char):
                              .format(','.join(str(x) for x in typlist)))
 
         if self.format_version > 108:
-            self.varlist = [self._null_terminate(self.path_or_buf.read(33))
+            self.varlist = [self._decode(self.path_or_buf.read(33))
                             for i in range(self.nvar)]
         else:
-            self.varlist = [self._null_terminate(self.path_or_buf.read(9))
+            self.varlist = [self._decode(self.path_or_buf.read(9))
                             for i in range(self.nvar)]
         self.srtlist = struct.unpack(
             self.byteorder + ('h' * (self.nvar + 1)),
@@ -1327,13 +1327,20 @@ def _calcsize(self, fmt):
                 struct.calcsize(self.byteorder + fmt))
 
     def _decode(self, s):
-        s = s.partition(b"\0")[0]
-        return s.decode('utf-8')
-
-    def _null_terminate(self, s):
         # have bytes not strings, so must decode
         s = s.partition(b"\0")[0]
-        return s.decode(self._encoding)
+        try:
+            return s.decode(self._encoding)
+        except UnicodeDecodeError:
+            # GH 25960, fallback to handle incorrect format produced when 117
+            # files are converted to 118 files in Stata
+            msg = """
+One or more strings in the dta file could not be decoded using {encoding}, and
+so the fallback encoding of latin-1 is being used.  This can happen when a file
+has been incorrectly encoded by Stata or some other software. You should verify
+the string values returned are correct."""
+            warnings.warn(msg.format(encoding=self._encoding), UnicodeWarning)
+            return s.decode('latin-1')
 
     def _read_value_labels(self):
         if self._value_labels_read:
@@ -1363,7 +1370,7 @@ def _read_value_labels(self):
             if not slength:
                 break  # end of value label table (format < 117)
             if self.format_version <= 117:
-                labname = self._null_terminate(self.path_or_buf.read(33))
+                labname = self._decode(self.path_or_buf.read(33))
             else:
                 labname = self._decode(self.path_or_buf.read(129))
             self.path_or_buf.read(3)  # padding
@@ -1385,12 +1392,8 @@ def _read_value_labels(self):
             self.value_label_dict[labname] = dict()
             for i in range(n):
                 end = off[i + 1] if i < n - 1 else txtlen
-                if self.format_version <= 117:
-                    self.value_label_dict[labname][val[i]] = (
-                        self._null_terminate(txt[off[i]:end]))
-                else:
-                    self.value_label_dict[labname][val[i]] = (
-                        self._decode(txt[off[i]:end]))
+                self.value_label_dict[labname][val[i]] = \
+                    self._decode(txt[off[i]:end])
             if self.format_version >= 117:
                 self.path_or_buf.read(6)  # </lbl>
         self._value_labels_read = True
@@ -1545,7 +1548,7 @@ def read(self, nrows=None, convert_dates=None,
         for col, typ in zip(data, self.typlist):
             if type(typ) is int:
                 data[col] = data[col].apply(
-                    self._null_terminate, convert_dtype=True)
+                    self._decode, convert_dtype=True)
 
         data = self._insert_strls(data)
 
diff --git a/pandas/tests/io/data/stata1_encoding_118.dta b/pandas/tests/io/data/stata1_encoding_118.dta
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -66,6 +66,8 @@ def setup_method(self, datapath):
         self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta')
 
         self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
+        self.dta_encoding_118 = os.path.join(self.dirpath,
+                                             'stata1_encoding_118.dta')
 
         self.csv14 = os.path.join(self.dirpath, 'stata5.csv')
         self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
@@ -1608,3 +1610,18 @@ def test_strl_latin1(self):
                     val = gso.split(b'\x00')[-2]
                     size = gso[gso.find(b'\x82') + 1]
                     assert len(val) == size - 1
+
+    def test_encoding_latin1_118(self):
+        # GH 25960
+        msg = """
+One or more strings in the dta file could not be decoded using utf-8, and
+so the fallback encoding of latin-1 is being used.  This can happen when a file
+has been incorrectly encoded by Stata or some other software. You should verify
+the string values returned are correct."""
+        with tm.assert_produces_warning(UnicodeWarning) as w:
+            encoded = read_stata(self.dta_encoding_118)
+            assert len(w) == 151
+            assert w[0].message.args[0] == msg
+
+        expected = pd.DataFrame([['Düsseldorf']] * 151, columns=['kreis1849'])
+        tm.assert_frame_equal(encoded, expected)