diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1ef05ae5f9c6b..8dabaeb6c7bfe 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -355,6 +355,7 @@ I/O - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`) - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) +- Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`) Plotting ^^^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 628f77856942d..99a50cf1a518a 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1136,7 +1136,7 @@ def _get_varlist(self): elif self.format_version == 118: b = 129 - return [self._null_terminate(self.path_or_buf.read(b)) + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] # Returns the format list @@ -1150,7 +1150,7 @@ def _get_fmtlist(self): else: b = 7 - return [self._null_terminate(self.path_or_buf.read(b)) + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] # Returns the label list @@ -1161,7 +1161,7 @@ def _get_lbllist(self): b = 33 else: b = 9 - return [self._null_terminate(self.path_or_buf.read(b)) + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] def _get_variable_labels(self): @@ -1169,10 +1169,10 @@ def _get_variable_labels(self): vlblist = [self._decode(self.path_or_buf.read(321)) for i in range(self.nvar)] elif self.format_version > 105: - vlblist = [self._null_terminate(self.path_or_buf.read(81)) + vlblist = [self._decode(self.path_or_buf.read(81)) for i in range(self.nvar)] else: - vlblist = [self._null_terminate(self.path_or_buf.read(32)) + vlblist = [self._decode(self.path_or_buf.read(32)) for i in range(self.nvar)] return vlblist @@ -1191,11 +1191,11 @@ def _get_data_label(self): return self._decode(self.path_or_buf.read(strlen)) elif self.format_version == 117: strlen = struct.unpack('b', self.path_or_buf.read(1))[0] - return self._null_terminate(self.path_or_buf.read(strlen)) + return self._decode(self.path_or_buf.read(strlen)) elif self.format_version > 105: - return self._null_terminate(self.path_or_buf.read(81)) + return self._decode(self.path_or_buf.read(81)) else: - return self._null_terminate(self.path_or_buf.read(32)) + return self._decode(self.path_or_buf.read(32)) def _get_time_stamp(self): if self.format_version == 118: @@ -1203,9 +1203,9 @@ def _get_time_stamp(self): return self.path_or_buf.read(strlen).decode("utf-8") elif self.format_version == 117: strlen = struct.unpack('b', self.path_or_buf.read(1))[0] - return self._null_terminate(self.path_or_buf.read(strlen)) + return self._decode(self.path_or_buf.read(strlen)) elif self.format_version > 104: - return self._null_terminate(self.path_or_buf.read(18)) + return self._decode(self.path_or_buf.read(18)) else: raise ValueError() @@ -1266,10 +1266,10 @@ def _read_old_header(self, first_char): .format(','.join(str(x) for x in typlist))) if self.format_version > 108: - self.varlist = [self._null_terminate(self.path_or_buf.read(33)) + self.varlist = [self._decode(self.path_or_buf.read(33)) for i in range(self.nvar)] else: - self.varlist = [self._null_terminate(self.path_or_buf.read(9)) + self.varlist = [self._decode(self.path_or_buf.read(9)) for i in range(self.nvar)] self.srtlist = struct.unpack( self.byteorder + ('h' * (self.nvar + 1)), @@ -1326,13 +1326,20 @@ def _calcsize(self, fmt): struct.calcsize(self.byteorder + fmt)) def _decode(self, s): - s = s.partition(b"\0")[0] - return s.decode('utf-8') - - def _null_terminate(self, s): # have bytes not strings, so must decode s = s.partition(b"\0")[0] - return s.decode(self._encoding) + try: + return s.decode(self._encoding) + except UnicodeDecodeError: + # GH 25960, fallback to handle incorrect format produced when 117 + # files are converted to 118 files in Stata + msg = """ +One or more strings in the dta file could not be decoded using {encoding}, and +so the fallback encoding of latin-1 is being used. This can happen when a file +has been incorrectly encoded by Stata or some other software. You should verify +the string values returned are correct.""" + warnings.warn(msg.format(encoding=self._encoding), UnicodeWarning) + return s.decode('latin-1') def _read_value_labels(self): if self._value_labels_read: @@ -1362,7 +1369,7 @@ def _read_value_labels(self): if not slength: break # end of value label table (format < 117) if self.format_version <= 117: - labname = self._null_terminate(self.path_or_buf.read(33)) + labname = self._decode(self.path_or_buf.read(33)) else: labname = self._decode(self.path_or_buf.read(129)) self.path_or_buf.read(3) # padding @@ -1384,12 +1391,8 @@ def _read_value_labels(self): self.value_label_dict[labname] = dict() for i in range(n): end = off[i + 1] if i < n - 1 else txtlen - if self.format_version <= 117: - self.value_label_dict[labname][val[i]] = ( - self._null_terminate(txt[off[i]:end])) - else: - self.value_label_dict[labname][val[i]] = ( - self._decode(txt[off[i]:end])) + self.value_label_dict[labname][val[i]] = \ + self._decode(txt[off[i]:end]) if self.format_version >= 117: self.path_or_buf.read(6) # self._value_labels_read = True @@ -1544,7 +1547,7 @@ def read(self, nrows=None, convert_dates=None, for col, typ in zip(data, self.typlist): if type(typ) is int: data[col] = data[col].apply( - self._null_terminate, convert_dtype=True) + self._decode, convert_dtype=True) data = self._insert_strls(data) diff --git a/pandas/tests/io/data/stata1_encoding_118.dta b/pandas/tests/io/data/stata1_encoding_118.dta new file mode 100644 index 0000000000000..475f172feff87 Binary files /dev/null and b/pandas/tests/io/data/stata1_encoding_118.dta differ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 420ccfc885ef0..21cb3e597ca2a 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -66,6 +66,8 @@ def setup_method(self, datapath): self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta') self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta') + self.dta_encoding_118 = os.path.join(self.dirpath, + 'stata1_encoding_118.dta') self.csv14 = os.path.join(self.dirpath, 'stata5.csv') self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') @@ -1608,3 +1610,18 @@ def test_strl_latin1(self): val = gso.split(b'\x00')[-2] size = gso[gso.find(b'\x82') + 1] assert len(val) == size - 1 + + def test_encoding_latin1_118(self): + # GH 25960 + msg = """ +One or more strings in the dta file could not be decoded using utf-8, and +so the fallback encoding of latin-1 is being used. This can happen when a file +has been incorrectly encoded by Stata or some other software. You should verify +the string values returned are correct.""" + with tm.assert_produces_warning(UnicodeWarning) as w: + encoded = read_stata(self.dta_encoding_118) + assert len(w) == 151 + assert w[0].message.args[0] == msg + + expected = pd.DataFrame([['Düsseldorf']] * 151, columns=['kreis1849']) + tm.assert_frame_equal(encoded, expected)