From 2aff7575a377e59763f9b2c276d5cc0684ced41c Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Tue, 2 Apr 2019 23:47:41 +0100 Subject: [PATCH 1/2] ENH: Allow poorly formatted stata files to be read Add a fallback decode path that allows improperly formatted Stata files written in 118 format but using latin-1 encoded strings to be read closes #25960 --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/stata.py | 12 ++++++++++-- pandas/tests/io/data/stata1_encoding_118.dta | Bin 0 -> 5587 bytes pandas/tests/io/test_stata.py | 8 ++++++++ 4 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/io/data/stata1_encoding_118.dta diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1ef05ae5f9c6b..8dabaeb6c7bfe 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -355,6 +355,7 @@ I/O - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`) - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`) - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`) +- Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`) Plotting ^^^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 628f77856942d..476b876cbbe8e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1327,12 +1327,20 @@ def _calcsize(self, fmt): def _decode(self, s): s = s.partition(b"\0")[0] - return s.decode('utf-8') + try: + return s.decode('utf-8') + except UnicodeDecodeError: + # GH 25960 + return s.decode('latin-1') def _null_terminate(self, s): # have bytes not strings, so must decode s = s.partition(b"\0")[0] - return s.decode(self._encoding) + try: + return s.decode(self._encoding) + except 
UnicodeDecodeError: + # GH 25960 + return s.decode('latin-1') def _read_value_labels(self): if self._value_labels_read: diff --git a/pandas/tests/io/data/stata1_encoding_118.dta b/pandas/tests/io/data/stata1_encoding_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..475f172feff8757337f408f75454add10ed276c9 GIT binary patch literal 5587 zcmeHLQEOa96h5ngwyCnHEmWT_QlCn4vs=NW45Kz{tfa=8RR|QiOm-*Dr8m3Fo!O9m zN=pjOldk~-eQcpGHV=h9_Dvc@MSp-w#0o7`iltck&~xU_*I=j~y+Fu&WgzyRx8~xIx-3 zdwesO4ur>_Jl^&A-b<HM$36xR=?c+KOt9)I>ILON9}sw7ggv98#5Qycj@ zUZYP$CUu2ZlIIc~nZoG1V_+)PMyqbq6Q19%^DXs7m_}mb$XG|52s=|T*GLwWHS(Rp zsCg#WM95p<)#H|~xaV3)&wZvH|IbClLBCVor|4WY@>J3?CQ|iaoE}p)hHh09TIN_txDUs*!C+H5lPG za|B1wfEXgEgMt9QgkdOS51@oH0~Qj*Dq;ZRXxXAOY_b^BMLxlcP|qFTfa_Z~@^_Fd zma!gCPBLoukFNOIca0pd5OHuJyI?Xf-fcMu^mLagjT`Hr(< zdsz7xy`;{ptRpKdqZ`qfSaFC$V&zfKyq^`eBixpiwpIyNQ_so3Zs4TyMLD8K0laew zeW%WxOd}^OBTi^coH)cGaq@T1yq^=cBixpgLYI(3yM&OMjxm4n!ZYYNbtdHD=bIfF z5kfj5#32rekiEZm=lz7R9pScw94QQQ<~vN&zY{gPqP}3%G*w-@sNq_|Bwa<4SVknd zdVwT4^d!BQ^#xh6_+{i?KU4>(Gb^W&6_$1C3$jBpYX7M(sG|_!s>^nS96A)D|Ejt@ z^#zMe-@B=XpS!a3#26jd8p#|?90rZ=`Am|rkwrq51z8qkS&(HxmIYZBWLdCNSs+69 Ykt&Q$V*EEH^w{(77D@Z2-T!f)0%MyuzyJUM literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 420ccfc885ef0..bdc456d52c5c6 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -66,6 +66,8 @@ def setup_method(self, datapath): self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta') self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta') + self.dta_encoding_118 = os.path.join(self.dirpath, + 'stata1_encoding_118.dta') self.csv14 = os.path.join(self.dirpath, 'stata5.csv') self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') @@ -1608,3 +1610,9 @@ def test_strl_latin1(self): val = gso.split(b'\x00')[-2] size = gso[gso.find(b'\x82') + 1] assert len(val) == size - 1 + + def test_encoding_latin1_118(self): + # GH 25960 + encoded = 
read_stata(self.dta_encoding_118) + expected = pd.DataFrame([['Düsseldorf']] * 151, columns=['kreis1849']) + tm.assert_frame_equal(encoded, expected) From 27a173d0b6caf5a713a7a8f5479dac71302b8294 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 3 Apr 2019 08:04:19 +0100 Subject: [PATCH 2/2] MAINT: Refactor decode Refactor decode and null terminate to use file encoding --- pandas/io/stata.py | 53 ++++++++++++++++------------------- pandas/tests/io/test_stata.py | 11 +++++++- 2 files changed, 34 insertions(+), 30 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 476b876cbbe8e..99a50cf1a518a 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1136,7 +1136,7 @@ def _get_varlist(self): elif self.format_version == 118: b = 129 - return [self._null_terminate(self.path_or_buf.read(b)) + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] # Returns the format list @@ -1150,7 +1150,7 @@ def _get_fmtlist(self): else: b = 7 - return [self._null_terminate(self.path_or_buf.read(b)) + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] # Returns the label list @@ -1161,7 +1161,7 @@ def _get_lbllist(self): b = 33 else: b = 9 - return [self._null_terminate(self.path_or_buf.read(b)) + return [self._decode(self.path_or_buf.read(b)) for i in range(self.nvar)] def _get_variable_labels(self): @@ -1169,10 +1169,10 @@ def _get_variable_labels(self): vlblist = [self._decode(self.path_or_buf.read(321)) for i in range(self.nvar)] elif self.format_version > 105: - vlblist = [self._null_terminate(self.path_or_buf.read(81)) + vlblist = [self._decode(self.path_or_buf.read(81)) for i in range(self.nvar)] else: - vlblist = [self._null_terminate(self.path_or_buf.read(32)) + vlblist = [self._decode(self.path_or_buf.read(32)) for i in range(self.nvar)] return vlblist @@ -1191,11 +1191,11 @@ def _get_data_label(self): return self._decode(self.path_or_buf.read(strlen)) elif self.format_version == 117: strlen = 
struct.unpack('b', self.path_or_buf.read(1))[0] - return self._null_terminate(self.path_or_buf.read(strlen)) + return self._decode(self.path_or_buf.read(strlen)) elif self.format_version > 105: - return self._null_terminate(self.path_or_buf.read(81)) + return self._decode(self.path_or_buf.read(81)) else: - return self._null_terminate(self.path_or_buf.read(32)) + return self._decode(self.path_or_buf.read(32)) def _get_time_stamp(self): if self.format_version == 118: @@ -1203,9 +1203,9 @@ def _get_time_stamp(self): return self.path_or_buf.read(strlen).decode("utf-8") elif self.format_version == 117: strlen = struct.unpack('b', self.path_or_buf.read(1))[0] - return self._null_terminate(self.path_or_buf.read(strlen)) + return self._decode(self.path_or_buf.read(strlen)) elif self.format_version > 104: - return self._null_terminate(self.path_or_buf.read(18)) + return self._decode(self.path_or_buf.read(18)) else: raise ValueError() @@ -1266,10 +1266,10 @@ def _read_old_header(self, first_char): .format(','.join(str(x) for x in typlist))) if self.format_version > 108: - self.varlist = [self._null_terminate(self.path_or_buf.read(33)) + self.varlist = [self._decode(self.path_or_buf.read(33)) for i in range(self.nvar)] else: - self.varlist = [self._null_terminate(self.path_or_buf.read(9)) + self.varlist = [self._decode(self.path_or_buf.read(9)) for i in range(self.nvar)] self.srtlist = struct.unpack( self.byteorder + ('h' * (self.nvar + 1)), @@ -1326,20 +1326,19 @@ def _calcsize(self, fmt): struct.calcsize(self.byteorder + fmt)) def _decode(self, s): - s = s.partition(b"\0")[0] - try: - return s.decode('utf-8') - except UnicodeDecodeError: - # GH 25960 - return s.decode('latin-1') - - def _null_terminate(self, s): # have bytes not strings, so must decode s = s.partition(b"\0")[0] try: return s.decode(self._encoding) except UnicodeDecodeError: - # GH 25960 + # GH 25960, fallback to handle incorrect format produced when 117 + # files are converted to 118 files in Stata + msg = 
""" +One or more strings in the dta file could not be decoded using {encoding}, and +so the fallback encoding of latin-1 is being used. This can happen when a file +has been incorrectly encoded by Stata or some other software. You should verify +the string values returned are correct.""" + warnings.warn(msg.format(encoding=self._encoding), UnicodeWarning) return s.decode('latin-1') def _read_value_labels(self): @@ -1370,7 +1369,7 @@ def _read_value_labels(self): if not slength: break # end of value label table (format < 117) if self.format_version <= 117: - labname = self._null_terminate(self.path_or_buf.read(33)) + labname = self._decode(self.path_or_buf.read(33)) else: labname = self._decode(self.path_or_buf.read(129)) self.path_or_buf.read(3) # padding @@ -1392,12 +1391,8 @@ def _read_value_labels(self): self.value_label_dict[labname] = dict() for i in range(n): end = off[i + 1] if i < n - 1 else txtlen - if self.format_version <= 117: - self.value_label_dict[labname][val[i]] = ( - self._null_terminate(txt[off[i]:end])) - else: - self.value_label_dict[labname][val[i]] = ( - self._decode(txt[off[i]:end])) + self.value_label_dict[labname][val[i]] = \ + self._decode(txt[off[i]:end]) if self.format_version >= 117: self.path_or_buf.read(6) # self._value_labels_read = True @@ -1552,7 +1547,7 @@ def read(self, nrows=None, convert_dates=None, for col, typ in zip(data, self.typlist): if type(typ) is int: data[col] = data[col].apply( - self._null_terminate, convert_dtype=True) + self._decode, convert_dtype=True) data = self._insert_strls(data) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index bdc456d52c5c6..21cb3e597ca2a 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1613,6 +1613,15 @@ def test_strl_latin1(self): def test_encoding_latin1_118(self): # GH 25960 - encoded = read_stata(self.dta_encoding_118) + msg = """ +One or more strings in the dta file could not be decoded using utf-8, and +so the fallback 
encoding of latin-1 is being used. This can happen when a file +has been incorrectly encoded by Stata or some other software. You should verify +the string values returned are correct.""" + with tm.assert_produces_warning(UnicodeWarning) as w: + encoded = read_stata(self.dta_encoding_118) + assert len(w) == 151 + assert w[0].message.args[0] == msg + expected = pd.DataFrame([['Düsseldorf']] * 151, columns=['kreis1849']) tm.assert_frame_equal(encoded, expected)