From ac8c2c2e76dd59acffa7083c8e78d043ab697134 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 31 May 2018 11:59:53 +0200 Subject: [PATCH] BUG: Fix encoding for Stata format 118 format files Ensure that Stata 118 files always use utf-8 encoding --- doc/source/whatsnew/v0.23.1.txt | 1 + pandas/io/stata.py | 62 +++++++++++---------------- pandas/tests/io/data/stata16_118.dta | Bin 0 -> 4614 bytes pandas/tests/io/test_stata.py | 29 +++++++++---- 4 files changed, 47 insertions(+), 45 deletions(-) create mode 100644 pandas/tests/io/data/stata16_118.dta diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 69b07d12c1e98..bddb9d3b8e2a7 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -95,6 +95,7 @@ I/O - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) - Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) +- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) - Plotting diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2797924985c70..8584e1f0e3f14 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -182,7 +182,7 @@ def read_stata(filepath_or_buffer, convert_dates=True, preserve_dtypes=preserve_dtypes, columns=columns, order_categoricals=order_categoricals, - chunksize=chunksize, encoding=encoding) + chunksize=chunksize) if iterator or chunksize: data = reader @@ -838,15 +838,8 @@ def get_base_missing_value(cls, dtype): class StataParser(object): - _default_encoding = 'latin-1' - def __init__(self, encoding): - if encoding is not None: - if encoding not in VALID_ENCODINGS: - raise ValueError('Unknown encoding. Only latin-1 and ascii ' - 'supported.') - - self._encoding = encoding + def __init__(self): # type code. # -------------------- @@ -964,8 +957,8 @@ def __init__(self, path_or_buf, convert_dates=True, convert_categoricals=True, index_col=None, convert_missing=False, preserve_dtypes=True, columns=None, order_categoricals=True, - encoding='latin-1', chunksize=None): - super(StataReader, self).__init__(encoding) + encoding=None, chunksize=None): + super(StataReader, self).__init__() self.col_sizes = () # Arguments to the reader (can be temporarily overridden in @@ -977,10 +970,6 @@ def __init__(self, path_or_buf, convert_dates=True, self._preserve_dtypes = preserve_dtypes self._columns = columns self._order_categoricals = order_categoricals - if encoding is not None: - if encoding not in VALID_ENCODINGS: - raise ValueError('Unknown encoding. Only latin-1 and ascii ' - 'supported.') self._encoding = encoding self._chunksize = chunksize @@ -998,18 +987,13 @@ def __init__(self, path_or_buf, convert_dates=True, path_or_buf = _stringify_path(path_or_buf) if isinstance(path_or_buf, str): path_or_buf, encoding, _, should_close = get_filepath_or_buffer( - path_or_buf, encoding=self._default_encoding - ) + path_or_buf) if isinstance(path_or_buf, (str, text_type, bytes)): self.path_or_buf = open(path_or_buf, 'rb') else: # Copy to BytesIO, and ensure no encoding contents = path_or_buf.read() - try: - contents = contents.encode(self._default_encoding) - except: - pass self.path_or_buf = BytesIO(contents) self._read_header() @@ -1030,6 +1014,15 @@ def close(self): except IOError: pass + def _set_encoding(self): + """ + Set string encoding which depends on file version + """ + if self.format_version < 118: + self._encoding = 'latin-1' + else: + self._encoding = 'utf-8' + def _read_header(self): first_char = self.path_or_buf.read(1) if struct.unpack('c', first_char)[0] == b'<': @@ -1049,6 +1042,7 @@ def _read_new_header(self, first_char): self.format_version = int(self.path_or_buf.read(3)) if self.format_version not in [117, 118]: raise ValueError(_version_error) + self._set_encoding() self.path_or_buf.read(21) # self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<' self.path_or_buf.read(15) # @@ -1235,6 +1229,7 @@ def _read_old_header(self, first_char): self.format_version = struct.unpack('b', first_char)[0] if self.format_version not in [104, 105, 108, 111, 113, 114, 115]: raise ValueError(_version_error) + self._set_encoding() self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[ 0] == 0x1 and '>' or '<' self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0] @@ -1338,16 +1333,9 @@ def _decode(self, s): return s.decode('utf-8') def _null_terminate(self, s): - if compat.PY3 or self._encoding is not None: - # have bytes not strings, so must decode - s = s.partition(b"\0")[0] - return s.decode(self._encoding or self._default_encoding) - else: - null_byte = "\0" - try: - return s.lstrip(null_byte)[:s.index(null_byte)] - except: - return s + # have bytes not strings, so must decode + s = s.partition(b"\0")[0] + return s.decode(self._encoding) def _read_value_labels(self): if self._value_labels_read: @@ -1433,10 +1421,7 @@ def _read_strls(self): self.path_or_buf.read(4))[0] va = self.path_or_buf.read(length) if typ == 130: - encoding = 'utf-8' - if self.format_version == 117: - encoding = self._encoding or self._default_encoding - va = va[0:-1].decode(encoding) + va = va[0:-1].decode(self._encoding) # Wrap v_o in a string to allow uint64 values as keys on 32bit OS self.GSO[str(v_o)] = va @@ -1980,9 +1965,14 @@ class StataWriter(StataParser): def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None, variable_labels=None): - super(StataWriter, self).__init__(encoding) + super(StataWriter, self).__init__() self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index + if encoding is not None: + if encoding not in VALID_ENCODINGS: + raise ValueError('Unknown encoding. Only latin-1 and ascii ' + 'supported.') + self._encoding = encoding self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels diff --git a/pandas/tests/io/data/stata16_118.dta b/pandas/tests/io/data/stata16_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..49cfa49d1b302eeaba7453548ac56d050c7da17a GIT binary patch literal 4614 zcmeHL%}*0S6d$##M57VI#al02aI#iN{D{Oa z5V0cip@=7vwkVqPFyP6Q#H-O{yy#J{UT|h-X8S=rP)!<>ZkoP%Z{C~tn>TOv%`7x6 zWT9Z#LK^JXQCK%LXy{QLnY!ln`k^Wr*gI_L0|rw#8#o2kA`81T1wge+gWcL5z#ph~ zYcPs>^{57DA8$b`64R-_SVC)T^}5fYVYkoY^}Bto$B(r^wb%<)2_=RSbO8QL3wn45 zZ*&U!Wf$i+1yvQ!e-dC$9N>JPpy`90|J=ywFF|)7;yhHDS2%=>2byDotidi_4$F94^xx>D6%5)S^_|3C=vml+CgAT5Hu|#T9f!wr*;Ht ziNMZJo{!KRFsx|Aw46=U)5G?oBZhqgMhsb|)_l}wRu!>kpQlR0w(9JF4fSFbV4JB& zPc?>6m0_Y=Jvyid?e|%b$G8e0b(4Ao{%r)sP*U{wpig^sphv zYdOjF$Cp2LFrT8H18FbuN<5W(CBBB4a(r^Uy%qQ*IEiClMZYsi=~x;_0QCU!h|~jE zu|w(^D7H#{3;*x$z>?dKC@V13j|>#Dbb}r=BOz0R>J~|uUNW_ftA*vlN+DBN-bjJ` z3dz4F`8OoLO7d?>{+)c8GI{}zba&#B0Jx5C;G6gslOnIy@K zk<2J<1i&oG&XMdq$u5xWbCP{QvWq19l4O@~8cgBGIE|m+Y5bIXiMs&5eky{x?FaNn zE?wptfBYAkW&L3nuuChQET`_W?(H&kvb^XivY3u-cZgDb=ish@-CX`x!@1?y3gB0E zsM`4g*)`78P6y7j7|5cV+E1CXRH6!Oc8&sD*8~t|e31dL&B{_nm^lipfux!EmH<-h zKzgRnG^=aCeXx|x(o%A__fCK2%lwyr64dDy5+ XJ_5bq*^w)~t*~ubaz