From ac8c2c2e76dd59acffa7083c8e78d043ab697134 Mon Sep 17 00:00:00 2001
From: Kevin Sheppard <kevin.k.sheppard@gmail.com>
Date: Thu, 31 May 2018 11:59:53 +0200
Subject: [PATCH] BUG: Fix encoding for Stata format 118 files

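Ensure that Stata dta format 118 files are always decoded as UTF-8, while
older formats (versions below 118) are decoded as latin-1. StataReader now
selects the encoding from the file's format version, and read_stata no
longer forwards the ``encoding`` argument to the reader.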
---
 doc/source/whatsnew/v0.23.1.txt      |   1 +
 pandas/io/stata.py                   |  62 +++++++++++----------------
 pandas/tests/io/data/stata16_118.dta | Bin 0 -> 4614 bytes
 pandas/tests/io/test_stata.py        |  29 +++++++++----
 4 files changed, 47 insertions(+), 45 deletions(-)
 create mode 100644 pandas/tests/io/data/stata16_118.dta

diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index 69b07d12c1e98..bddb9d3b8e2a7 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -95,6 +95,7 @@ I/O
 - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
 - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
 - Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
+- Bug in :func:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
 -
 
 Plotting
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 2797924985c70..8584e1f0e3f14 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -182,7 +182,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
                          preserve_dtypes=preserve_dtypes,
                          columns=columns,
                          order_categoricals=order_categoricals,
-                         chunksize=chunksize, encoding=encoding)
+                         chunksize=chunksize)
 
     if iterator or chunksize:
         data = reader
@@ -838,15 +838,8 @@ def get_base_missing_value(cls, dtype):
 
 
 class StataParser(object):
-    _default_encoding = 'latin-1'
 
-    def __init__(self, encoding):
-        if encoding is not None:
-            if encoding not in VALID_ENCODINGS:
-                raise ValueError('Unknown encoding. Only latin-1 and ascii '
-                                 'supported.')
-
-        self._encoding = encoding
+    def __init__(self):
 
         # type          code.
         # --------------------
@@ -964,8 +957,8 @@ def __init__(self, path_or_buf, convert_dates=True,
                  convert_categoricals=True, index_col=None,
                  convert_missing=False, preserve_dtypes=True,
                  columns=None, order_categoricals=True,
-                 encoding='latin-1', chunksize=None):
-        super(StataReader, self).__init__(encoding)
+                 encoding=None, chunksize=None):
+        super(StataReader, self).__init__()
         self.col_sizes = ()
 
         # Arguments to the reader (can be temporarily overridden in
@@ -977,10 +970,6 @@ def __init__(self, path_or_buf, convert_dates=True,
         self._preserve_dtypes = preserve_dtypes
         self._columns = columns
         self._order_categoricals = order_categoricals
-        if encoding is not None:
-            if encoding not in VALID_ENCODINGS:
-                raise ValueError('Unknown encoding. Only latin-1 and ascii '
-                                 'supported.')
         self._encoding = encoding
         self._chunksize = chunksize
 
@@ -998,18 +987,13 @@ def __init__(self, path_or_buf, convert_dates=True,
         path_or_buf = _stringify_path(path_or_buf)
         if isinstance(path_or_buf, str):
             path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
-                path_or_buf, encoding=self._default_encoding
-            )
+                path_or_buf)
 
         if isinstance(path_or_buf, (str, text_type, bytes)):
             self.path_or_buf = open(path_or_buf, 'rb')
         else:
             # Copy to BytesIO, and ensure no encoding
             contents = path_or_buf.read()
-            try:
-                contents = contents.encode(self._default_encoding)
-            except:
-                pass
             self.path_or_buf = BytesIO(contents)
 
         self._read_header()
@@ -1030,6 +1014,15 @@ def close(self):
         except IOError:
             pass
 
+    def _set_encoding(self):
+        """
+        Set string encoding which depends on file version
+        """
+        if self.format_version < 118:
+            self._encoding = 'latin-1'
+        else:
+            self._encoding = 'utf-8'
+
     def _read_header(self):
         first_char = self.path_or_buf.read(1)
         if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1042,7 @@ def _read_new_header(self, first_char):
         self.format_version = int(self.path_or_buf.read(3))
         if self.format_version not in [117, 118]:
             raise ValueError(_version_error)
+        self._set_encoding()
         self.path_or_buf.read(21)  # </release><byteorder>
         self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
         self.path_or_buf.read(15)  # </byteorder><K>
@@ -1235,6 +1229,7 @@ def _read_old_header(self, first_char):
         self.format_version = struct.unpack('b', first_char)[0]
         if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
             raise ValueError(_version_error)
+        self._set_encoding()
         self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
             0] == 0x1 and '>' or '<'
         self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1333,9 @@ def _decode(self, s):
         return s.decode('utf-8')
 
     def _null_terminate(self, s):
-        if compat.PY3 or self._encoding is not None:
-            # have bytes not strings, so must decode
-            s = s.partition(b"\0")[0]
-            return s.decode(self._encoding or self._default_encoding)
-        else:
-            null_byte = "\0"
-            try:
-                return s.lstrip(null_byte)[:s.index(null_byte)]
-            except:
-                return s
+        # have bytes not strings, so must decode
+        s = s.partition(b"\0")[0]
+        return s.decode(self._encoding)
 
     def _read_value_labels(self):
         if self._value_labels_read:
@@ -1433,10 +1421,7 @@ def _read_strls(self):
                                    self.path_or_buf.read(4))[0]
             va = self.path_or_buf.read(length)
             if typ == 130:
-                encoding = 'utf-8'
-                if self.format_version == 117:
-                    encoding = self._encoding or self._default_encoding
-                va = va[0:-1].decode(encoding)
+                va = va[0:-1].decode(self._encoding)
             # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
             self.GSO[str(v_o)] = va
 
@@ -1980,9 +1965,14 @@ class StataWriter(StataParser):
     def __init__(self, fname, data, convert_dates=None, write_index=True,
                  encoding="latin-1", byteorder=None, time_stamp=None,
                  data_label=None, variable_labels=None):
-        super(StataWriter, self).__init__(encoding)
+        super(StataWriter, self).__init__()
         self._convert_dates = {} if convert_dates is None else convert_dates
         self._write_index = write_index
+        if encoding is not None:
+            if encoding not in VALID_ENCODINGS:
+                raise ValueError('Unknown encoding. Only latin-1 and ascii '
+                                 'supported.')
+        self._encoding = encoding
         self._time_stamp = time_stamp
         self._data_label = data_label
         self._variable_labels = variable_labels
diff --git a/pandas/tests/io/data/stata16_118.dta b/pandas/tests/io/data/stata16_118.dta
new file mode 100644
index 0000000000000000000000000000000000000000..49cfa49d1b302eeaba7453548ac56d050c7da17a
GIT binary patch
literal 4614
zcmeHL%}*0S6d$##M57VI#al02<gjhgq?pMDB5ERvCjA4-&@NJKOS8L>aI#iN{D{Oa
z5V0cip@=7vwkVqPFyP6Q#H-O{yy#J{UT|h-X8S=rP)!<>ZkoP%Z{C~tn>TOv%`7x6
zWT9Z#LK^JXQCK%LXy{QLnY!ln`k^Wr*gI_L0|rw#8#o2kA`81T1wge+gWcL5z#ph~
zYcPs>^{57DA8$b`64R-_SVC)T^}5fYVYkoY^}Bto$B(r^wb%<)2_=RSbO8QL3wn45
zZ*&U!Wf$i+1yvQ!e-dC$9N>JPpy`90|J=ywFF|)7;yhHDS2%=>2<naML2Edno7xf3
z1gOI#NJ6!ui>byDotidi_4$F94^xx>D6%5)S^_|3C=vml+CgAT5Hu|#T9f!wr*;Ht
ziNMZJo{!KRFsx|Aw46=U)5G?oBZhqgMhsb|)_l}wRu!>kpQlR0w(9JF4fSFbV4JB&
zPc?>6m0_Y=Jvyid?e|%b$G8e0b(4Ao{%r)<p07;IVg<9))pXtgqHa0}20NzOnQTZ%
zURkR)|KsQ8W~r`h-)`sn4cAl`Gg<u5(jZLBNR!}}&h*&h`M+N&>sP*U{wpig^sphv
zYdOjF$Cp2LFrT8H18FbuN<5W(CBBB4a(r^Uy%qQ*IEiClMZYsi=~x;_0QCU!h|~jE
zu|w(^D7H#{3;*x$z>?dKC@V13j|>#Dbb}r=BOz0R>J~|uUNW_ftA*vlN+DBN-bjJ`
z3dz4F`8OoLO7d?>{+)c8GI{}zba&#B0Jx5C;G6gslO<g2{zV@}(D(Ce$@P)7J8Prs
zX^@*Cxo0FdOLB80H&1d4^k-|cKRf~M;(Pc$et^gE1b&Do@gw8{J8cC;P?DJ>nIy@K
zk<2J<1i&oG&XMdq$u5xWbCP{QvWq19l4O@~8cgBGIE|m+Y5bIXiMs&5eky{x?FaNn
zE?wptfBYAkW&L3nuuChQET`_W?(H&kvb^XivY3u-cZgDb=ish@-CX`x!@1?y3gB0E
zsM`4g*)`78P6y7j7|5cV+E1CXRH6!Oc8&sD*8~t|e31dL&B{_nm^lipfux!EmH<-h
z<G_~G5T=e2GmT4AzF^cNn~YdoVc>KzgRnG^=aCeXx|x(o%A__fCK2%lwyr64dDy5+
XJ_5bq*^w)~t*~ubaz<oq@zc+53D*S#

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index f3a465da4e87f..e5585902a9dd6 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -96,6 +96,7 @@ def setup_method(self, method):
         self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
 
         self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
+        self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
 
         self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
 
@@ -363,19 +364,14 @@ def test_encoding(self, version):
         encoded = read_stata(self.dta_encoding, encoding="latin-1")
         result = encoded.kreis1849[0]
 
-        if compat.PY3:
-            expected = raw.kreis1849[0]
-            assert result == expected
-            assert isinstance(result, compat.string_types)
-        else:
-            expected = raw.kreis1849.str.decode("latin-1")[0]
-            assert result == expected
-            assert isinstance(result, unicode)  # noqa
+        expected = raw.kreis1849[0]
+        assert result == expected
+        assert isinstance(result, compat.string_types)
 
         with tm.ensure_clean() as path:
             encoded.to_stata(path, encoding='latin-1',
                              write_index=False, version=version)
-            reread_encoded = read_stata(path, encoding='latin-1')
+            reread_encoded = read_stata(path)
             tm.assert_frame_equal(encoded, reread_encoded)
 
     def test_read_write_dta11(self):
@@ -1500,3 +1496,18 @@ def test_gzip_writing(self):
             with gzip.GzipFile(path, 'rb') as gz:
                 reread = pd.read_stata(gz, index_col='index')
         tm.assert_frame_equal(df, reread)
+
+    def test_unicode_dta_118(self):
+        unicode_df = self.read_dta(self.dta25_118)
+
+        columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
+        values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
+                  [u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
+                  [u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
+                  ['      ', '      ', 'd', '      ', 'd'],
+                  [' ', '', 'a', ' ', 'a'],
+                  ['', '', 's', '', 's'],
+                  ['', '', ' ', '', ' ']]
+        expected = pd.DataFrame(values, columns=columns)
+
+        tm.assert_frame_equal(unicode_df, expected)