BUG: Fix encoding for Stata format 1188888888 files

bashtage · bashtage · commit 4bdb2c41419c · 2018-06-02T10:48:40.000+01:00
Ensure that Stata 118 files always use utf-8 encoding
Deprecate encoding from read_stata and StataReader
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
@@ -23,7 +23,7 @@ New features
 Deprecations
 ~~~~~~~~~~~~
 
--
+- :func:`read_stata` and :class:`StataReader` have deprecated the ``encoding`` parameter. Stata files only support a single encoding and so this input has no effect. (:issue:`21244`)
 -
 
 .. _whatsnew_0231.performance:
@@ -92,7 +92,7 @@ I/O
 
 - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
 - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
--
+- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -169,6 +169,7 @@
 
 
 @Appender(_read_stata_doc)
+@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
 @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index_col=None,
@@ -182,7 +183,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
                          preserve_dtypes=preserve_dtypes,
                          columns=columns,
                          order_categoricals=order_categoricals,
-                         chunksize=chunksize, encoding=encoding)
+                         chunksize=chunksize)
 
     if iterator or chunksize:
         data = reader
@@ -838,15 +839,8 @@ def get_base_missing_value(cls, dtype):
 
 
 class StataParser(object):
-    _default_encoding = 'latin-1'
 
-    def __init__(self, encoding):
-        if encoding is not None:
-            if encoding not in VALID_ENCODINGS:
-                raise ValueError('Unknown encoding. Only latin-1 and ascii '
-                                 'supported.')
-
-        self._encoding = encoding
+    def __init__(self):
 
         # type          code.
         # --------------------
@@ -959,13 +953,14 @@ def __init__(self, encoding):
 class StataReader(StataParser, BaseIterator):
     __doc__ = _stata_reader_doc
 
+    @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
     @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
     def __init__(self, path_or_buf, convert_dates=True,
                  convert_categoricals=True, index_col=None,
                  convert_missing=False, preserve_dtypes=True,
                  columns=None, order_categoricals=True,
-                 encoding='latin-1', chunksize=None):
-        super(StataReader, self).__init__(encoding)
+                 encoding=None, chunksize=None):
+        super(StataReader, self).__init__()
         self.col_sizes = ()
 
         # Arguments to the reader (can be temporarily overridden in
@@ -977,10 +972,6 @@ def __init__(self, path_or_buf, convert_dates=True,
         self._preserve_dtypes = preserve_dtypes
         self._columns = columns
         self._order_categoricals = order_categoricals
-        if encoding is not None:
-            if encoding not in VALID_ENCODINGS:
-                raise ValueError('Unknown encoding. Only latin-1 and ascii '
-                                 'supported.')
         self._encoding = encoding
         self._chunksize = chunksize
 
@@ -998,18 +989,13 @@ def __init__(self, path_or_buf, convert_dates=True,
         path_or_buf = _stringify_path(path_or_buf)
         if isinstance(path_or_buf, str):
             path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
-                path_or_buf, encoding=self._default_encoding
-            )
+                path_or_buf)
 
         if isinstance(path_or_buf, (str, text_type, bytes)):
             self.path_or_buf = open(path_or_buf, 'rb')
         else:
             # Copy to BytesIO, and ensure no encoding
             contents = path_or_buf.read()
-            try:
-                contents = contents.encode(self._default_encoding)
-            except:
-                pass
             self.path_or_buf = BytesIO(contents)
 
         self._read_header()
@@ -1030,6 +1016,15 @@ def close(self):
         except IOError:
             pass
 
+    def _set_encoding(self):
+        """
+        Set string encoding which depends on file version
+        """
+        if self.format_version < 118:
+            self._encoding = 'latin-1'
+        else:
+            self._encoding = 'utf-8'
+
     def _read_header(self):
         first_char = self.path_or_buf.read(1)
         if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1044,7 @@ def _read_new_header(self, first_char):
         self.format_version = int(self.path_or_buf.read(3))
         if self.format_version not in [117, 118]:
             raise ValueError(_version_error)
+        self._set_encoding()
         self.path_or_buf.read(21)  # </release><byteorder>
         self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
         self.path_or_buf.read(15)  # </byteorder><K>
@@ -1235,6 +1231,7 @@ def _read_old_header(self, first_char):
         self.format_version = struct.unpack('b', first_char)[0]
         if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
             raise ValueError(_version_error)
+        self._set_encoding()
         self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
             0] == 0x1 and '>' or '<'
         self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1335,9 @@ def _decode(self, s):
         return s.decode('utf-8')
 
     def _null_terminate(self, s):
-        if compat.PY3 or self._encoding is not None:
-            # have bytes not strings, so must decode
-            s = s.partition(b"\0")[0]
-            return s.decode(self._encoding or self._default_encoding)
-        else:
-            null_byte = "\0"
-            try:
-                return s.lstrip(null_byte)[:s.index(null_byte)]
-            except:
-                return s
+        # have bytes not strings, so must decode
+        s = s.partition(b"\0")[0]
+        return s.decode(self._encoding)
 
     def _read_value_labels(self):
         if self._value_labels_read:
@@ -1433,10 +1423,7 @@ def _read_strls(self):
                                    self.path_or_buf.read(4))[0]
             va = self.path_or_buf.read(length)
             if typ == 130:
-                encoding = 'utf-8'
-                if self.format_version == 117:
-                    encoding = self._encoding or self._default_encoding
-                va = va[0:-1].decode(encoding)
+                va = va[0:-1].decode(self._encoding)
             # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
             self.GSO[str(v_o)] = va
 
@@ -1980,9 +1967,14 @@ class StataWriter(StataParser):
     def __init__(self, fname, data, convert_dates=None, write_index=True,
                  encoding="latin-1", byteorder=None, time_stamp=None,
                  data_label=None, variable_labels=None):
-        super(StataWriter, self).__init__(encoding)
+        super(StataWriter, self).__init__()
         self._convert_dates = {} if convert_dates is None else convert_dates
         self._write_index = write_index
+        if encoding is not None:
+            if encoding not in VALID_ENCODINGS:
+                raise ValueError('Unknown encoding. Only latin-1 and ascii '
+                                 'supported.')
+        self._encoding = encoding
         self._time_stamp = time_stamp
         self._data_label = data_label
         self._variable_labels = variable_labels
diff --git a/pandas/tests/io/data/stata16_118.dta b/pandas/tests/io/data/stata16_118.dta
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -96,6 +96,7 @@ def setup_method(self, method):
         self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
 
         self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
+        self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
 
         self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
 
@@ -360,22 +361,20 @@ def test_encoding(self, version):
 
         # GH 4626, proper encoding handling
         raw = read_stata(self.dta_encoding)
-        encoded = read_stata(self.dta_encoding, encoding="latin-1")
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            encoded = read_stata(self.dta_encoding, encoding="latin-1")
+            assert len(w) == 1
         result = encoded.kreis1849[0]
 
-        if compat.PY3:
-            expected = raw.kreis1849[0]
-            assert result == expected
-            assert isinstance(result, compat.string_types)
-        else:
-            expected = raw.kreis1849.str.decode("latin-1")[0]
-            assert result == expected
-            assert isinstance(result, unicode)  # noqa
+        expected = raw.kreis1849[0]
+        assert result == expected
+        assert isinstance(result, compat.string_types)
 
         with tm.ensure_clean() as path:
             encoded.to_stata(path, encoding='latin-1',
                              write_index=False, version=version)
-            reread_encoded = read_stata(path, encoding='latin-1')
+            reread_encoded = read_stata(path)
             tm.assert_frame_equal(encoded, reread_encoded)
 
     def test_read_write_dta11(self):
@@ -1500,3 +1499,18 @@ def test_gzip_writing(self):
             with gzip.GzipFile(path, 'rb') as gz:
                 reread = pd.read_stata(gz, index_col='index')
         tm.assert_frame_equal(df, reread)
+
+    def test_unicode_dta_118(self):
+        unicode_df = self.read_dta(self.dta25_118)
+
+        columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
+        values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
+                  [u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
+                  [u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
+                  ['      ', '      ', 'd', '      ', 'd'],
+                  [' ', '', 'a', ' ', 'a'],
+                  ['', '', 's', '', 's'],
+                  ['', '', ' ', '', ' ']]
+        expected = pd.DataFrame(values, columns=columns)
+
+        tm.assert_frame_equal(unicode_df, expected)