BUG: Fix encoding for Stata format 118 files

bashtage · bashtage · commit dcbbb32a9da4 · 2018-06-01T15:56:16.000+02:00
Ensure that Stata 118 files always use utf-8 encoding
Deprecate encoding from read_stata and StataReader
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
@@ -92,7 +92,7 @@ I/O
 
 - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
 - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
--
+- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -36,9 +36,12 @@
 from pandas.util._decorators import Appender
 from pandas.util._decorators import deprecate_kwarg
 
-VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
+# Allowed encodings of Stata dta files.  Preferred is first entry
+VALID_ENCODINGS = ('latin-1', 'latin_1', 'ascii', 'us-ascii', 'iso-8859-1',
                    'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
 
+VALID_ENCODINGS_118 = ('utf8', 'utf-8')
+
 _version_error = ("Version of given Stata file is not 104, 105, 108, "
                   "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
                   "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")
@@ -169,6 +172,7 @@
 
 
 @Appender(_read_stata_doc)
+@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
 @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index_col=None,
@@ -182,7 +186,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
                          preserve_dtypes=preserve_dtypes,
                          columns=columns,
                          order_categoricals=order_categoricals,
-                         chunksize=chunksize, encoding=encoding)
+                         chunksize=chunksize)
 
     if iterator or chunksize:
         data = reader
@@ -399,16 +403,19 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
         elif infer_dtype(dates) == 'datetime':
             if delta:
                 delta = dates.values - stata_epoch
-                f = lambda x: \
-                    US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds
+
+                def f(x):
+                    return US_PER_DAY * x.days + \
+                        1000000 * x.seconds + x.microseconds
                 v = np.vectorize(f)
                 d['delta'] = v(delta)
             if year:
                 year_month = dates.apply(lambda x: 100 * x.year + x.month)
                 d['year'] = year_month.values // 100
                 d['month'] = (year_month.values - d['year'] * 100)
             if days:
-                f = lambda x: (x - datetime.datetime(x.year, 1, 1)).days
+                def f(x):
+                    return (x - datetime.datetime(x.year, 1, 1)).days
                 v = np.vectorize(f)
                 d['days'] = v(dates)
         else:
@@ -838,7 +845,6 @@ def get_base_missing_value(cls, dtype):
 
 
 class StataParser(object):
-    _default_encoding = 'latin-1'
 
     def __init__(self, encoding):
         if encoding is not None:
@@ -959,12 +965,13 @@ def __init__(self, encoding):
 class StataReader(StataParser, BaseIterator):
     __doc__ = _stata_reader_doc
 
+    @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
     @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
     def __init__(self, path_or_buf, convert_dates=True,
                  convert_categoricals=True, index_col=None,
                  convert_missing=False, preserve_dtypes=True,
                  columns=None, order_categoricals=True,
-                 encoding='latin-1', chunksize=None):
+                 encoding=None, chunksize=None):
         super(StataReader, self).__init__(encoding)
         self.col_sizes = ()
 
@@ -977,10 +984,6 @@ def __init__(self, path_or_buf, convert_dates=True,
         self._preserve_dtypes = preserve_dtypes
         self._columns = columns
         self._order_categoricals = order_categoricals
-        if encoding is not None:
-            if encoding not in VALID_ENCODINGS:
-                raise ValueError('Unknown encoding. Only latin-1 and ascii '
-                                 'supported.')
         self._encoding = encoding
         self._chunksize = chunksize
 
@@ -998,18 +1001,13 @@ def __init__(self, path_or_buf, convert_dates=True,
         path_or_buf = _stringify_path(path_or_buf)
         if isinstance(path_or_buf, str):
             path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
-                path_or_buf, encoding=self._default_encoding
-            )
+                path_or_buf)
 
         if isinstance(path_or_buf, (str, text_type, bytes)):
             self.path_or_buf = open(path_or_buf, 'rb')
         else:
             # Copy to BytesIO, and ensure no encoding
             contents = path_or_buf.read()
-            try:
-                contents = contents.encode(self._default_encoding)
-            except:
-                pass
             self.path_or_buf = BytesIO(contents)
 
         self._read_header()
@@ -1030,6 +1028,15 @@ def close(self):
         except IOError:
             pass
 
+    def _set_encoding(self):
+        """
+        Check validity of user-set encoding set the default encoding
+        """
+        if self.format_version < 118:
+            self._encoding = 'latin-1'
+        else:
+            self._encoding = 'utf-8'
+
     def _read_header(self):
         first_char = self.path_or_buf.read(1)
         if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1056,7 @@ def _read_new_header(self, first_char):
         self.format_version = int(self.path_or_buf.read(3))
         if self.format_version not in [117, 118]:
             raise ValueError(_version_error)
+        self._set_encoding()
         self.path_or_buf.read(21)  # </release><byteorder>
         self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
         self.path_or_buf.read(15)  # </byteorder><K>
@@ -1235,6 +1243,7 @@ def _read_old_header(self, first_char):
         self.format_version = struct.unpack('b', first_char)[0]
         if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
             raise ValueError(_version_error)
+        self._set_encoding()
         self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
             0] == 0x1 and '>' or '<'
         self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1347,9 @@ def _decode(self, s):
         return s.decode('utf-8')
 
     def _null_terminate(self, s):
-        if compat.PY3 or self._encoding is not None:
-            # have bytes not strings, so must decode
-            s = s.partition(b"\0")[0]
-            return s.decode(self._encoding or self._default_encoding)
-        else:
-            null_byte = "\0"
-            try:
-                return s.lstrip(null_byte)[:s.index(null_byte)]
-            except:
-                return s
+        # have bytes not strings, so must decode
+        s = s.partition(b"\0")[0]
+        return s.decode(self._encoding)
 
     def _read_value_labels(self):
         if self._value_labels_read:
@@ -1433,10 +1435,7 @@ def _read_strls(self):
                                    self.path_or_buf.read(4))[0]
             va = self.path_or_buf.read(length)
             if typ == 130:
-                encoding = 'utf-8'
-                if self.format_version == 117:
-                    encoding = self._encoding or self._default_encoding
-                va = va[0:-1].decode(encoding)
+                va = va[0:-1].decode(self._encoding)
             # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
             self.GSO[str(v_o)] = va
 
diff --git a/pandas/tests/io/data/stata16_118.dta b/pandas/tests/io/data/stata16_118.dta
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -96,6 +96,7 @@ def setup_method(self, method):
         self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
 
         self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
+        self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
 
         self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
 
@@ -360,22 +361,20 @@ def test_encoding(self, version):
 
         # GH 4626, proper encoding handling
         raw = read_stata(self.dta_encoding)
-        encoded = read_stata(self.dta_encoding, encoding="latin-1")
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            encoded = read_stata(self.dta_encoding, encoding="latin-1")
+            assert len(w) == 1
         result = encoded.kreis1849[0]
 
-        if compat.PY3:
-            expected = raw.kreis1849[0]
-            assert result == expected
-            assert isinstance(result, compat.string_types)
-        else:
-            expected = raw.kreis1849.str.decode("latin-1")[0]
-            assert result == expected
-            assert isinstance(result, unicode)  # noqa
+        expected = raw.kreis1849[0]
+        assert result == expected
+        assert isinstance(result, compat.string_types)
 
         with tm.ensure_clean() as path:
             encoded.to_stata(path, encoding='latin-1',
                              write_index=False, version=version)
-            reread_encoded = read_stata(path, encoding='latin-1')
+            reread_encoded = read_stata(path)
             tm.assert_frame_equal(encoded, reread_encoded)
 
     def test_read_write_dta11(self):
@@ -520,7 +519,8 @@ def test_numeric_column_names(self):
             written_and_read_again = self.read_dta(path)
             written_and_read_again = written_and_read_again.set_index('index')
             columns = list(written_and_read_again.columns)
-            convert_col_name = lambda x: int(x[1])
+
+            def convert_col_name(x): return int(x[1])
             written_and_read_again.columns = map(convert_col_name, columns)
             tm.assert_frame_equal(original, written_and_read_again)
 
@@ -1363,14 +1363,16 @@ def test_invalid_encoding(self):
     def test_path_pathlib(self):
         df = tm.makeDataFrame()
         df.index.name = 'index'
-        reader = lambda x: read_stata(x).set_index('index')
+
+        def reader(x): return read_stata(x).set_index('index')
         result = tm.round_trip_pathlib(df.to_stata, reader)
         tm.assert_frame_equal(df, result)
 
     def test_pickle_path_localpath(self):
         df = tm.makeDataFrame()
         df.index.name = 'index'
-        reader = lambda x: read_stata(x).set_index('index')
+
+        def reader(x): return read_stata(x).set_index('index')
         result = tm.round_trip_localpath(df.to_stata, reader)
         tm.assert_frame_equal(df, result)
 
@@ -1500,3 +1502,18 @@ def test_gzip_writing(self):
             with gzip.GzipFile(path, 'rb') as gz:
                 reread = pd.read_stata(gz, index_col='index')
         tm.assert_frame_equal(df, reread)
+
+    def test_unicode_dta_118(self):
+        unicode_df = self.read_dta(self.dta25_118)
+
+        columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
+        values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
+                  [u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
+                  [u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
+                  ['      ', '      ', 'd', '      ', 'd'],
+                  [' ', '', 'a', ' ', 'a'],
+                  ['', '', 's', '', 's'],
+                  ['', '', ' ', '', ' ']]
+        expected = pd.DataFrame(values, columns=columns)
+
+        tm.assert_frame_equal(unicode_df, expected)