BUG: Enforce correct encoding in Stata

bashtage · bashtage · commit f549481a6e31 · 2017-03-21T17:13:05.000Z
Ensure StataReader and StataWriter have the correct encoding. Standardize the default encoding to 'latin-1'. Closes pandas-dev#15723.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -917,6 +917,8 @@ Bug Fixes
 - Avoid use of ``np.finfo()`` during ``import pandas`` removed to mitigate deadlock on Python GIL misuse (:issue:`14641`)
 
 - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which produces incorrectly formatted files to be produced for some locales (:issue:`13856`)
+- Bug in ``StataReader`` and ``StataWriter`` which allows invalid encodings (:issue:`15723`)
+
 - Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`)
 - Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`, :issue:`13046`)
 
@@ -931,3 +933,4 @@ Bug Fixes
 - Bug in ``pd.melt()`` where passing a tuple value for ``value_vars`` caused a ``TypeError`` (:issue:`15348`)
 - Bug in ``.eval()`` which caused multiline evals to fail with local variables not on the first line (:issue:`15342`)
 - Bug in ``pd.read_msgpack`` which did not allow to load dataframe with an index of type ``CategoricalIndex`` (:issue:`15487`)
+
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -33,6 +33,9 @@
 from pandas._libs.lib import max_len_string_array, infer_dtype
 from pandas._libs.tslib import NaT, Timestamp
 
+VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
+                   'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
+
 _version_error = ("Version of given Stata file is not 104, 105, 108, "
                   "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
                   "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")
@@ -45,7 +48,7 @@
 
 _encoding_params = """\
 encoding : string, None or encoding
-    Encoding used to parse the files. None defaults to iso-8859-1."""
+    Encoding used to parse the files. None defaults to latin-1."""
 
 _statafile_processing_params2 = """\
 index : identifier of index column
@@ -153,7 +156,7 @@
 
 @Appender(_read_stata_doc)
 def read_stata(filepath_or_buffer, convert_dates=True,
-               convert_categoricals=True, encoding=None, index=None,
+               convert_categoricals=True, encoding='latin-1', index=None,
                convert_missing=False, preserve_dtypes=True, columns=None,
                order_categoricals=True, chunksize=None, iterator=False):
 
@@ -816,9 +819,14 @@ def get_base_missing_value(cls, dtype):
 
 
 class StataParser(object):
-    _default_encoding = 'iso-8859-1'
+    _default_encoding = 'latin-1'
+
+    def __init__(self, encoding='latin-1'):
+
+        if encoding not in VALID_ENCODINGS:
+            raise ValueError('Unknown encoding. Only latin-1 and  ascii '
+                             'supported.')
 
-    def __init__(self, encoding):
         self._encoding = encoding
 
         # type          code.
@@ -936,7 +944,7 @@ def __init__(self, path_or_buf, convert_dates=True,
                  convert_categoricals=True, index=None,
                  convert_missing=False, preserve_dtypes=True,
                  columns=None, order_categoricals=True,
-                 encoding='iso-8859-1', chunksize=None):
+                 encoding='latin-1', chunksize=None):
         super(StataReader, self).__init__(encoding)
         self.col_sizes = ()
 
@@ -949,6 +957,9 @@ def __init__(self, path_or_buf, convert_dates=True,
         self._preserve_dtypes = preserve_dtypes
         self._columns = columns
         self._order_categoricals = order_categoricals
+        if encoding not in VALID_ENCODINGS:
+            raise ValueError('Unknown encoding. Only latin-1 and  ascii '
+                             'supported.')
         self._encoding = encoding
         self._chunksize = chunksize
 
@@ -1855,7 +1866,7 @@ class StataWriter(StataParser):
     write_index : bool
         Write the index to Stata dataset.
     encoding : str
-        Default is latin-1. Unicode is not supported
+        Default is latin-1. Only latin-1 and ascii are supported.
     byteorder : str
         Can be ">", "<", "little", or "big". default is `sys.byteorder`
     time_stamp : datetime
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -1276,3 +1276,9 @@ def test_out_of_range_float(self):
                 original.to_stata(path)
             tm.assertTrue('ColumnTooBig' in cm.exception)
             tm.assertTrue('infinity' in cm.exception)
+
+    def test_invalid_encoding(self):
+        original = self.read_csv(self.csv3)
+        with tm.assertRaises(ValueError):
+            with tm.ensure_clean() as path:
+                original.to_stata(path, encoding='utf-8')