MAINT: Deprecate encoding argument in Stata reader/writer (pandas-dev#21400)

bashtage · victor · commit 0ade274a6c10 · 2018-10-01T01:49:51.000+02:00
Deprecate the ``encoding`` parameter of all Stata reading and writing
methods and classes. The encoding is determined solely by the file
format and cannot be changed by users.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -45,7 +45,7 @@ Other API Changes
 Deprecations
 ~~~~~~~~~~~~
 
--
+- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument.  The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`).
 -
 -
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -80,7 +80,8 @@
 from pandas.compat import PY36
 from pandas.compat.numpy import function as nv
 from pandas.util._decorators import (Appender, Substitution,
-                                     rewrite_axis_style_signature)
+                                     rewrite_axis_style_signature,
+                                     deprecate_kwarg)
 from pandas.util._validators import (validate_bool_kwarg,
                                      validate_axis_style_args)
 
@@ -1764,6 +1765,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
                         startcol=startcol, freeze_panes=freeze_panes,
                         engine=engine)
 
+    @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
     def to_stata(self, fname, convert_dates=None, write_index=True,
                  encoding="latin-1", byteorder=None, time_stamp=None,
                  data_label=None, variable_labels=None, version=114,
@@ -1869,9 +1871,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
             kwargs['convert_strl'] = convert_strl
 
         writer = statawriter(fname, self, convert_dates=convert_dates,
-                             encoding=encoding, byteorder=byteorder,
-                             time_stamp=time_stamp, data_label=data_label,
-                             write_index=write_index,
+                             byteorder=byteorder, time_stamp=time_stamp,
+                             data_label=data_label, write_index=write_index,
                              variable_labels=variable_labels, **kwargs)
         writer.write_file()
 
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -33,11 +33,7 @@
 from pandas.core.series import Series
 from pandas.io.common import (get_filepath_or_buffer, BaseIterator,
                               _stringify_path)
-from pandas.util._decorators import Appender
-from pandas.util._decorators import deprecate_kwarg
-
-VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
-                   'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
+from pandas.util._decorators import Appender, deprecate_kwarg
 
 _version_error = ("Version of given Stata file is not 104, 105, 108, "
                   "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
@@ -169,6 +165,7 @@
 
 
 @Appender(_read_stata_doc)
+@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
 @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index_col=None,
@@ -952,6 +949,7 @@ def __init__(self):
 class StataReader(StataParser, BaseIterator):
     __doc__ = _stata_reader_doc
 
+    @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
     @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
     def __init__(self, path_or_buf, convert_dates=True,
                  convert_categoricals=True, index_col=None,
@@ -970,7 +968,7 @@ def __init__(self, path_or_buf, convert_dates=True,
         self._preserve_dtypes = preserve_dtypes
         self._columns = columns
         self._order_categoricals = order_categoricals
-        self._encoding = encoding
+        self._encoding = None
         self._chunksize = chunksize
 
         # State variables for the file
@@ -1962,17 +1960,14 @@ class StataWriter(StataParser):
 
     _max_string_length = 244
 
+    @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
     def __init__(self, fname, data, convert_dates=None, write_index=True,
                  encoding="latin-1", byteorder=None, time_stamp=None,
                  data_label=None, variable_labels=None):
         super(StataWriter, self).__init__()
         self._convert_dates = {} if convert_dates is None else convert_dates
         self._write_index = write_index
-        if encoding is not None:
-            if encoding not in VALID_ENCODINGS:
-                raise ValueError('Unknown encoding. Only latin-1 and ascii '
-                                 'supported.')
-        self._encoding = encoding
+        self._encoding = 'latin-1'
         self._time_stamp = time_stamp
         self._data_label = data_label
         self._variable_labels = variable_labels
@@ -2731,16 +2726,18 @@ class StataWriter117(StataWriter):
 
     _max_string_length = 2045
 
+    @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
     def __init__(self, fname, data, convert_dates=None, write_index=True,
                  encoding="latin-1", byteorder=None, time_stamp=None,
                  data_label=None, variable_labels=None, convert_strl=None):
         # Shallow copy since convert_strl might be modified later
         self._convert_strl = [] if convert_strl is None else convert_strl[:]
 
         super(StataWriter117, self).__init__(fname, data, convert_dates,
-                                             write_index, encoding, byteorder,
-                                             time_stamp, data_label,
-                                             variable_labels)
+                                             write_index, byteorder=byteorder,
+                                             time_stamp=time_stamp,
+                                             data_label=data_label,
+                                             variable_labels=variable_labels)
         self._map = None
         self._strl_blob = None
 
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -361,16 +361,18 @@ def test_encoding(self, version):
 
         # GH 4626, proper encoding handling
         raw = read_stata(self.dta_encoding)
-        encoded = read_stata(self.dta_encoding, encoding="latin-1")
+        with tm.assert_produces_warning(FutureWarning):
+            encoded = read_stata(self.dta_encoding, encoding='latin-1')
         result = encoded.kreis1849[0]
 
         expected = raw.kreis1849[0]
         assert result == expected
         assert isinstance(result, compat.string_types)
 
         with tm.ensure_clean() as path:
-            encoded.to_stata(path, encoding='latin-1',
-                             write_index=False, version=version)
+            with tm.assert_produces_warning(FutureWarning):
+                encoded.to_stata(path, write_index=False, version=version,
+                                 encoding='latin-1')
             reread_encoded = read_stata(path)
             tm.assert_frame_equal(encoded, reread_encoded)
 
@@ -1349,13 +1351,6 @@ def test_out_of_range_float(self):
             assert 'ColumnTooBig' in cm.exception
             assert 'infinity' in cm.exception
 
-    def test_invalid_encoding(self):
-        # GH15723, validate encoding
-        original = self.read_csv(self.csv3)
-        with pytest.raises(ValueError):
-            with tm.ensure_clean() as path:
-                original.to_stata(path, encoding='utf-8')
-
     def test_path_pathlib(self):
         df = tm.makeDataFrame()
         df.index.name = 'index'

Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ Other API Changes`
`45`	`45`	`Deprecations`
`46`	`46`	`~~~~~~~~~~~~`
`47`	`47`
`48`		`--`
	`48`	+- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`).
`49`	`49`	`-`
`50`	`50`	`-`
`51`	`51`