Skip to content

Commit 0ade274

Browse files
bashtage authored and victor committed
MAINT: Deprecate encoding from stata reader/writer (pandas-dev#21400)
Deprecate the encoding parameter from all Stata reading and writing methods and classes. The encoding depends only on the file format and cannot be changed by users.
1 parent 766b9fd commit 0ade274

File tree

4 files changed: +22 -29 lines changed

doc/source/whatsnew/v0.24.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ Other API Changes
4545
Deprecations
4646
~~~~~~~~~~~~
4747

48-
-
48+
- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`).
4949
-
5050
-
5151

pandas/core/frame.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@
8080
from pandas.compat import PY36
8181
from pandas.compat.numpy import function as nv
8282
from pandas.util._decorators import (Appender, Substitution,
83-
rewrite_axis_style_signature)
83+
rewrite_axis_style_signature,
84+
deprecate_kwarg)
8485
from pandas.util._validators import (validate_bool_kwarg,
8586
validate_axis_style_args)
8687

@@ -1764,6 +1765,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
17641765
startcol=startcol, freeze_panes=freeze_panes,
17651766
engine=engine)
17661767

1768+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
17671769
def to_stata(self, fname, convert_dates=None, write_index=True,
17681770
encoding="latin-1", byteorder=None, time_stamp=None,
17691771
data_label=None, variable_labels=None, version=114,
@@ -1869,9 +1871,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
18691871
kwargs['convert_strl'] = convert_strl
18701872

18711873
writer = statawriter(fname, self, convert_dates=convert_dates,
1872-
encoding=encoding, byteorder=byteorder,
1873-
time_stamp=time_stamp, data_label=data_label,
1874-
write_index=write_index,
1874+
byteorder=byteorder, time_stamp=time_stamp,
1875+
data_label=data_label, write_index=write_index,
18751876
variable_labels=variable_labels, **kwargs)
18761877
writer.write_file()
18771878

pandas/io/stata.py

+11-14
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,7 @@
3333
from pandas.core.series import Series
3434
from pandas.io.common import (get_filepath_or_buffer, BaseIterator,
3535
_stringify_path)
36-
from pandas.util._decorators import Appender
37-
from pandas.util._decorators import deprecate_kwarg
38-
39-
VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
40-
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
36+
from pandas.util._decorators import Appender, deprecate_kwarg
4137

4238
_version_error = ("Version of given Stata file is not 104, 105, 108, "
4339
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
@@ -169,6 +165,7 @@
169165

170166

171167
@Appender(_read_stata_doc)
168+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
172169
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
173170
def read_stata(filepath_or_buffer, convert_dates=True,
174171
convert_categoricals=True, encoding=None, index_col=None,
@@ -952,6 +949,7 @@ def __init__(self):
952949
class StataReader(StataParser, BaseIterator):
953950
__doc__ = _stata_reader_doc
954951

952+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
955953
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
956954
def __init__(self, path_or_buf, convert_dates=True,
957955
convert_categoricals=True, index_col=None,
@@ -970,7 +968,7 @@ def __init__(self, path_or_buf, convert_dates=True,
970968
self._preserve_dtypes = preserve_dtypes
971969
self._columns = columns
972970
self._order_categoricals = order_categoricals
973-
self._encoding = encoding
971+
self._encoding = None
974972
self._chunksize = chunksize
975973

976974
# State variables for the file
@@ -1962,17 +1960,14 @@ class StataWriter(StataParser):
19621960

19631961
_max_string_length = 244
19641962

1963+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
19651964
def __init__(self, fname, data, convert_dates=None, write_index=True,
19661965
encoding="latin-1", byteorder=None, time_stamp=None,
19671966
data_label=None, variable_labels=None):
19681967
super(StataWriter, self).__init__()
19691968
self._convert_dates = {} if convert_dates is None else convert_dates
19701969
self._write_index = write_index
1971-
if encoding is not None:
1972-
if encoding not in VALID_ENCODINGS:
1973-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
1974-
'supported.')
1975-
self._encoding = encoding
1970+
self._encoding = 'latin-1'
19761971
self._time_stamp = time_stamp
19771972
self._data_label = data_label
19781973
self._variable_labels = variable_labels
@@ -2731,16 +2726,18 @@ class StataWriter117(StataWriter):
27312726

27322727
_max_string_length = 2045
27332728

2729+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
27342730
def __init__(self, fname, data, convert_dates=None, write_index=True,
27352731
encoding="latin-1", byteorder=None, time_stamp=None,
27362732
data_label=None, variable_labels=None, convert_strl=None):
27372733
# Shallow copy since convert_strl might be modified later
27382734
self._convert_strl = [] if convert_strl is None else convert_strl[:]
27392735

27402736
super(StataWriter117, self).__init__(fname, data, convert_dates,
2741-
write_index, encoding, byteorder,
2742-
time_stamp, data_label,
2743-
variable_labels)
2737+
write_index, byteorder=byteorder,
2738+
time_stamp=time_stamp,
2739+
data_label=data_label,
2740+
variable_labels=variable_labels)
27442741
self._map = None
27452742
self._strl_blob = None
27462743

pandas/tests/io/test_stata.py

+5-10
Original file line numberDiff line numberDiff line change
@@ -361,16 +361,18 @@ def test_encoding(self, version):
361361

362362
# GH 4626, proper encoding handling
363363
raw = read_stata(self.dta_encoding)
364-
encoded = read_stata(self.dta_encoding, encoding="latin-1")
364+
with tm.assert_produces_warning(FutureWarning):
365+
encoded = read_stata(self.dta_encoding, encoding='latin-1')
365366
result = encoded.kreis1849[0]
366367

367368
expected = raw.kreis1849[0]
368369
assert result == expected
369370
assert isinstance(result, compat.string_types)
370371

371372
with tm.ensure_clean() as path:
372-
encoded.to_stata(path, encoding='latin-1',
373-
write_index=False, version=version)
373+
with tm.assert_produces_warning(FutureWarning):
374+
encoded.to_stata(path, write_index=False, version=version,
375+
encoding='latin-1')
374376
reread_encoded = read_stata(path)
375377
tm.assert_frame_equal(encoded, reread_encoded)
376378

@@ -1349,13 +1351,6 @@ def test_out_of_range_float(self):
13491351
assert 'ColumnTooBig' in cm.exception
13501352
assert 'infinity' in cm.exception
13511353

1352-
def test_invalid_encoding(self):
1353-
# GH15723, validate encoding
1354-
original = self.read_csv(self.csv3)
1355-
with pytest.raises(ValueError):
1356-
with tm.ensure_clean() as path:
1357-
original.to_stata(path, encoding='utf-8')
1358-
13591354
def test_path_pathlib(self):
13601355
df = tm.makeDataFrame()
13611356
df.index.name = 'index'

0 commit comments

Comments
 (0)