Skip to content

Commit 461dd94

Browse files
committed
BUG: Fix encoding for Stata format 118 files
Ensure that Stata 118 files always use utf-8 encoding. Deprecate the encoding parameter of read_stata and StataReader.
1 parent 4274b84 commit 461dd94

File tree

4 files changed

+47
-49
lines changed

4 files changed

+47
-49
lines changed

doc/source/whatsnew/v0.23.1.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ New features
2323
Deprecations
2424
~~~~~~~~~~~~
2525

26-
-
26+
- :func:`read_stata` and :class:`StataReader` have deprecated the ``encoding`` parameter. Each Stata file format version supports only a single encoding, which is determined from the file itself, and so this input has no effect. (:issue:`21244`)
2727
-
2828

2929
.. _whatsnew_0231.performance:
@@ -92,7 +92,7 @@ I/O
9292

9393
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
9494
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
95-
-
95+
- Bug in :func:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
9696

9797
Plotting
9898
^^^^^^^^

pandas/io/stata.py

+21-37
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,6 @@
3636
from pandas.util._decorators import Appender
3737
from pandas.util._decorators import deprecate_kwarg
3838

39-
VALID_ENCODINGS = ('ascii', 'us-ascii', 'latin-1', 'latin_1', 'iso-8859-1',
40-
'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'L1')
41-
4239
_version_error = ("Version of given Stata file is not 104, 105, 108, "
4340
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
4441
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")
@@ -169,6 +166,7 @@
169166

170167

171168
@Appender(_read_stata_doc)
169+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
172170
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
173171
def read_stata(filepath_or_buffer, convert_dates=True,
174172
convert_categoricals=True, encoding=None, index_col=None,
@@ -182,7 +180,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
182180
preserve_dtypes=preserve_dtypes,
183181
columns=columns,
184182
order_categoricals=order_categoricals,
185-
chunksize=chunksize, encoding=encoding)
183+
chunksize=chunksize)
186184

187185
if iterator or chunksize:
188186
data = reader
@@ -838,15 +836,8 @@ def get_base_missing_value(cls, dtype):
838836

839837

840838
class StataParser(object):
841-
_default_encoding = 'latin-1'
842-
843-
def __init__(self, encoding):
844-
if encoding is not None:
845-
if encoding not in VALID_ENCODINGS:
846-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
847-
'supported.')
848839

849-
self._encoding = encoding
840+
def __init__(self):
850841

851842
# type code.
852843
# --------------------
@@ -959,12 +950,13 @@ def __init__(self, encoding):
959950
class StataReader(StataParser, BaseIterator):
960951
__doc__ = _stata_reader_doc
961952

953+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
962954
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
963955
def __init__(self, path_or_buf, convert_dates=True,
964956
convert_categoricals=True, index_col=None,
965957
convert_missing=False, preserve_dtypes=True,
966958
columns=None, order_categoricals=True,
967-
encoding='latin-1', chunksize=None):
959+
encoding=None, chunksize=None):
968960
super(StataReader, self).__init__(encoding)
969961
self.col_sizes = ()
970962

@@ -977,10 +969,6 @@ def __init__(self, path_or_buf, convert_dates=True,
977969
self._preserve_dtypes = preserve_dtypes
978970
self._columns = columns
979971
self._order_categoricals = order_categoricals
980-
if encoding is not None:
981-
if encoding not in VALID_ENCODINGS:
982-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
983-
'supported.')
984972
self._encoding = encoding
985973
self._chunksize = chunksize
986974

@@ -998,18 +986,13 @@ def __init__(self, path_or_buf, convert_dates=True,
998986
path_or_buf = _stringify_path(path_or_buf)
999987
if isinstance(path_or_buf, str):
1000988
path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
1001-
path_or_buf, encoding=self._default_encoding
1002-
)
989+
path_or_buf)
1003990

1004991
if isinstance(path_or_buf, (str, text_type, bytes)):
1005992
self.path_or_buf = open(path_or_buf, 'rb')
1006993
else:
1007994
# Copy to BytesIO, and ensure no encoding
1008995
contents = path_or_buf.read()
1009-
try:
1010-
contents = contents.encode(self._default_encoding)
1011-
except:
1012-
pass
1013996
self.path_or_buf = BytesIO(contents)
1014997

1015998
self._read_header()
@@ -1030,6 +1013,15 @@ def close(self):
10301013
except IOError:
10311014
pass
10321015

1016+
def _set_encoding(self):
1017+
"""
1018+
Set string encoding which depends on file version
1019+
"""
1020+
if self.format_version < 118:
1021+
self._encoding = 'latin-1'
1022+
else:
1023+
self._encoding = 'utf-8'
1024+
10331025
def _read_header(self):
10341026
first_char = self.path_or_buf.read(1)
10351027
if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1041,7 @@ def _read_new_header(self, first_char):
10491041
self.format_version = int(self.path_or_buf.read(3))
10501042
if self.format_version not in [117, 118]:
10511043
raise ValueError(_version_error)
1044+
self._set_encoding()
10521045
self.path_or_buf.read(21) # </release><byteorder>
10531046
self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
10541047
self.path_or_buf.read(15) # </byteorder><K>
@@ -1235,6 +1228,7 @@ def _read_old_header(self, first_char):
12351228
self.format_version = struct.unpack('b', first_char)[0]
12361229
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
12371230
raise ValueError(_version_error)
1231+
self._set_encoding()
12381232
self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
12391233
0] == 0x1 and '>' or '<'
12401234
self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1332,9 @@ def _decode(self, s):
13381332
return s.decode('utf-8')
13391333

13401334
def _null_terminate(self, s):
1341-
if compat.PY3 or self._encoding is not None:
1342-
# have bytes not strings, so must decode
1343-
s = s.partition(b"\0")[0]
1344-
return s.decode(self._encoding or self._default_encoding)
1345-
else:
1346-
null_byte = "\0"
1347-
try:
1348-
return s.lstrip(null_byte)[:s.index(null_byte)]
1349-
except:
1350-
return s
1335+
# have bytes not strings, so must decode
1336+
s = s.partition(b"\0")[0]
1337+
return s.decode(self._encoding)
13511338

13521339
def _read_value_labels(self):
13531340
if self._value_labels_read:
@@ -1433,10 +1420,7 @@ def _read_strls(self):
14331420
self.path_or_buf.read(4))[0]
14341421
va = self.path_or_buf.read(length)
14351422
if typ == 130:
1436-
encoding = 'utf-8'
1437-
if self.format_version == 117:
1438-
encoding = self._encoding or self._default_encoding
1439-
va = va[0:-1].decode(encoding)
1423+
va = va[0:-1].decode(self._encoding)
14401424
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
14411425
self.GSO[str(v_o)] = va
14421426

pandas/tests/io/data/stata16_118.dta

4.51 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+24-10
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def setup_method(self, method):
9696
self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
9797

9898
self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
99+
self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
99100

100101
self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
101102

@@ -360,22 +361,20 @@ def test_encoding(self, version):
360361

361362
# GH 4626, proper encoding handling
362363
raw = read_stata(self.dta_encoding)
363-
encoded = read_stata(self.dta_encoding, encoding="latin-1")
364+
with warnings.catch_warnings(record=True) as w:
365+
warnings.simplefilter("always")
366+
encoded = read_stata(self.dta_encoding, encoding="latin-1")
367+
assert len(w) == 1
364368
result = encoded.kreis1849[0]
365369

366-
if compat.PY3:
367-
expected = raw.kreis1849[0]
368-
assert result == expected
369-
assert isinstance(result, compat.string_types)
370-
else:
371-
expected = raw.kreis1849.str.decode("latin-1")[0]
372-
assert result == expected
373-
assert isinstance(result, unicode) # noqa
370+
expected = raw.kreis1849[0]
371+
assert result == expected
372+
assert isinstance(result, compat.string_types)
374373

375374
with tm.ensure_clean() as path:
376375
encoded.to_stata(path, encoding='latin-1',
377376
write_index=False, version=version)
378-
reread_encoded = read_stata(path, encoding='latin-1')
377+
reread_encoded = read_stata(path)
379378
tm.assert_frame_equal(encoded, reread_encoded)
380379

381380
def test_read_write_dta11(self):
@@ -1500,3 +1499,18 @@ def test_gzip_writing(self):
15001499
with gzip.GzipFile(path, 'rb') as gz:
15011500
reread = pd.read_stata(gz, index_col='index')
15021501
tm.assert_frame_equal(df, reread)
1502+
1503+
def test_unicode_dta_118(self):
1504+
unicode_df = self.read_dta(self.dta25_118)
1505+
1506+
columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
1507+
values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
1508+
[u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
1509+
[u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
1510+
[' ', ' ', 'd', ' ', 'd'],
1511+
[' ', '', 'a', ' ', 'a'],
1512+
['', '', 's', '', 's'],
1513+
['', '', ' ', '', ' ']]
1514+
expected = pd.DataFrame(values, columns=columns)
1515+
1516+
tm.assert_frame_equal(unicode_df, expected)

0 commit comments

Comments
 (0)