Skip to content

Commit 4bdb2c4

Browse files
committed
BUG: Fix encoding for Stata format 118 files
Ensure that Stata 118 files always use utf-8 encoding. Deprecate the encoding argument of read_stata and StataReader.
1 parent 4274b84 commit 4bdb2c4

File tree

4 files changed

+54
-48
lines changed

4 files changed

+54
-48
lines changed

doc/source/whatsnew/v0.23.1.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ New features
2323
Deprecations
2424
~~~~~~~~~~~~
2525

26-
-
26+
- :func:`read_stata` and :class:`StataReader` have deprecated the ``encoding`` parameter. Stata files only support a single encoding and so this input has no effect. (:issue:`21244`)
2727
-
2828

2929
.. _whatsnew_0231.performance:
@@ -92,7 +92,7 @@ I/O
9292

9393
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
9494
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
95-
-
95+
- Bug in :func:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
9696

9797
Plotting
9898
^^^^^^^^

pandas/io/stata.py

+28-36
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@
169169

170170

171171
@Appender(_read_stata_doc)
172+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
172173
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
173174
def read_stata(filepath_or_buffer, convert_dates=True,
174175
convert_categoricals=True, encoding=None, index_col=None,
@@ -182,7 +183,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
182183
preserve_dtypes=preserve_dtypes,
183184
columns=columns,
184185
order_categoricals=order_categoricals,
185-
chunksize=chunksize, encoding=encoding)
186+
chunksize=chunksize)
186187

187188
if iterator or chunksize:
188189
data = reader
@@ -838,15 +839,8 @@ def get_base_missing_value(cls, dtype):
838839

839840

840841
class StataParser(object):
841-
_default_encoding = 'latin-1'
842842

843-
def __init__(self, encoding):
844-
if encoding is not None:
845-
if encoding not in VALID_ENCODINGS:
846-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
847-
'supported.')
848-
849-
self._encoding = encoding
843+
def __init__(self):
850844

851845
# type code.
852846
# --------------------
@@ -959,13 +953,14 @@ def __init__(self, encoding):
959953
class StataReader(StataParser, BaseIterator):
960954
__doc__ = _stata_reader_doc
961955

956+
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
962957
@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
963958
def __init__(self, path_or_buf, convert_dates=True,
964959
convert_categoricals=True, index_col=None,
965960
convert_missing=False, preserve_dtypes=True,
966961
columns=None, order_categoricals=True,
967-
encoding='latin-1', chunksize=None):
968-
super(StataReader, self).__init__(encoding)
962+
encoding=None, chunksize=None):
963+
super(StataReader, self).__init__()
969964
self.col_sizes = ()
970965

971966
# Arguments to the reader (can be temporarily overridden in
@@ -977,10 +972,6 @@ def __init__(self, path_or_buf, convert_dates=True,
977972
self._preserve_dtypes = preserve_dtypes
978973
self._columns = columns
979974
self._order_categoricals = order_categoricals
980-
if encoding is not None:
981-
if encoding not in VALID_ENCODINGS:
982-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
983-
'supported.')
984975
self._encoding = encoding
985976
self._chunksize = chunksize
986977

@@ -998,18 +989,13 @@ def __init__(self, path_or_buf, convert_dates=True,
998989
path_or_buf = _stringify_path(path_or_buf)
999990
if isinstance(path_or_buf, str):
1000991
path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
1001-
path_or_buf, encoding=self._default_encoding
1002-
)
992+
path_or_buf)
1003993

1004994
if isinstance(path_or_buf, (str, text_type, bytes)):
1005995
self.path_or_buf = open(path_or_buf, 'rb')
1006996
else:
1007997
# Copy to BytesIO, and ensure no encoding
1008998
contents = path_or_buf.read()
1009-
try:
1010-
contents = contents.encode(self._default_encoding)
1011-
except:
1012-
pass
1013999
self.path_or_buf = BytesIO(contents)
10141000

10151001
self._read_header()
@@ -1030,6 +1016,15 @@ def close(self):
10301016
except IOError:
10311017
pass
10321018

1019+
def _set_encoding(self):
1020+
"""
1021+
Set string encoding which depends on file version
1022+
"""
1023+
if self.format_version < 118:
1024+
self._encoding = 'latin-1'
1025+
else:
1026+
self._encoding = 'utf-8'
1027+
10331028
def _read_header(self):
10341029
first_char = self.path_or_buf.read(1)
10351030
if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1044,7 @@ def _read_new_header(self, first_char):
10491044
self.format_version = int(self.path_or_buf.read(3))
10501045
if self.format_version not in [117, 118]:
10511046
raise ValueError(_version_error)
1047+
self._set_encoding()
10521048
self.path_or_buf.read(21) # </release><byteorder>
10531049
self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
10541050
self.path_or_buf.read(15) # </byteorder><K>
@@ -1235,6 +1231,7 @@ def _read_old_header(self, first_char):
12351231
self.format_version = struct.unpack('b', first_char)[0]
12361232
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
12371233
raise ValueError(_version_error)
1234+
self._set_encoding()
12381235
self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
12391236
0] == 0x1 and '>' or '<'
12401237
self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1335,9 @@ def _decode(self, s):
13381335
return s.decode('utf-8')
13391336

13401337
def _null_terminate(self, s):
1341-
if compat.PY3 or self._encoding is not None:
1342-
# have bytes not strings, so must decode
1343-
s = s.partition(b"\0")[0]
1344-
return s.decode(self._encoding or self._default_encoding)
1345-
else:
1346-
null_byte = "\0"
1347-
try:
1348-
return s.lstrip(null_byte)[:s.index(null_byte)]
1349-
except:
1350-
return s
1338+
# have bytes not strings, so must decode
1339+
s = s.partition(b"\0")[0]
1340+
return s.decode(self._encoding)
13511341

13521342
def _read_value_labels(self):
13531343
if self._value_labels_read:
@@ -1433,10 +1423,7 @@ def _read_strls(self):
14331423
self.path_or_buf.read(4))[0]
14341424
va = self.path_or_buf.read(length)
14351425
if typ == 130:
1436-
encoding = 'utf-8'
1437-
if self.format_version == 117:
1438-
encoding = self._encoding or self._default_encoding
1439-
va = va[0:-1].decode(encoding)
1426+
va = va[0:-1].decode(self._encoding)
14401427
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
14411428
self.GSO[str(v_o)] = va
14421429

@@ -1980,9 +1967,14 @@ class StataWriter(StataParser):
19801967
def __init__(self, fname, data, convert_dates=None, write_index=True,
19811968
encoding="latin-1", byteorder=None, time_stamp=None,
19821969
data_label=None, variable_labels=None):
1983-
super(StataWriter, self).__init__(encoding)
1970+
super(StataWriter, self).__init__()
19841971
self._convert_dates = {} if convert_dates is None else convert_dates
19851972
self._write_index = write_index
1973+
if encoding is not None:
1974+
if encoding not in VALID_ENCODINGS:
1975+
raise ValueError('Unknown encoding. Only latin-1 and ascii '
1976+
'supported.')
1977+
self._encoding = encoding
19861978
self._time_stamp = time_stamp
19871979
self._data_label = data_label
19881980
self._variable_labels = variable_labels

pandas/tests/io/data/stata16_118.dta

4.51 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+24-10
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def setup_method(self, method):
9696
self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
9797

9898
self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
99+
self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
99100

100101
self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
101102

@@ -360,22 +361,20 @@ def test_encoding(self, version):
360361

361362
# GH 4626, proper encoding handling
362363
raw = read_stata(self.dta_encoding)
363-
encoded = read_stata(self.dta_encoding, encoding="latin-1")
364+
with warnings.catch_warnings(record=True) as w:
365+
warnings.simplefilter("always")
366+
encoded = read_stata(self.dta_encoding, encoding="latin-1")
367+
assert len(w) == 1
364368
result = encoded.kreis1849[0]
365369

366-
if compat.PY3:
367-
expected = raw.kreis1849[0]
368-
assert result == expected
369-
assert isinstance(result, compat.string_types)
370-
else:
371-
expected = raw.kreis1849.str.decode("latin-1")[0]
372-
assert result == expected
373-
assert isinstance(result, unicode) # noqa
370+
expected = raw.kreis1849[0]
371+
assert result == expected
372+
assert isinstance(result, compat.string_types)
374373

375374
with tm.ensure_clean() as path:
376375
encoded.to_stata(path, encoding='latin-1',
377376
write_index=False, version=version)
378-
reread_encoded = read_stata(path, encoding='latin-1')
377+
reread_encoded = read_stata(path)
379378
tm.assert_frame_equal(encoded, reread_encoded)
380379

381380
def test_read_write_dta11(self):
@@ -1500,3 +1499,18 @@ def test_gzip_writing(self):
15001499
with gzip.GzipFile(path, 'rb') as gz:
15011500
reread = pd.read_stata(gz, index_col='index')
15021501
tm.assert_frame_equal(df, reread)
1502+
1503+
def test_unicode_dta_118(self):
1504+
unicode_df = self.read_dta(self.dta25_118)
1505+
1506+
columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
1507+
values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
1508+
[u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
1509+
[u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
1510+
[' ', ' ', 'd', ' ', 'd'],
1511+
[' ', '', 'a', ' ', 'a'],
1512+
['', '', 's', '', 's'],
1513+
['', '', ' ', '', ' ']]
1514+
expected = pd.DataFrame(values, columns=columns)
1515+
1516+
tm.assert_frame_equal(unicode_df, expected)

0 commit comments

Comments
 (0)