Skip to content

Commit ac8c2c2

Browse files
committed
BUG: Fix encoding for Stata format 118 files
Ensure that Stata 118 files always use utf-8 encoding
1 parent b32fdc4 commit ac8c2c2

File tree

4 files changed

+47
-45
lines changed

4 files changed

+47
-45
lines changed

doc/source/whatsnew/v0.23.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ I/O
9595
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
9696
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
9797
- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` which caused an encoding error when both compression and encoding are specified (:issue:`21241`, :issue:`21118`)
98+
- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
9899
-
99100

100101
Plotting

pandas/io/stata.py

+26-36
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
182182
preserve_dtypes=preserve_dtypes,
183183
columns=columns,
184184
order_categoricals=order_categoricals,
185-
chunksize=chunksize, encoding=encoding)
185+
chunksize=chunksize)
186186

187187
if iterator or chunksize:
188188
data = reader
@@ -838,15 +838,8 @@ def get_base_missing_value(cls, dtype):
838838

839839

840840
class StataParser(object):
841-
_default_encoding = 'latin-1'
842841

843-
def __init__(self, encoding):
844-
if encoding is not None:
845-
if encoding not in VALID_ENCODINGS:
846-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
847-
'supported.')
848-
849-
self._encoding = encoding
842+
def __init__(self):
850843

851844
# type code.
852845
# --------------------
@@ -964,8 +957,8 @@ def __init__(self, path_or_buf, convert_dates=True,
964957
convert_categoricals=True, index_col=None,
965958
convert_missing=False, preserve_dtypes=True,
966959
columns=None, order_categoricals=True,
967-
encoding='latin-1', chunksize=None):
968-
super(StataReader, self).__init__(encoding)
960+
encoding=None, chunksize=None):
961+
super(StataReader, self).__init__()
969962
self.col_sizes = ()
970963

971964
# Arguments to the reader (can be temporarily overridden in
@@ -977,10 +970,6 @@ def __init__(self, path_or_buf, convert_dates=True,
977970
self._preserve_dtypes = preserve_dtypes
978971
self._columns = columns
979972
self._order_categoricals = order_categoricals
980-
if encoding is not None:
981-
if encoding not in VALID_ENCODINGS:
982-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
983-
'supported.')
984973
self._encoding = encoding
985974
self._chunksize = chunksize
986975

@@ -998,18 +987,13 @@ def __init__(self, path_or_buf, convert_dates=True,
998987
path_or_buf = _stringify_path(path_or_buf)
999988
if isinstance(path_or_buf, str):
1000989
path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
1001-
path_or_buf, encoding=self._default_encoding
1002-
)
990+
path_or_buf)
1003991

1004992
if isinstance(path_or_buf, (str, text_type, bytes)):
1005993
self.path_or_buf = open(path_or_buf, 'rb')
1006994
else:
1007995
# Copy to BytesIO, and ensure no encoding
1008996
contents = path_or_buf.read()
1009-
try:
1010-
contents = contents.encode(self._default_encoding)
1011-
except:
1012-
pass
1013997
self.path_or_buf = BytesIO(contents)
1014998

1015999
self._read_header()
@@ -1030,6 +1014,15 @@ def close(self):
10301014
except IOError:
10311015
pass
10321016

1017+
def _set_encoding(self):
1018+
"""
1019+
Set string encoding which depends on file version
1020+
"""
1021+
if self.format_version < 118:
1022+
self._encoding = 'latin-1'
1023+
else:
1024+
self._encoding = 'utf-8'
1025+
10331026
def _read_header(self):
10341027
first_char = self.path_or_buf.read(1)
10351028
if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1042,7 @@ def _read_new_header(self, first_char):
10491042
self.format_version = int(self.path_or_buf.read(3))
10501043
if self.format_version not in [117, 118]:
10511044
raise ValueError(_version_error)
1045+
self._set_encoding()
10521046
self.path_or_buf.read(21) # </release><byteorder>
10531047
self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
10541048
self.path_or_buf.read(15) # </byteorder><K>
@@ -1235,6 +1229,7 @@ def _read_old_header(self, first_char):
12351229
self.format_version = struct.unpack('b', first_char)[0]
12361230
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
12371231
raise ValueError(_version_error)
1232+
self._set_encoding()
12381233
self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
12391234
0] == 0x1 and '>' or '<'
12401235
self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1333,9 @@ def _decode(self, s):
13381333
return s.decode('utf-8')
13391334

13401335
def _null_terminate(self, s):
1341-
if compat.PY3 or self._encoding is not None:
1342-
# have bytes not strings, so must decode
1343-
s = s.partition(b"\0")[0]
1344-
return s.decode(self._encoding or self._default_encoding)
1345-
else:
1346-
null_byte = "\0"
1347-
try:
1348-
return s.lstrip(null_byte)[:s.index(null_byte)]
1349-
except:
1350-
return s
1336+
# have bytes not strings, so must decode
1337+
s = s.partition(b"\0")[0]
1338+
return s.decode(self._encoding)
13511339

13521340
def _read_value_labels(self):
13531341
if self._value_labels_read:
@@ -1433,10 +1421,7 @@ def _read_strls(self):
14331421
self.path_or_buf.read(4))[0]
14341422
va = self.path_or_buf.read(length)
14351423
if typ == 130:
1436-
encoding = 'utf-8'
1437-
if self.format_version == 117:
1438-
encoding = self._encoding or self._default_encoding
1439-
va = va[0:-1].decode(encoding)
1424+
va = va[0:-1].decode(self._encoding)
14401425
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
14411426
self.GSO[str(v_o)] = va
14421427

@@ -1980,9 +1965,14 @@ class StataWriter(StataParser):
19801965
def __init__(self, fname, data, convert_dates=None, write_index=True,
19811966
encoding="latin-1", byteorder=None, time_stamp=None,
19821967
data_label=None, variable_labels=None):
1983-
super(StataWriter, self).__init__(encoding)
1968+
super(StataWriter, self).__init__()
19841969
self._convert_dates = {} if convert_dates is None else convert_dates
19851970
self._write_index = write_index
1971+
if encoding is not None:
1972+
if encoding not in VALID_ENCODINGS:
1973+
raise ValueError('Unknown encoding. Only latin-1 and ascii '
1974+
'supported.')
1975+
self._encoding = encoding
19861976
self._time_stamp = time_stamp
19871977
self._data_label = data_label
19881978
self._variable_labels = variable_labels

pandas/tests/io/data/stata16_118.dta

4.51 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+20-9
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def setup_method(self, method):
9696
self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
9797

9898
self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
99+
self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
99100

100101
self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
101102

@@ -363,19 +364,14 @@ def test_encoding(self, version):
363364
encoded = read_stata(self.dta_encoding, encoding="latin-1")
364365
result = encoded.kreis1849[0]
365366

366-
if compat.PY3:
367-
expected = raw.kreis1849[0]
368-
assert result == expected
369-
assert isinstance(result, compat.string_types)
370-
else:
371-
expected = raw.kreis1849.str.decode("latin-1")[0]
372-
assert result == expected
373-
assert isinstance(result, unicode) # noqa
367+
expected = raw.kreis1849[0]
368+
assert result == expected
369+
assert isinstance(result, compat.string_types)
374370

375371
with tm.ensure_clean() as path:
376372
encoded.to_stata(path, encoding='latin-1',
377373
write_index=False, version=version)
378-
reread_encoded = read_stata(path, encoding='latin-1')
374+
reread_encoded = read_stata(path)
379375
tm.assert_frame_equal(encoded, reread_encoded)
380376

381377
def test_read_write_dta11(self):
@@ -1500,3 +1496,18 @@ def test_gzip_writing(self):
15001496
with gzip.GzipFile(path, 'rb') as gz:
15011497
reread = pd.read_stata(gz, index_col='index')
15021498
tm.assert_frame_equal(df, reread)
1499+
1500+
def test_unicode_dta_118(self):
1501+
unicode_df = self.read_dta(self.dta25_118)
1502+
1503+
columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
1504+
values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
1505+
[u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
1506+
[u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
1507+
[' ', ' ', 'd', ' ', 'd'],
1508+
[' ', '', 'a', ' ', 'a'],
1509+
['', '', 's', '', 's'],
1510+
['', '', ' ', '', ' ']]
1511+
expected = pd.DataFrame(values, columns=columns)
1512+
1513+
tm.assert_frame_equal(unicode_df, expected)

0 commit comments

Comments
 (0)