diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index 69b07d12c1e98..bddb9d3b8e2a7 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -95,6 +95,7 @@ I/O
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
+- Bug in :func:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
-
Plotting
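For context, a minimal sketch of the user-facing effect of the whatsnew entry above (not part of the patch; the file and column names are made up): a format-118 file can now be read without passing an encoding argument, because the reader derives the encoding from the file's format version.

    import pandas as pd

    df = pd.read_stata('survey_118.dta')  # Stata 14 / dta version 118, strings stored as UTF-8
    print(df['city'][0])                  # decoded to str correctly on Python 3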
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 2797924985c70..8584e1f0e3f14 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -182,7 +182,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
preserve_dtypes=preserve_dtypes,
columns=columns,
order_categoricals=order_categoricals,
- chunksize=chunksize, encoding=encoding)
+ chunksize=chunksize)
if iterator or chunksize:
data = reader
@@ -838,15 +838,8 @@ def get_base_missing_value(cls, dtype):
class StataParser(object):
- _default_encoding = 'latin-1'
- def __init__(self, encoding):
- if encoding is not None:
- if encoding not in VALID_ENCODINGS:
- raise ValueError('Unknown encoding. Only latin-1 and ascii '
- 'supported.')
-
- self._encoding = encoding
+ def __init__(self):
# type code.
# --------------------
@@ -964,8 +957,8 @@ def __init__(self, path_or_buf, convert_dates=True,
convert_categoricals=True, index_col=None,
convert_missing=False, preserve_dtypes=True,
columns=None, order_categoricals=True,
- encoding='latin-1', chunksize=None):
- super(StataReader, self).__init__(encoding)
+ encoding=None, chunksize=None):
+ super(StataReader, self).__init__()
self.col_sizes = ()
# Arguments to the reader (can be temporarily overridden in
@@ -977,10 +970,6 @@ def __init__(self, path_or_buf, convert_dates=True,
self._preserve_dtypes = preserve_dtypes
self._columns = columns
self._order_categoricals = order_categoricals
- if encoding is not None:
- if encoding not in VALID_ENCODINGS:
- raise ValueError('Unknown encoding. Only latin-1 and ascii '
- 'supported.')
self._encoding = encoding
self._chunksize = chunksize
@@ -998,18 +987,13 @@ def __init__(self, path_or_buf, convert_dates=True,
path_or_buf = _stringify_path(path_or_buf)
if isinstance(path_or_buf, str):
path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
- path_or_buf, encoding=self._default_encoding
- )
+ path_or_buf)
if isinstance(path_or_buf, (str, text_type, bytes)):
self.path_or_buf = open(path_or_buf, 'rb')
else:
# Copy to BytesIO, and ensure no encoding
contents = path_or_buf.read()
- try:
- contents = contents.encode(self._default_encoding)
- except:
- pass
self.path_or_buf = BytesIO(contents)
self._read_header()
@@ -1030,6 +1014,15 @@ def close(self):
except IOError:
pass
+ def _set_encoding(self):
+ """
+ Set string encoding, which depends on the file's format version
+ """
+ if self.format_version < 118:
+ self._encoding = 'latin-1'
+ else:
+ self._encoding = 'utf-8'
+
def _read_header(self):
first_char = self.path_or_buf.read(1)
if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1042,7 @@ def _read_new_header(self, first_char):
self.format_version = int(self.path_or_buf.read(3))
if self.format_version not in [117, 118]:
raise ValueError(_version_error)
+ self._set_encoding()
self.path_or_buf.read(21) #
self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
self.path_or_buf.read(15) #
@@ -1235,6 +1229,7 @@ def _read_old_header(self, first_char):
self.format_version = struct.unpack('b', first_char)[0]
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
raise ValueError(_version_error)
+ self._set_encoding()
self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
0] == 0x1 and '>' or '<'
self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1333,9 @@ def _decode(self, s):
return s.decode('utf-8')
def _null_terminate(self, s):
- if compat.PY3 or self._encoding is not None:
- # have bytes not strings, so must decode
- s = s.partition(b"\0")[0]
- return s.decode(self._encoding or self._default_encoding)
- else:
- null_byte = "\0"
- try:
- return s.lstrip(null_byte)[:s.index(null_byte)]
- except:
- return s
+ # have bytes, not strings, so must decode
+ s = s.partition(b"\0")[0]
+ return s.decode(self._encoding)
def _read_value_labels(self):
if self._value_labels_read:
@@ -1433,10 +1421,7 @@ def _read_strls(self):
self.path_or_buf.read(4))[0]
va = self.path_or_buf.read(length)
if typ == 130:
- encoding = 'utf-8'
- if self.format_version == 117:
- encoding = self._encoding or self._default_encoding
- va = va[0:-1].decode(encoding)
+ va = va[0:-1].decode(self._encoding)
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
self.GSO[str(v_o)] = va
@@ -1980,9 +1965,14 @@ class StataWriter(StataParser):
def __init__(self, fname, data, convert_dates=None, write_index=True,
encoding="latin-1", byteorder=None, time_stamp=None,
data_label=None, variable_labels=None):
- super(StataWriter, self).__init__(encoding)
+ super(StataWriter, self).__init__()
self._convert_dates = {} if convert_dates is None else convert_dates
self._write_index = write_index
+ if encoding is not None:
+ if encoding not in VALID_ENCODINGS:
+ raise ValueError('Unknown encoding. Only latin-1 and ascii '
+ 'supported.')
+ self._encoding = encoding
self._time_stamp = time_stamp
self._data_label = data_label
self._variable_labels = variable_labels
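To summarize the behaviour split introduced in this file (a sketch, not part of the patch; 'demo.dta' is a made-up path): StataReader now selects latin-1 for format versions below 118 and utf-8 for version 118 via _set_encoding(), while StataWriter keeps validating its encoding argument against VALID_ENCODINGS.

    import pandas as pd

    df = pd.DataFrame({'x': ['a', 'b']})
    df.to_stata('demo.dta', encoding='latin-1', write_index=False)  # writer still checks VALID_ENCODINGS
    pd.read_stata('demo.dta')  # reader picks latin-1 itself (format < 118); no encoding argument needed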
diff --git a/pandas/tests/io/data/stata16_118.dta b/pandas/tests/io/data/stata16_118.dta
new file mode 100644
index 0000000000000..49cfa49d1b302
Binary files /dev/null and b/pandas/tests/io/data/stata16_118.dta differ
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index f3a465da4e87f..e5585902a9dd6 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -96,6 +96,7 @@ def setup_method(self, method):
self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
+ self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
@@ -363,19 +364,14 @@ def test_encoding(self, version):
encoded = read_stata(self.dta_encoding, encoding="latin-1")
result = encoded.kreis1849[0]
- if compat.PY3:
- expected = raw.kreis1849[0]
- assert result == expected
- assert isinstance(result, compat.string_types)
- else:
- expected = raw.kreis1849.str.decode("latin-1")[0]
- assert result == expected
- assert isinstance(result, unicode) # noqa
+ expected = raw.kreis1849[0]
+ assert result == expected
+ assert isinstance(result, compat.string_types)
with tm.ensure_clean() as path:
encoded.to_stata(path, encoding='latin-1',
write_index=False, version=version)
- reread_encoded = read_stata(path, encoding='latin-1')
+ reread_encoded = read_stata(path)
tm.assert_frame_equal(encoded, reread_encoded)
def test_read_write_dta11(self):
@@ -1500,3 +1496,18 @@ def test_gzip_writing(self):
with gzip.GzipFile(path, 'rb') as gz:
reread = pd.read_stata(gz, index_col='index')
tm.assert_frame_equal(df, reread)
+
+ def test_unicode_dta_118(self):
+ unicode_df = self.read_dta(self.dta25_118)
+
+ columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
+ values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
+ [u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
+ [u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
+ [' ', ' ', 'd', ' ', 'd'],
+ [' ', '', 'a', ' ', 'a'],
+ ['', '', 's', '', 's'],
+ ['', '', ' ', '', ' ']]
+ expected = pd.DataFrame(values, columns=columns)
+
+ tm.assert_frame_equal(unicode_df, expected)
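
A quick interactive check of the new fixture (a sketch; the path assumes the pandas test-data layout, and _encoding is an internal attribute):

    from pandas.io.stata import StataReader

    reader = StataReader('pandas/tests/io/data/stata16_118.dta')
    assert reader.format_version == 118
    assert reader._encoding == 'utf-8'  # chosen by _set_encoding() during _read_header()
    unicode_df = reader.read()
    reader.close()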