diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index 69b07d12c1e98..bddb9d3b8e2a7 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -95,6 +95,7 @@ I/O
 - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
 - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
 - Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
+- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
 -
 
 Plotting
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 2797924985c70..8584e1f0e3f14 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -182,7 +182,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
                          preserve_dtypes=preserve_dtypes,
                          columns=columns,
                          order_categoricals=order_categoricals,
-                         chunksize=chunksize, encoding=encoding)
+                         chunksize=chunksize)
 
     if iterator or chunksize:
         data = reader
@@ -838,15 +838,8 @@ def get_base_missing_value(cls, dtype):
 
 
 class StataParser(object):
-    _default_encoding = 'latin-1'
 
-    def __init__(self, encoding):
-        if encoding is not None:
-            if encoding not in VALID_ENCODINGS:
-                raise ValueError('Unknown encoding. Only latin-1 and ascii '
-                                 'supported.')
-
-        self._encoding = encoding
+    def __init__(self):
 
         # type          code.
         # --------------------
@@ -964,8 +957,8 @@ def __init__(self, path_or_buf, convert_dates=True,
                  convert_categoricals=True, index_col=None,
                  convert_missing=False, preserve_dtypes=True,
                  columns=None, order_categoricals=True,
-                 encoding='latin-1', chunksize=None):
-        super(StataReader, self).__init__(encoding)
+                 encoding=None, chunksize=None):
+        super(StataReader, self).__init__()
         self.col_sizes = ()
 
         # Arguments to the reader (can be temporarily overridden in
@@ -977,10 +970,6 @@ def __init__(self, path_or_buf, convert_dates=True,
         self._preserve_dtypes = preserve_dtypes
         self._columns = columns
         self._order_categoricals = order_categoricals
-        if encoding is not None:
-            if encoding not in VALID_ENCODINGS:
-                raise ValueError('Unknown encoding. Only latin-1 and ascii '
-                                 'supported.')
         self._encoding = encoding
         self._chunksize = chunksize
 
@@ -998,18 +987,13 @@ def __init__(self, path_or_buf, convert_dates=True,
         path_or_buf = _stringify_path(path_or_buf)
         if isinstance(path_or_buf, str):
             path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
-                path_or_buf, encoding=self._default_encoding
-            )
+                path_or_buf)
 
         if isinstance(path_or_buf, (str, text_type, bytes)):
             self.path_or_buf = open(path_or_buf, 'rb')
         else:
             # Copy to BytesIO, and ensure no encoding
             contents = path_or_buf.read()
-            try:
-                contents = contents.encode(self._default_encoding)
-            except:
-                pass
             self.path_or_buf = BytesIO(contents)
 
         self._read_header()
@@ -1030,6 +1014,15 @@ def close(self):
         except IOError:
             pass
 
+    def _set_encoding(self):
+        """
+        Set string encoding which depends on file version
+        """
+        if self.format_version < 118:
+            self._encoding = 'latin-1'
+        else:
+            self._encoding = 'utf-8'
+
     def _read_header(self):
         first_char = self.path_or_buf.read(1)
         if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1042,7 @@ def _read_new_header(self, first_char):
         self.format_version = int(self.path_or_buf.read(3))
         if self.format_version not in [117, 118]:
             raise ValueError(_version_error)
+        self._set_encoding()
         self.path_or_buf.read(21)  # </release><byteorder>
         self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
         self.path_or_buf.read(15)  # </byteorder><K>
@@ -1235,6 +1229,7 @@ def _read_old_header(self, first_char):
         self.format_version = struct.unpack('b', first_char)[0]
         if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
             raise ValueError(_version_error)
+        self._set_encoding()
         self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
             0] == 0x1 and '>' or '<'
         self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1333,9 @@ def _decode(self, s):
         return s.decode('utf-8')
 
     def _null_terminate(self, s):
-        if compat.PY3 or self._encoding is not None:
-            # have bytes not strings, so must decode
-            s = s.partition(b"\0")[0]
-            return s.decode(self._encoding or self._default_encoding)
-        else:
-            null_byte = "\0"
-            try:
-                return s.lstrip(null_byte)[:s.index(null_byte)]
-            except:
-                return s
+        # have bytes not strings, so must decode
+        s = s.partition(b"\0")[0]
+        return s.decode(self._encoding)
 
     def _read_value_labels(self):
         if self._value_labels_read:
@@ -1433,10 +1421,7 @@ def _read_strls(self):
                                    self.path_or_buf.read(4))[0]
             va = self.path_or_buf.read(length)
             if typ == 130:
-                encoding = 'utf-8'
-                if self.format_version == 117:
-                    encoding = self._encoding or self._default_encoding
-                va = va[0:-1].decode(encoding)
+                va = va[0:-1].decode(self._encoding)
             # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
             self.GSO[str(v_o)] = va
 
@@ -1980,9 +1965,14 @@ class StataWriter(StataParser):
     def __init__(self, fname, data, convert_dates=None, write_index=True,
                  encoding="latin-1", byteorder=None, time_stamp=None,
                  data_label=None, variable_labels=None):
-        super(StataWriter, self).__init__(encoding)
+        super(StataWriter, self).__init__()
         self._convert_dates = {} if convert_dates is None else convert_dates
         self._write_index = write_index
+        if encoding is not None:
+            if encoding not in VALID_ENCODINGS:
+                raise ValueError('Unknown encoding. Only latin-1 and ascii '
+                                 'supported.')
+        self._encoding = encoding
         self._time_stamp = time_stamp
         self._data_label = data_label
         self._variable_labels = variable_labels
diff --git a/pandas/tests/io/data/stata16_118.dta b/pandas/tests/io/data/stata16_118.dta
new file mode 100644
index 0000000000000..49cfa49d1b302
Binary files /dev/null and b/pandas/tests/io/data/stata16_118.dta differ
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index f3a465da4e87f..e5585902a9dd6 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -96,6 +96,7 @@ def setup_method(self, method):
         self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
 
         self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
+        self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
 
         self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')
 
@@ -363,19 +364,14 @@ def test_encoding(self, version):
         encoded = read_stata(self.dta_encoding, encoding="latin-1")
         result = encoded.kreis1849[0]
 
-        if compat.PY3:
-            expected = raw.kreis1849[0]
-            assert result == expected
-            assert isinstance(result, compat.string_types)
-        else:
-            expected = raw.kreis1849.str.decode("latin-1")[0]
-            assert result == expected
-            assert isinstance(result, unicode)  # noqa
+        expected = raw.kreis1849[0]
+        assert result == expected
+        assert isinstance(result, compat.string_types)
 
         with tm.ensure_clean() as path:
             encoded.to_stata(path, encoding='latin-1', write_index=False,
                              version=version)
-            reread_encoded = read_stata(path, encoding='latin-1')
+            reread_encoded = read_stata(path)
             tm.assert_frame_equal(encoded, reread_encoded)
 
     def test_read_write_dta11(self):
@@ -1500,3 +1496,18 @@ def test_gzip_writing(self):
             with gzip.GzipFile(path, 'rb') as gz:
                 reread = pd.read_stata(gz, index_col='index')
             tm.assert_frame_equal(df, reread)
+
+    def test_unicode_dta_118(self):
+        unicode_df = self.read_dta(self.dta25_118)
+
+        columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
+        values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
+                  [u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
+                  [u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
+                  [' ', ' ', 'd', ' ', 'd'],
+                  [' ', '', 'a', ' ', 'a'],
+                  ['', '', 's', '', 's'],
+                  ['', '', ' ', '', ' ']]
+        expected = pd.DataFrame(values, columns=columns)
+
+        tm.assert_frame_equal(unicode_df, expected)