diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 0bd695f5a40ea..689f5cc7951af 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1576,6 +1576,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`) - Bug in :func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`) - Bug in :func:`pandas.io.json.json_normalize` that caused it to raise ``TypeError`` when two consecutive elements of ``record_path`` are dicts (:issue:`22706`) +- Bug in :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` that produced invalid files when using strLs with non-ASCII characters (:issue:`23573`) Plotting ^^^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index aadb9686bc6d9..b61ca21a53f75 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2643,12 +2643,11 @@ def generate_blob(self, gso_table): bio.write(gso_type) # llll - encoded = self._encode(strl) - bio.write(struct.pack(len_type, len(encoded) + 1)) + utf8_string = _bytes(strl, 'utf-8') + bio.write(struct.pack(len_type, len(utf8_string) + 1)) # xxx...xxx - s = _bytes(strl, 'utf-8') - bio.write(s) + bio.write(utf8_string) bio.write(null) bio.seek(0) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index fb08af36e8325..e6544b8e3ee4b 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -16,7 +16,7 @@ import pandas as pd import pandas.util.testing as tm import pandas.compat as compat -from pandas.compat import iterkeys +from pandas.compat import iterkeys, PY3 from pandas.core.dtypes.common import is_categorical_dtype from pandas.core.frame import DataFrame, 
Series from pandas.io.parsers import read_csv @@ -1546,3 +1546,23 @@ def test_all_none_exception(self, version): output.to_stata(path, version=version) assert 'Only string-like' in excinfo.value.args[0] assert 'Column `none`' in excinfo.value.args[0] + + def test_strl_latin1(self): + # GH 23573, correct GSO data to reflect correct size + output = DataFrame([[u'pandas'] * 2, [u'þâÑÐÅ§'] * 2], + columns=['var_str', 'var_strl']) + + with tm.ensure_clean() as path: + output.to_stata(path, version=117, convert_strl=['var_strl']) + with open(path, 'rb') as reread: + content = reread.read() + expected = u'þâÑÐÅ§' + assert expected.encode('latin-1') in content + assert expected.encode('utf-8') in content + gsos = content.split(b'strls')[1][1:-2] + for gso in gsos.split(b'GSO')[1:]: + val = gso.split(b'\x00')[-2] + size = gso[gso.find(b'\x82') + 1] + if not PY3: + size = ord(size) + assert len(val) == size - 1