BUG: Fix GSO values when writing latin-1 strLs (#24337)

bashtage · jreback · commit 2b5058e974bc · 2018-12-18T09:10:59.000-05:00
The size calculation of the string is incorrect when writing characters
that have a different encoding in latin-1 and utf-8. The utf-8 size
needs to be written instead of the latin-1 size.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1576,6 +1576,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`)
 - Bug in :func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`)
 - Bug in :func:`pandas.io.json.json_normalize` that caused it to raise ``TypeError`` when two consecutive elements of ``record_path`` are dicts (:issue:`22706`)
+- Bug in :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` that produced invalid files when using strLs with non-ASCII characters (:issue:`23573`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -2643,12 +2643,11 @@ def generate_blob(self, gso_table):
             bio.write(gso_type)
 
             # llll
-            encoded = self._encode(strl)
-            bio.write(struct.pack(len_type, len(encoded) + 1))
+            utf8_string = _bytes(strl, 'utf-8')
+            bio.write(struct.pack(len_type, len(utf8_string) + 1))
 
             # xxx...xxx
-            s = _bytes(strl, 'utf-8')
-            bio.write(s)
+            bio.write(utf8_string)
             bio.write(null)
 
         bio.seek(0)
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -16,7 +16,7 @@
 import pandas as pd
 import pandas.util.testing as tm
 import pandas.compat as compat
-from pandas.compat import iterkeys
+from pandas.compat import iterkeys, PY3
 from pandas.core.dtypes.common import is_categorical_dtype
 from pandas.core.frame import DataFrame, Series
 from pandas.io.parsers import read_csv
@@ -1546,3 +1546,23 @@ def test_all_none_exception(self, version):
                 output.to_stata(path, version=version)
         assert 'Only string-like' in excinfo.value.args[0]
         assert 'Column `none`' in excinfo.value.args[0]
+
+    def test_strl_latin1(self):
+        # GH 23573, correct GSO data to reflect correct size
+        output = DataFrame([[u'pandas'] * 2, [u'þâÑÐÅ§'] * 2],
+                           columns=['var_str', 'var_strl'])
+
+        with tm.ensure_clean() as path:
+            output.to_stata(path, version=117, convert_strl=['var_strl'])
+            with open(path, 'rb') as reread:
+                content = reread.read()
+                expected = u'þâÑÐÅ§'
+                assert expected.encode('latin-1') in content
+                assert expected.encode('utf-8') in content
+                gsos = content.split(b'strls')[1][1:-2]
+                for gso in gsos.split(b'GSO')[1:]:
+                    val = gso.split(b'\x00')[-2]
+                    size = gso[gso.find(b'\x82') + 1]
+                    if not PY3:
+                        size = ord(size)
+                    assert len(val) == size - 1