From 4a9db3d61883408981dcaeed8926b67d6c689f1c Mon Sep 17 00:00:00 2001 From: eirki Date: Thu, 9 Jun 2022 21:16:05 +0200 Subject: [PATCH] BUG: StataWriterUTF8 removing some valid characters in column names (#47276) --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/io/stata.py | 16 ++++++++++------ pandas/tests/io/test_stata.py | 4 ++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 1b079217f64ea..3348cab0325de 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -845,6 +845,7 @@ I/O - :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``. - Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`) - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`) +- Bug in :class:`StataWriterUTF8` where some valid characters were removed from variable names (:issue:`47276`) Period ^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b8d56172027e1..1a230f4ae4164 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -3646,12 +3646,16 @@ def _validate_variable_name(self, name: str) -> str: # High code points appear to be acceptable for c in name: if ( - ord(c) < 128 - and (c < "A" or c > "Z") - and (c < "a" or c > "z") - and (c < "0" or c > "9") - and c != "_" - ) or 128 <= ord(c) < 256: + ( + ord(c) < 128 + and (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ) + or 128 <= ord(c) < 192 + or c in {"×", "÷"} + ): name = name.replace(c, "_") return name diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 377b8758c250e..f06bf0035c7dc 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1786,11 +1786,11 @@ def test_utf8_writer(self, version): [2.0, 2, "ᴮ", ""], [3.0, 3, "ᴰ", 
None], ], - columns=["a", "β", "ĉ", "strls"], + columns=["Å", "β", "ĉ", "strls"], ) data["ᴐᴬᵀ"] = cat variable_labels = { - "a": "apple", + "Å": "apple", "β": "ᵈᵉᵊ", "ĉ": "ᴎტჄႲႳႴႶႺ", "strls": "Long Strings",