From e9bfd260b236448a5e7840027926889b6ee77259 Mon Sep 17 00:00:00 2001 From: eirki Date: Tue, 10 May 2022 22:05:24 +0200 Subject: [PATCH 1/2] BUG: Encoding not passed to NonCatValueLabel in StataWriter (#46750) --- pandas/io/stata.py | 10 ++++++---- pandas/tests/io/test_stata.py | 6 ++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d9912f2480e07..b8d56172027e1 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -668,7 +668,9 @@ class StataValueLabel: Encoding to use for value labels. """ - def __init__(self, catarray: Series, encoding: str = "latin-1") -> None: + def __init__( + self, catarray: Series, encoding: Literal["latin-1", "utf-8"] = "latin-1" + ) -> None: if encoding not in ("latin-1", "utf-8"): raise ValueError("Only latin-1 and utf-8 are supported.") @@ -2250,7 +2252,7 @@ class StataWriter(StataParser): """ _max_string_length = 244 - _encoding = "latin-1" + _encoding: Literal["latin-1", "utf-8"] = "latin-1" def __init__( self, @@ -2331,7 +2333,7 @@ def _prepare_non_cat_value_labels( f"Can't create value labels for {labname}, value labels " "can only be applied to numeric columns." 
) - svl = StataNonCatValueLabel(colname, labels) + svl = StataNonCatValueLabel(colname, labels, self._encoding) non_cat_value_labels.append(svl) return non_cat_value_labels @@ -3575,7 +3577,7 @@ class StataWriterUTF8(StataWriter117): >>> writer.write_file() """ - _encoding = "utf-8" + _encoding: Literal["utf-8"] = "utf-8" def __init__( self, diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index c21673af2d979..49aa99da9d81a 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1797,6 +1797,7 @@ def test_utf8_writer(self, version): "ᴐᴬᵀ": "", } data_label = "ᴅaᵀa-label" + value_labels = {"β": {1: "label", 2: "æøå", 3: "ŋot valid latin-1"}} data["β"] = data["β"].astype(np.int32) with tm.ensure_clean() as path: writer = StataWriterUTF8( @@ -1807,11 +1808,16 @@ def test_utf8_writer(self, version): variable_labels=variable_labels, write_index=False, version=version, + value_labels=value_labels, ) writer.write_file() reread_encoded = read_stata(path) # Missing is intentionally converted to empty strl data["strls"] = data["strls"].fillna("") + # Variable with value labels is reread as categorical + data["β"] = ( + data["β"].replace(value_labels["β"]).astype("category").cat.as_ordered() + ) tm.assert_frame_equal(data, reread_encoded) reader = StataReader(path) assert reader.data_label == data_label From bf1c2ae574457778acaf52808136c83d70d433f4 Mon Sep 17 00:00:00 2001 From: eirki Date: Thu, 2 Jun 2022 10:43:10 +0200 Subject: [PATCH 2/2] DOC: Add StataWriter bug fix to whatsnew --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 61848cb127029..cb99f15775ab4 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -820,7 +820,7 @@ I/O - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) - :meth:`to_html` now 
excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``. - Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`) -- +- Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`) Period ^^^^^^