Skip to content

Commit 68c439b

Browse files
authored
BUG: StataWriter value_label encoding (#47199)
1 parent 1b114d7 commit 68c439b

File tree

3 files changed

+13
-5
lines changed

3 files changed

+13
-5
lines changed

doc/source/whatsnew/v1.5.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -828,7 +828,7 @@ I/O
828828
- Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
829829
- :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when the ``border`` keyword is set to ``False``.
830830
- Bug in :func:`read_sas` where ``None`` was returned rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`)
831-
-
831+
- Bug in :class:`StataWriter` where value labels were always written with the default encoding instead of the writer's encoding (:issue:`46750`)
832832

833833
Period
834834
^^^^^^

pandas/io/stata.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -668,7 +668,9 @@ class StataValueLabel:
668668
Encoding to use for value labels.
669669
"""
670670

671-
def __init__(self, catarray: Series, encoding: str = "latin-1") -> None:
671+
def __init__(
672+
self, catarray: Series, encoding: Literal["latin-1", "utf-8"] = "latin-1"
673+
) -> None:
672674

673675
if encoding not in ("latin-1", "utf-8"):
674676
raise ValueError("Only latin-1 and utf-8 are supported.")
@@ -2250,7 +2252,7 @@ class StataWriter(StataParser):
22502252
"""
22512253

22522254
_max_string_length = 244
2253-
_encoding = "latin-1"
2255+
_encoding: Literal["latin-1", "utf-8"] = "latin-1"
22542256

22552257
def __init__(
22562258
self,
@@ -2331,7 +2333,7 @@ def _prepare_non_cat_value_labels(
23312333
f"Can't create value labels for {labname}, value labels "
23322334
"can only be applied to numeric columns."
23332335
)
2334-
svl = StataNonCatValueLabel(colname, labels)
2336+
svl = StataNonCatValueLabel(colname, labels, self._encoding)
23352337
non_cat_value_labels.append(svl)
23362338
return non_cat_value_labels
23372339

@@ -3575,7 +3577,7 @@ class StataWriterUTF8(StataWriter117):
35753577
>>> writer.write_file()
35763578
"""
35773579

3578-
_encoding = "utf-8"
3580+
_encoding: Literal["utf-8"] = "utf-8"
35793581

35803582
def __init__(
35813583
self,

pandas/tests/io/test_stata.py

+6
Original file line numberDiff line numberDiff line change
@@ -1797,6 +1797,7 @@ def test_utf8_writer(self, version):
17971797
"ᴐᴬᵀ": "",
17981798
}
17991799
data_label = "ᴅaᵀa-label"
1800+
value_labels = {"β": {1: "label", 2: "æøå", 3: "ŋot valid latin-1"}}
18001801
data["β"] = data["β"].astype(np.int32)
18011802
with tm.ensure_clean() as path:
18021803
writer = StataWriterUTF8(
@@ -1807,11 +1808,16 @@ def test_utf8_writer(self, version):
18071808
variable_labels=variable_labels,
18081809
write_index=False,
18091810
version=version,
1811+
value_labels=value_labels,
18101812
)
18111813
writer.write_file()
18121814
reread_encoded = read_stata(path)
18131815
# Missing is intentionally converted to empty strl
18141816
data["strls"] = data["strls"].fillna("")
1817+
# Variable with value labels is reread as categorical
1818+
data["β"] = (
1819+
data["β"].replace(value_labels["β"]).astype("category").cat.as_ordered()
1820+
)
18151821
tm.assert_frame_equal(data, reread_encoded)
18161822
reader = StataReader(path)
18171823
assert reader.data_label == data_label

0 commit comments

Comments
 (0)