From c7282a0108a1271eee3edc5d56dee106dcf7a380 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 31 Oct 2024 10:03:19 +0000 Subject: [PATCH 1/2] BUG: Remove incorrect check on value label length Remove the 32,000 limit from the value label length check, since this limit applies to the number of variables, not the length of the value labels closes #60107 --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/stata.py | 6 ------ pandas/tests/io/test_stata.py | 11 +++++++++++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 87d92f6618023..d7d29665950a6 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -687,6 +687,7 @@ I/O - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) +- Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 722e2c79c4e6a..ed89d5766c306 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -691,12 +691,6 @@ def _prepare_value_labels(self) -> None: self.txt.append(category) self.n += 1 - if self.text_len > 32000: - raise ValueError( - "Stata value labels for a single variable must " - "have a combined length less than 32,000 characters." 
- ) - # Ensure int32 self.off = np.array(offsets, dtype=np.int32) self.val = np.array(values, dtype=np.int32) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 4b5369d61bed6..5bcbf0850a428 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -3,7 +3,9 @@ from datetime import datetime import gzip import io +import itertools import os +import string import struct import tarfile import zipfile @@ -2592,3 +2594,12 @@ def test_empty_frame(temp_file): df3 = read_stata(path, columns=["a"]) assert "b" not in df3 tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]]) + + +@pytest.mark.parametrize("version", [114, 117, 118, 119, None]) +def test_many_strl(temp_file, version): + n = 65534 + df = DataFrame(np.arange(n), columns=["col"]) + lbls = ["".join(v) for v in itertools.product(*([string.ascii_letters] * 3))] + value_labels = {"col": {i: lbls[i] for i in range(n)}} + df.to_stata(temp_file, value_labels=value_labels, version=version) From 2bea4f49960ca0db450b320cc04349ba2465eded Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 31 Oct 2024 12:33:10 +0000 Subject: [PATCH 2/2] TST: Remove incorrect test Remove test of the error that was being incorrectly raised --- pandas/tests/io/test_stata.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 5bcbf0850a428..8fa85d13bbdb5 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1165,28 +1165,13 @@ def test_categorical_writing(self, version, temp_file): def test_categorical_warnings_and_errors(self, temp_file): # Warning for non-string labels - # Error for labels too long - original = DataFrame.from_records( - [["a" * 10000], ["b" * 10000], ["c" * 10000], ["d" * 10000]], - columns=["Too_long"], - ) - - original = original.astype("category") - path = temp_file - msg = ( - "Stata value labels for a single variable must have " - r"a 
combined length less than 32,000 characters\." - ) - with pytest.raises(ValueError, match=msg): - original.to_stata(path) - original = DataFrame.from_records( [["a"], ["b"], ["c"], ["d"], [1]], columns=["Too_long"] ).astype("category") msg = "data file created has not lost information due to duplicate labels" with tm.assert_produces_warning(ValueLabelTypeMismatch, match=msg): - original.to_stata(path) + original.to_stata(temp_file) # should get a warning for mixed content @pytest.mark.parametrize("version", [114, 117, 118, 119, None])