BUG: Remove incorrect check on value label length (pandas-dev#60156)

bashtage · web-flow · commit e26e3eec231c · 2024-10-31T08:56:55.000-07:00
* BUG: Remove incorrect check on value label length Remove 32,000 limit on value limit check since this applies to the number of variable, not the length of the value labels closes pandas-dev#60107 * TST: Remove incorrect test Remove test of the error that was being incorrectly raised
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -687,6 +687,7 @@ I/O
 - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
+- Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
 - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
 - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -691,12 +691,6 @@ def _prepare_value_labels(self) -> None:
             self.txt.append(category)
             self.n += 1
 
-        if self.text_len > 32000:
-            raise ValueError(
-                "Stata value labels for a single variable must "
-                "have a combined length less than 32,000 characters."
-            )
-
         # Ensure int32
         self.off = np.array(offsets, dtype=np.int32)
         self.val = np.array(values, dtype=np.int32)
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -3,7 +3,9 @@
 from datetime import datetime
 import gzip
 import io
+import itertools
 import os
+import string
 import struct
 import tarfile
 import zipfile
@@ -1163,28 +1165,13 @@ def test_categorical_writing(self, version, temp_file):
 
     def test_categorical_warnings_and_errors(self, temp_file):
         # Warning for non-string labels
-        # Error for labels too long
-        original = DataFrame.from_records(
-            [["a" * 10000], ["b" * 10000], ["c" * 10000], ["d" * 10000]],
-            columns=["Too_long"],
-        )
-
-        original = original.astype("category")
-        path = temp_file
-        msg = (
-            "Stata value labels for a single variable must have "
-            r"a combined length less than 32,000 characters\."
-        )
-        with pytest.raises(ValueError, match=msg):
-            original.to_stata(path)
-
         original = DataFrame.from_records(
             [["a"], ["b"], ["c"], ["d"], [1]], columns=["Too_long"]
         ).astype("category")
 
         msg = "data file created has not lost information due to duplicate labels"
         with tm.assert_produces_warning(ValueLabelTypeMismatch, match=msg):
-            original.to_stata(path)
+            original.to_stata(temp_file)
             # should get a warning for mixed content
 
     @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
@@ -2592,3 +2579,12 @@ def test_empty_frame(temp_file):
     df3 = read_stata(path, columns=["a"])
     assert "b" not in df3
     tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]])
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+def test_many_strl(temp_file, version):
+    n = 65534
+    df = DataFrame(np.arange(n), columns=["col"])
+    lbls = ["".join(v) for v in itertools.product(*([string.ascii_letters] * 3))]
+    value_labels = {"col": {i: lbls[i] for i in range(n)}}
+    df.to_stata(temp_file, value_labels=value_labels, version=version)