Skip to content

Commit e26e3ee

Browse files
authored
BUG: Remove incorrect check on value label length (#60156)
* BUG: Remove incorrect check on value label length. Remove the 32,000-character limit check on value labels, since that limit applies to the number of variables, not to the length of the value labels. Closes #60107. * TST: Remove incorrect test. Remove the test for the error that was being incorrectly raised.
1 parent e7bb845 commit e26e3ee

File tree

3 files changed

+13
-22
lines changed

3 files changed

+13
-22
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -687,6 +687,7 @@ I/O
687687
- Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
688688
- Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
689689
- Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder="big"``. (:issue:`58969`)
690+
- Bug in :meth:`DataFrame.to_stata` incorrectly raising ``ValueError`` when the combined length of the value labels for a single variable exceeded 32,000 characters. (:issue:`60107`)
690691
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
691692
- Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
692693
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)

pandas/io/stata.py

-6
Original file line numberDiff line numberDiff line change
@@ -691,12 +691,6 @@ def _prepare_value_labels(self) -> None:
691691
self.txt.append(category)
692692
self.n += 1
693693

694-
if self.text_len > 32000:
695-
raise ValueError(
696-
"Stata value labels for a single variable must "
697-
"have a combined length less than 32,000 characters."
698-
)
699-
700694
# Ensure int32
701695
self.off = np.array(offsets, dtype=np.int32)
702696
self.val = np.array(values, dtype=np.int32)

pandas/tests/io/test_stata.py

+12-16
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
from datetime import datetime
44
import gzip
55
import io
6+
import itertools
67
import os
8+
import string
79
import struct
810
import tarfile
911
import zipfile
@@ -1163,28 +1165,13 @@ def test_categorical_writing(self, version, temp_file):
11631165

11641166
def test_categorical_warnings_and_errors(self, temp_file):
11651167
# Warning for non-string labels
1166-
# Error for labels too long
1167-
original = DataFrame.from_records(
1168-
[["a" * 10000], ["b" * 10000], ["c" * 10000], ["d" * 10000]],
1169-
columns=["Too_long"],
1170-
)
1171-
1172-
original = original.astype("category")
1173-
path = temp_file
1174-
msg = (
1175-
"Stata value labels for a single variable must have "
1176-
r"a combined length less than 32,000 characters\."
1177-
)
1178-
with pytest.raises(ValueError, match=msg):
1179-
original.to_stata(path)
1180-
11811168
original = DataFrame.from_records(
11821169
[["a"], ["b"], ["c"], ["d"], [1]], columns=["Too_long"]
11831170
).astype("category")
11841171

11851172
msg = "data file created has not lost information due to duplicate labels"
11861173
with tm.assert_produces_warning(ValueLabelTypeMismatch, match=msg):
1187-
original.to_stata(path)
1174+
original.to_stata(temp_file)
11881175
# should get a warning for mixed content
11891176

11901177
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
@@ -2592,3 +2579,12 @@ def test_empty_frame(temp_file):
25922579
df3 = read_stata(path, columns=["a"])
25932580
assert "b" not in df3
25942581
tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]])
2582+
2583+
2584+
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
2585+
def test_many_strl(temp_file, version):
2586+
n = 65534
2587+
df = DataFrame(np.arange(n), columns=["col"])
2588+
lbls = ["".join(v) for v in itertools.product(*([string.ascii_letters] * 3))]
2589+
value_labels = {"col": {i: lbls[i] for i in range(n)}}
2590+
df.to_stata(temp_file, value_labels=value_labels, version=version)

0 commit comments

Comments
 (0)