Skip to content

Commit 4472717

Browse files
bashtageKevin Sheppard
authored and
Kevin Sheppard
committed
TST: Add a test to check order invariance
Check that the label ordering does not cause any issues
1 parent 51dcc83 commit 4472717

File tree

2 files changed

+26
-0
lines changed

2 files changed

+26
-0
lines changed

pandas/io/stata.py

+12
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,14 @@
106106
iterator : bool, default False
107107
Return StataReader object."""
108108

109+
_reader_notes = """\
110+
Notes
111+
-----
112+
Categorical variables read through an iterator may not have the same
113+
categories and dtype. This occurs when a variable stored in a DTA
114+
file is associated to an incomplete set of value labels that only
115+
label a strict subset of the values."""
116+
109117
_read_stata_doc = f"""
110118
Read Stata file into DataFrame.
111119
@@ -135,6 +143,8 @@
135143
io.stata.StataReader : Low-level reader for Stata data files.
136144
DataFrame.to_stata: Export Stata data files.
137145
146+
{_reader_notes}
147+
138148
Examples
139149
--------
140150
Read a Stata dta file:
@@ -176,6 +186,8 @@
176186
{_statafile_processing_params1}
177187
{_statafile_processing_params2}
178188
{_chunksize_params}
189+
190+
{_reader_notes}
179191
"""
180192

181193

pandas/tests/io/test_stata.py

+14
Original file line numberDiff line numberDiff line change
@@ -1969,3 +1969,17 @@ def test_iterator_errors(dirpath):
19691969
with pytest.raises(ValueError, match="chunksize must be set to a positive"):
19701970
with StataReader(dta_file) as reader:
19711971
reader.__next__()
1972+
1973+
1974+
def test_iterator_value_labels():
    """Check that chunked Stata reads are invariant to label order.

    Two ordered categorical columns are written whose labels first appear
    in the order ``c_label``, ``b_label``, ``a_label``. The file is then
    read back in chunks of 100 rows, and every chunk must expose the
    categories as ``a_label``, ``b_label``, ``c_label`` and match the
    corresponding slice of the original frame.
    """
    # GH 31544
    values = ["c_label", "b_label"] + ["a_label"] * 500
    df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)})
    with tm.ensure_clean() as path:
        df.to_stata(path, write_index=False)
        expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object")
        # Use the reader as a context manager so its file handle is closed
        # before ensure_clean() removes the temporary file.
        with pd.read_stata(path, chunksize=100) as reader:
            for j, chunk in enumerate(reader):
                for i in range(2):
                    # ``dtypes`` is indexed by column name, so an integer
                    # position must go through ``.iloc``.
                    tm.assert_index_equal(chunk.dtypes.iloc[i].categories, expected)
                tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])

0 commit comments

Comments
 (0)