TST: Add a test to check order invariance

bashtage · Kevin Sheppard · commit 44727177468a · 2020-05-12T22:53:24.000+01:00
Check the label ordering does not cause any issues
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -106,6 +106,14 @@
 iterator : bool, default False
     Return StataReader object."""
 
+_reader_notes = """\
+Notes
+-----
+Categorical variables read through an iterator may not have the same
+categories and dtype. This occurs when a variable stored in a DTA
+file is associated with an incomplete set of value labels that only
+label a strict subset of the values."""
+
 _read_stata_doc = f"""
 Read Stata file into DataFrame.
 
@@ -135,6 +143,8 @@
 io.stata.StataReader : Low-level reader for Stata data files.
 DataFrame.to_stata: Export Stata data files.
 
+{_reader_notes}
+
 Examples
 --------
 Read a Stata dta file:
@@ -176,6 +186,8 @@
 {_statafile_processing_params1}
 {_statafile_processing_params2}
 {_chunksize_params}
+
+{_reader_notes}
 """
 
 
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -1969,3 +1969,17 @@ def test_iterator_errors(dirpath):
     with pytest.raises(ValueError, match="chunksize must be set to a positive"):
         with StataReader(dta_file) as reader:
             reader.__next__()
+
+
+def test_iterator_value_labels():
+    # GH 31544: every chunk must expose the same full, sorted category set
+    values = ["c_label", "b_label"] + ["a_label"] * 500
+    df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)})
+    with tm.ensure_clean() as path:
+        df.to_stata(path, write_index=False)
+        expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object")
+        with pd.read_stata(path, chunksize=100) as reader:  # close even on failure
+            for j, chunk in enumerate(reader):
+                for dtype in chunk.dtypes:
+                    tm.assert_index_equal(dtype.categories, expected)
+                tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])