BUG/ENH: Correct categorical on iterators

Kevin Sheppard · bashtage · commit 3f090641facf · 2020-05-12T14:07:04.000+01:00
Return categoricals with the same categories if possible when reading data through an interator. Warn if not possible. closes pandas-dev#31544
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -777,6 +777,7 @@ I/O
 - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`)
 - Bug in :meth:`read_parquet` was raising a ``FileNotFoundError`` when passed an s3 directory path. (:issue:`26388`)
 - Bug in :meth:`~DataFrame.to_parquet` was throwing an ``AttributeError`` when writing a partitioned parquet file to s3 (:issue:`27596`)
+- Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -480,6 +480,21 @@ class InvalidColumnName(Warning):
 """
 
 
+class CategoricalConversionWarning(Warning):
+    pass
+
+
+categorical_conversion_warning = """
+One or more series with value labels are not fully labeled. Reading this
+dataset with an iterator results in categorical variable with different
+categories. This occurs since it is not possible to know all possible values
+until the entire dataset has been read. To avoid this warning, you can either
+read dataset without an interator, or manually convert categorical data by
+``convert_categoricals`` to False and then accessing the variable labels
+through the value_labels method of the reader.
+"""
+
+
 def _cast_to_stata_types(data: DataFrame) -> DataFrame:
     """
     Checks the dtypes of the columns of a pandas DataFrame for
@@ -1736,8 +1751,8 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra
 
         return data[columns]
 
-    @staticmethod
     def _do_convert_categoricals(
+        self,
         data: DataFrame,
         value_label_dict: Dict[str, Dict[Union[float, int], str]],
         lbllist: Sequence[str],
@@ -1751,14 +1766,36 @@ def _do_convert_categoricals(
         for col, label in zip(data, lbllist):
             if label in value_labels:
                 # Explicit call with ordered=True
-                cat_data = Categorical(data[col], ordered=order_categoricals)
-                categories = []
-                for category in cat_data.categories:
-                    if category in value_label_dict[label]:
-                        categories.append(value_label_dict[label][category])
-                    else:
-                        categories.append(category)  # Partially labeled
+                vl = value_label_dict[label]
+                keys = np.array([k for k in vl.keys()])
+                column = data[col]
+                if column.isin(keys).all() and self._chunksize:
+                    # If all categories are in the keys and we are iterating,
+                    # use the same keys for all chunks. If some are missing
+                    # value labels, then we will fall back to the categories
+                    # varying across chunks.
+                    initial_categories = keys
+                    warnings.warn(
+                        categorical_conversion_warning, CategoricalConversionWarning
+                    )
+                else:
+                    initial_categories = None
+                cat_data = Categorical(
+                    column, categories=initial_categories, ordered=order_categoricals
+                )
+                if initial_categories is None:
+                    # If None here, then we need to match the cats in the Categorical
+                    categories = []
+                    for category in cat_data.categories:
+                        if category in vl:
+                            categories.append(vl[category])
+                        else:
+                            categories.append(category)
+                else:
+                    # If all cats are matched, we can use the values
+                    categories = [v for v in vl.values()]
                 try:
+                    # Try to catch duplicate categories
                     cat_data.categories = categories
                 except ValueError as err:
                     vc = Series(categories).value_counts()
diff --git a/pandas/tests/io/data/stata/stata-dta-partially-labeled.dta b/pandas/tests/io/data/stata/stata-dta-partially-labeled.dta
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -17,6 +17,7 @@
 
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (
+    CategoricalConversionWarning,
     InvalidColumnName,
     PossiblePrecisionLoss,
     StataMissingValue,
@@ -1853,3 +1854,34 @@ def test_writer_118_exceptions(self):
         with tm.ensure_clean() as path:
             with pytest.raises(ValueError, match="You must use version 119"):
                 StataWriterUTF8(path, df, version=118)
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+def test_chunked_categorical(version):
+    df = DataFrame({"cats": Series(["a", "b", "a", "b", "c"], dtype="category")})
+    df.index.name = "index"
+    with tm.ensure_clean() as path:
+        df.to_stata(path, version=version)
+        reader = StataReader(path, chunksize=2, order_categoricals=False)
+        for i, block in enumerate(reader):
+            block = block.set_index("index")
+            assert "cats" in block
+            tm.assert_series_equal(block.cats, df.cats.iloc[2 * i : 2 * (i + 1)])
+
+
+def test_chunked_categorical_partial(dirpath):
+    dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
+    reader = StataReader(dta_file, chunksize=2)
+    values = ["a", "b", "a", "b", 3.0]
+    with pytest.warns(CategoricalConversionWarning, match="One or more series"):
+        for i, block in enumerate(reader):
+            assert list(block.cats) == values[2 * i : 2 * (i + 1)]
+            if i < 2:
+                idx = pd.Index(["a", "b"])
+            else:
+                idx = pd.Float64Index([3.0])
+            tm.assert_index_equal(block.cats.cat.categories, idx)
+    reader = StataReader(dta_file, chunksize=5)
+    large_chunk = reader.__next__()
+    direct = read_stata(dta_file)
+    tm.assert_frame_equal(direct, large_chunk)