diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 6e8cbc34be062..d11c3699dc86d 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -905,6 +905,7 @@ I/O
 - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`)
 - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`)
 - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`)
+- Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with different dtypes when reading data using an iterator. (:issue:`31544`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index fe8dcf1bdb9aa..e9adf5292ef6f 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -106,6 +106,14 @@
 iterator : bool, default False
     Return StataReader object."""
 
+_reader_notes = """\
+Notes
+-----
+Categorical variables read through an iterator may not have the same
+categories and dtype. This occurs when a variable stored in a DTA
+file is associated with an incomplete set of value labels that only
+label a strict subset of the values."""
+
 _read_stata_doc = f"""
 Read Stata file into DataFrame.
 
@@ -135,6 +143,8 @@
 io.stata.StataReader : Low-level reader for Stata data files.
 DataFrame.to_stata: Export Stata data files.
 
+{_reader_notes}
+
 Examples
 --------
 Read a Stata dta file:
@@ -176,6 +186,8 @@
 {_statafile_processing_params1}
 {_statafile_processing_params2}
 {_chunksize_params}
+
+{_reader_notes}
 """
 
 
@@ -497,6 +509,21 @@ class InvalidColumnName(Warning):
     """
 
 
+class CategoricalConversionWarning(Warning):
+    pass
+
+
+categorical_conversion_warning = """
+One or more series with value labels are not fully labeled. Reading this
+dataset with an iterator results in a categorical variable with different
+categories. This occurs since it is not possible to know all possible values
+until the entire dataset has been read. To avoid this warning, you can either
+read the dataset without an iterator, or manually convert categorical data by
+setting ``convert_categoricals`` to False and then accessing the variable
+labels through the value_labels method of the reader.
+"""
+
+
 def _cast_to_stata_types(data: DataFrame) -> DataFrame:
     """
     Checks the dtypes of the columns of a pandas DataFrame for
@@ -1023,6 +1050,10 @@ def __init__(
         self._order_categoricals = order_categoricals
         self._encoding = ""
         self._chunksize = chunksize
+        if self._chunksize is not None and (
+            not isinstance(chunksize, int) or chunksize <= 0
+        ):
+            raise ValueError("chunksize must be a positive integer when set.")
 
         # State variables for the file
         self._has_string_data = False
@@ -1488,6 +1519,10 @@ def _read_strls(self) -> None:
             self.GSO[str(v_o)] = decoded_va
 
     def __next__(self) -> DataFrame:
+        if self._chunksize is None:
+            raise ValueError(
+                "chunksize must be set to a positive integer to use as an iterator."
+            )
         return self.read(nrows=self._chunksize or 1)
 
     def get_chunk(self, size: Optional[int] = None) -> DataFrame:
@@ -1753,8 +1788,8 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra
 
         return data[columns]
 
-    @staticmethod
     def _do_convert_categoricals(
+        self,
         data: DataFrame,
         value_label_dict: Dict[str, Dict[Union[float, int], str]],
         lbllist: Sequence[str],
@@ -1768,14 +1803,39 @@ def _do_convert_categoricals(
         for col, label in zip(data, lbllist):
             if label in value_labels:
                 # Explicit call with ordered=True
-                cat_data = Categorical(data[col], ordered=order_categoricals)
-                categories = []
-                for category in cat_data.categories:
-                    if category in value_label_dict[label]:
-                        categories.append(value_label_dict[label][category])
-                    else:
-                        categories.append(category)  # Partially labeled
+                vl = value_label_dict[label]
+                keys = np.array(list(vl.keys()))
+                column = data[col]
+                key_matches = column.isin(keys)
+                if self._chunksize is not None and key_matches.all():
+                    initial_categories = keys
+                    # If all categories are in the keys and we are iterating,
+                    # use the same keys for all chunks. If some are missing
+                    # value labels, then we will fall back to the categories
+                    # varying across chunks.
+                else:
+                    if self._chunksize is not None:
+                        # warn if using an iterator
+                        warnings.warn(
+                            categorical_conversion_warning, CategoricalConversionWarning
+                        )
+                    initial_categories = None
+                cat_data = Categorical(
+                    column, categories=initial_categories, ordered=order_categoricals
+                )
+                if initial_categories is None:
+                    # If None here, then we need to match the cats in the Categorical
+                    categories = []
+                    for category in cat_data.categories:
+                        if category in vl:
+                            categories.append(vl[category])
+                        else:
+                            categories.append(category)
+                else:
+                    # If all cats are matched, we can use the values
+                    categories = list(vl.values())
                 try:
+                    # Try to catch duplicate categories
                     cat_data.categories = categories
                 except ValueError as err:
                     vc = Series(categories).value_counts()
diff --git a/pandas/tests/io/data/stata/stata-dta-partially-labeled.dta b/pandas/tests/io/data/stata/stata-dta-partially-labeled.dta
new file mode 100644
index 0000000000000..b9abdb8827432
Binary files /dev/null and b/pandas/tests/io/data/stata/stata-dta-partially-labeled.dta differ
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 698b5417b471b..aa3aa61bbb984 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -20,6 +20,7 @@
 
 from pandas.io.parsers import read_csv
 from pandas.io.stata import (
+    CategoricalConversionWarning,
     InvalidColumnName,
     PossiblePrecisionLoss,
     StataMissingValue,
@@ -1923,3 +1924,62 @@ def test_compression_dict(method, file_ext):
             fp = path
         reread = read_stata(fp, index_col="index")
         tm.assert_frame_equal(reread, df)
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+def test_chunked_categorical(version):
+    df = DataFrame({"cats": Series(["a", "b", "a", "b", "c"], dtype="category")})
+    df.index.name = "index"
+    with tm.ensure_clean() as path:
+        df.to_stata(path, version=version)
+        reader = StataReader(path, chunksize=2, order_categoricals=False)
+        for i, block in enumerate(reader):
+            block = block.set_index("index")
+            assert "cats" in block
+            tm.assert_series_equal(block.cats, df.cats.iloc[2 * i : 2 * (i + 1)])
+
+
+def test_chunked_categorical_partial(dirpath):
+    dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
+    values = ["a", "b", "a", "b", 3.0]
+    with StataReader(dta_file, chunksize=2) as reader:
+        with tm.assert_produces_warning(CategoricalConversionWarning):
+            for i, block in enumerate(reader):
+                assert list(block.cats) == values[2 * i : 2 * (i + 1)]
+                if i < 2:
+                    idx = pd.Index(["a", "b"])
+                else:
+                    idx = pd.Float64Index([3.0])
+                tm.assert_index_equal(block.cats.cat.categories, idx)
+    with tm.assert_produces_warning(CategoricalConversionWarning):
+        with StataReader(dta_file, chunksize=5) as reader:
+            large_chunk = reader.__next__()
+    direct = read_stata(dta_file)
+    tm.assert_frame_equal(direct, large_chunk)
+
+
+def test_iterator_errors(dirpath):
+    dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize=-1)
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize=0)
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize="apple")
+    with pytest.raises(ValueError, match="chunksize must be set to a positive"):
+        with StataReader(dta_file) as reader:
+            reader.__next__()
+
+
+def test_iterator_value_labels():
+    # GH 31544
+    values = ["c_label", "b_label"] + ["a_label"] * 500
+    df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)})
+    with tm.ensure_clean() as path:
+        df.to_stata(path, write_index=False)
+        reader = pd.read_stata(path, chunksize=100)
+        expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object")
+        for j, chunk in enumerate(reader):
+            for i in range(2):
+                tm.assert_index_equal(chunk.dtypes[i].categories, expected)
+            tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])