Skip to content

BUG/ENH: Improve categorical construction when using the iterator in StataReader #34128

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -905,6 +905,7 @@ I/O
- Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`)
- Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`)
- Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`)
- Bug in :class:`~pandas.io.stata.StataReader` which resulted in categorical variables with different dtypes when reading data using an iterator. (:issue:`31544`)

Plotting
^^^^^^^^
Expand Down
76 changes: 68 additions & 8 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,14 @@
iterator : bool, default False
Return StataReader object."""

# Shared "Notes" docstring section, interpolated into the docstrings of
# read_stata and StataReader to warn about chunked/iterator reads of
# partially-labeled categorical variables.
_reader_notes = """\
Notes
-----
Categorical variables read through an iterator may not have the same
categories and dtype. This occurs when a variable stored in a DTA
file is associated with an incomplete set of value labels that only
label a strict subset of the values."""

_read_stata_doc = f"""
Read Stata file into DataFrame.

Expand Down Expand Up @@ -135,6 +143,8 @@
io.stata.StataReader : Low-level reader for Stata data files.
DataFrame.to_stata: Export Stata data files.

{_reader_notes}

Examples
--------
Read a Stata dta file:
Expand Down Expand Up @@ -176,6 +186,8 @@
{_statafile_processing_params1}
{_statafile_processing_params2}
{_chunksize_params}

{_reader_notes}
"""


Expand Down Expand Up @@ -497,6 +509,21 @@ class InvalidColumnName(Warning):
"""


class CategoricalConversionWarning(Warning):
    """
    Warning emitted when a partially-labeled Stata variable is converted to
    a Categorical while reading the file in chunks: because not all values
    are covered by value labels, the categories (and so the dtype) can
    differ from chunk to chunk.
    """

    pass


# Message used with CategoricalConversionWarning when a chunked read meets a
# variable whose value labels cover only a strict subset of its values.
categorical_conversion_warning = """
One or more series with value labels are not fully labeled. Reading this
dataset with an iterator results in categorical variables with different
categories. This occurs since it is not possible to know all possible values
until the entire dataset has been read. To avoid this warning, you can either
read the dataset without an iterator, or manually convert categorical data by
setting ``convert_categoricals`` to False and then accessing the variable
labels through the value_labels method of the reader.
"""


def _cast_to_stata_types(data: DataFrame) -> DataFrame:
"""
Checks the dtypes of the columns of a pandas DataFrame for
Expand Down Expand Up @@ -1023,6 +1050,10 @@ def __init__(
self._order_categoricals = order_categoricals
self._encoding = ""
self._chunksize = chunksize
if self._chunksize is not None and (
not isinstance(chunksize, int) or chunksize <= 0
):
raise ValueError("chunksize must be a positive integer when set.")

# State variables for the file
self._has_string_data = False
Expand Down Expand Up @@ -1488,6 +1519,10 @@ def _read_strls(self) -> None:
self.GSO[str(v_o)] = decoded_va

def __next__(self) -> DataFrame:
    """
    Return the next chunk of the file as a DataFrame (iterator protocol).

    Reads ``self._chunksize`` rows per step, falling back to single-row
    chunks when no chunksize was supplied.
    """
    # NOTE(review): raising ValueError here when _chunksize is None broke
    # ``read_stata(..., iterator=True)`` with no chunksize (GH#37280).
    # Fall back to 1-row chunks instead; StopIteration is propagated by
    # ``read`` when the file is exhausted.
    return self.read(nrows=self._chunksize or 1)

def get_chunk(self, size: Optional[int] = None) -> DataFrame:
Expand Down Expand Up @@ -1753,8 +1788,8 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra

return data[columns]

@staticmethod
def _do_convert_categoricals(
self,
data: DataFrame,
value_label_dict: Dict[str, Dict[Union[float, int], str]],
lbllist: Sequence[str],
Expand All @@ -1768,14 +1803,39 @@ def _do_convert_categoricals(
for col, label in zip(data, lbllist):
if label in value_labels:
# Explicit call with ordered=True
cat_data = Categorical(data[col], ordered=order_categoricals)
categories = []
for category in cat_data.categories:
if category in value_label_dict[label]:
categories.append(value_label_dict[label][category])
else:
categories.append(category) # Partially labeled
vl = value_label_dict[label]
keys = np.array(list(vl.keys()))
column = data[col]
key_matches = column.isin(keys)
if self._chunksize is not None and key_matches.all():
initial_categories = keys
# If all categories are in the keys and we are iterating,
# use the same keys for all chunks. If some are missing
# value labels, then we will fall back to the categories
# varying across chunks.
else:
if self._chunksize is not None:
# warn is using an iterator
warnings.warn(
categorical_conversion_warning, CategoricalConversionWarning
)
initial_categories = None
cat_data = Categorical(
column, categories=initial_categories, ordered=order_categoricals
)
if initial_categories is None:
# If None here, then we need to match the cats in the Categorical
categories = []
for category in cat_data.categories:
if category in vl:
categories.append(vl[category])
else:
categories.append(category)
else:
# If all cats are matched, we can use the values
categories = list(vl.values())
try:
# Try to catch duplicate categories
cat_data.categories = categories
except ValueError as err:
vc = Series(categories).value_counts()
Expand Down
Binary file not shown.
60 changes: 60 additions & 0 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from pandas.io.parsers import read_csv
from pandas.io.stata import (
CategoricalConversionWarning,
InvalidColumnName,
PossiblePrecisionLoss,
StataMissingValue,
Expand Down Expand Up @@ -1923,3 +1924,62 @@ def test_compression_dict(method, file_ext):
fp = path
reread = read_stata(fp, index_col="index")
tm.assert_frame_equal(reread, df)


@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
def test_chunked_categorical(version):
    # GH 31544: chunks read through an iterator must carry the same
    # categorical dtype as the written frame.
    df = DataFrame({"cats": Series(["a", "b", "a", "b", "c"], dtype="category")})
    df.index.name = "index"
    with tm.ensure_clean() as path:
        df.to_stata(path, version=version)
        # Use the reader as a context manager: the original left the file
        # handle open, which leaks the handle and prevents ensure_clean from
        # deleting the temporary file on Windows.
        with StataReader(path, chunksize=2, order_categoricals=False) as reader:
            for i, block in enumerate(reader):
                block = block.set_index("index")
                assert "cats" in block
                tm.assert_series_equal(
                    block.cats, df.cats.iloc[2 * i : 2 * (i + 1)]
                )


def test_chunked_categorical_partial(dirpath):
    # Fixture DTA file where the value labels cover only "a" and "b";
    # the value 3.0 is unlabeled, so a chunked read cannot know the full
    # category set in advance and must warn.
    dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
    values = ["a", "b", "a", "b", 3.0]
    with StataReader(dta_file, chunksize=2) as reader:
        with tm.assert_produces_warning(CategoricalConversionWarning):
            for i, block in enumerate(reader):
                assert list(block.cats) == values[2 * i : 2 * (i + 1)]
                # Chunks 0 and 1 contain only labeled values ("a", "b");
                # chunk 2 holds only the unlabeled 3.0, so its categories
                # differ from the earlier chunks.
                if i < 2:
                    idx = pd.Index(["a", "b"])
                else:
                    idx = pd.Float64Index([3.0])
                tm.assert_index_equal(block.cats.cat.categories, idx)
    # A single chunk spanning the whole file should also warn, but its
    # contents must match a plain (non-chunked) read.
    with tm.assert_produces_warning(CategoricalConversionWarning):
        with StataReader(dta_file, chunksize=5) as reader:
            large_chunk = reader.__next__()
    direct = read_stata(dta_file)
    tm.assert_frame_equal(direct, large_chunk)


def test_iterator_errors(dirpath):
    # Invalid chunksize values must be rejected at construction time.
    dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
    for bad_chunksize in (-1, 0, "apple"):
        with pytest.raises(ValueError, match="chunksize must be a positive"):
            StataReader(dta_file, chunksize=bad_chunksize)
    # Iterating a reader that was built without a chunksize is an error.
    with pytest.raises(ValueError, match="chunksize must be set to a positive"):
        with StataReader(dta_file) as reader:
            reader.__next__()


def test_iterator_value_labels():
    # GH 31544: every chunk read through an iterator must expose the same
    # (fully-labeled) category index.
    values = ["c_label", "b_label"] + ["a_label"] * 500
    df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)})
    with tm.ensure_clean() as path:
        df.to_stata(path, write_index=False)
        expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object")
        # Use the reader as a context manager: the original left the file
        # handle open, leaking it and blocking temp-file cleanup on Windows.
        with pd.read_stata(path, chunksize=100) as reader:
            for j, chunk in enumerate(reader):
                for i in range(2):
                    # .iloc for positional access: Series[int] on a
                    # string-labeled index is deprecated.
                    tm.assert_index_equal(chunk.dtypes.iloc[i].categories, expected)
                tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])