Skip to content

Commit a060553

Browse files
committed
MAINT: Restrict use of iterator
Restrict iterator to StataReaders constructed with a positive chunksize
1 parent 0f8116e commit a060553

File tree

2 files changed

+21
-1
lines changed

2 files changed

+21
-1
lines changed

pandas/io/stata.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -1038,6 +1038,10 @@ def __init__(
10381038
self._order_categoricals = order_categoricals
10391039
self._encoding = ""
10401040
self._chunksize = chunksize
1041+
if self._chunksize is not None and (
1042+
not isinstance(chunksize, int) or chunksize <= 0
1043+
):
1044+
raise ValueError("chunksize must be a positive integer when set.")
10411045

10421046
# State variables for the file
10431047
self._has_string_data = False
@@ -1503,6 +1507,10 @@ def _read_strls(self) -> None:
15031507
self.GSO[str(v_o)] = decoded_va
15041508

15051509
def __next__(self) -> DataFrame:
    """
    Return the next chunk by delegating to ``self.read``.

    Returns
    -------
    DataFrame
        The result of ``self.read(nrows=...)`` using the configured chunksize.

    Raises
    ------
    ValueError
        If ``self._chunksize`` is None, i.e. no chunksize was configured.
    """
    chunksize = self._chunksize
    if chunksize is not None:
        # A falsy chunksize falls back to requesting a single row.
        return self.read(nrows=chunksize or 1)
    raise ValueError(
        "chunksize must be set to a positive integer to use as an iterator."
    )
15071515

15081516
def get_chunk(self, size: Optional[int] = None) -> DataFrame:
@@ -1786,7 +1794,7 @@ def _do_convert_categoricals(
17861794
vl = value_label_dict[label]
17871795
keys = np.array([k for k in vl.keys()])
17881796
column = data[col]
1789-
if column.isin(keys).all() and self._chunksize:
1797+
if self._chunksize is not None and column.isin(keys).all():
17901798
# If all categories are in the keys and we are iterating,
17911799
# use the same keys for all chunks. If some are missing
17921800
# value labels, then we will fall back to the categories

pandas/tests/io/test_stata.py

+12
Original file line numberDiff line numberDiff line change
@@ -1955,3 +1955,15 @@ def test_chunked_categorical_partial(dirpath):
19551955
large_chunk = reader.__next__()
19561956
direct = read_stata(dta_file)
19571957
tm.assert_frame_equal(direct, large_chunk)
1958+
1959+
1960+
def test_iterator_errors(dirpath):
    """
    Check the chunksize validation and iterator restrictions of StataReader.

    Constructing a reader with a non-positive or non-integer chunksize must
    raise ValueError, and so must advancing a reader that was created without
    a chunksize.
    """
    dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")

    # Invalid chunksize values are expected to be rejected by the constructor.
    with pytest.raises(ValueError, match="chunksize must be a positive"):
        StataReader(dta_file, chunksize=-1)
    with pytest.raises(ValueError, match="chunksize must be a positive"):
        StataReader(dta_file, chunksize=0)
    with pytest.raises(ValueError, match="chunksize must be a positive"):
        StataReader(dta_file, chunksize="apple")

    # This reader is constructed successfully, so close it deterministically
    # via the context manager instead of leaking it when __next__ raises.
    with StataReader(dta_file) as reader:
        with pytest.raises(ValueError, match="chunksize must be set to a positive"):
            next(reader)

0 commit comments

Comments
 (0)