MAINT: Restrict use of iterator

bashtage · bashtage · commit a0605537622d · 2020-05-12T17:26:33.000+01:00
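Restrict iteration to StataReader instances constructed with a positive integer chunksize: the constructor now raises ValueError for any other non-None chunksize, and __next__ raises ValueError when chunksize is None.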
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -1038,6 +1038,10 @@ def __init__(
         self._order_categoricals = order_categoricals
         self._encoding = ""
         self._chunksize = chunksize
+        if self._chunksize is not None and (
+            not isinstance(chunksize, int) or chunksize <= 0
+        ):
+            raise ValueError("chunksize must be a positive integer when set.")
 
         # State variables for the file
         self._has_string_data = False
@@ -1503,6 +1507,10 @@ def _read_strls(self) -> None:
             self.GSO[str(v_o)] = decoded_va
 
     def __next__(self) -> DataFrame:
+        if self._chunksize is None:
+            raise ValueError(
+                "chunksize must be set to a positive integer to use as an iterator."
+            )
         return self.read(nrows=self._chunksize or 1)
 
     def get_chunk(self, size: Optional[int] = None) -> DataFrame:
@@ -1786,7 +1794,7 @@ def _do_convert_categoricals(
                 vl = value_label_dict[label]
                 keys = np.array([k for k in vl.keys()])
                 column = data[col]
-                if column.isin(keys).all() and self._chunksize:
+                if self._chunksize is not None and column.isin(keys).all():
                     # If all categories are in the keys and we are iterating,
                     # use the same keys for all chunks. If some are missing
                     # value labels, then we will fall back to the categories
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -1955,3 +1955,15 @@ def test_chunked_categorical_partial(dirpath):
     large_chunk = reader.__next__()
     direct = read_stata(dta_file)
     tm.assert_frame_equal(direct, large_chunk)
+
+
+def test_iterator_errors(dirpath):
+    dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta")
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize=-1)
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize=0)
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        StataReader(dta_file, chunksize="apple")
+    with pytest.raises(ValueError, match="chunksize must be set to a positive"):
+        StataReader(dta_file).__next__()