BUG: Allow empty chunksize in stata reader when using iterator #37302

Merged
merged 1 commit on Oct 23, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.4.rst
@@ -23,6 +23,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrame.resample(...).apply(...)` raised ``AttributeError`` when input was a :class:`DataFrame` and only a :class:`Series` was evaluated (:issue:`36951`)
 - Fixed regression in :class:`PeriodDtype` comparing both equal and unequal to its string representation (:issue:`37265`)
 - Fixed regression in certain offsets (:meth:`pd.offsets.Day() <pandas.tseries.offsets.Day>` and below) no longer being hashable (:issue:`37267`)
+- Fixed regression in :class:`StataReader` which required ``chunksize`` to be manually set when using an iterator to read a dataset (:issue:`37280`)
 
 .. ---------------------------------------------------------------------------
 
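Before diving into the diff, a minimal sketch of the behavior this patch restores. This is illustrative only: "data.dta" is a placeholder path, and StataReader is iterable because it derives from collections.abc.Iterator.

from pandas.io.stata import StataReader

# Before this fix, iterating without an explicit chunksize raised a
# ValueError; with it, the reader falls back to one row per chunk.
with StataReader("data.dta") as reader:
    for chunk in reader:
        print(chunk.shape)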
80 changes: 41 additions & 39 deletions pandas/io/stata.py
@@ -469,7 +469,7 @@ class PossiblePrecisionLoss(Warning):
 
 
 precision_loss_doc = """
-Column converted from %s to %s, and some data are outside of the lossless
+Column converted from {0} to {1}, and some data are outside of the lossless
 conversion range. This may result in a loss of precision in the saved data.
 """
 
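The placeholder change above matters because this template is rendered with str.format(), which leaves printf-style markers untouched. A quick illustration in plain Python (not part of the diff):

# The old "%s" template passed through .format() unchanged:
"from %s to %s".format("uint64", "float64")    # -> 'from %s to %s'
"from {0} to {1}".format("uint64", "float64")  # -> 'from uint64 to float64'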
@@ -543,7 +543,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
     object in a DataFrame.
     """
     ws = ""
-    #  original, if small, if large
+    # original, if small, if large
     conversion_data = (
         (np.bool_, np.int8, np.int8),
         (np.uint8, np.int8, np.int16),
@@ -563,7 +563,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
                     dtype = c_data[1]
                 else:
                     dtype = c_data[2]
-                if c_data[2] == np.float64:  # Warn if necessary
+                if c_data[2] == np.int64:  # Warn if necessary
                     if data[col].max() >= 2 ** 53:
                         ws = precision_loss_doc.format("uint64", "float64")
 
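The 2 ** 53 threshold reflects the 53-bit significand of an IEEE-754 double: every integer with magnitude up to 2 ** 53 converts to float64 exactly, larger ones may not. A quick check, illustrative only:

assert float(2 ** 53) == 2 ** 53             # exactly representable
assert float(2 ** 53 + 1) == float(2 ** 53)  # rounds back down: precision lost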
@@ -627,12 +627,12 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
         self.value_labels = list(zip(np.arange(len(categories)), categories))
         self.value_labels.sort(key=lambda x: x[0])
         self.text_len = 0
-        self.off: List[int] = []
-        self.val: List[int] = []
         self.txt: List[bytes] = []
         self.n = 0
 
         # Compute lengths and setup lists of offsets and labels
+        offsets: List[int] = []
+        values: List[int] = []
         for vl in self.value_labels:
             category = vl[1]
             if not isinstance(category, str):
@@ -642,9 +642,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
                 ValueLabelTypeMismatch,
             )
             category = category.encode(encoding)
-            self.off.append(self.text_len)
+            offsets.append(self.text_len)
             self.text_len += len(category) + 1  # +1 for the padding
-            self.val.append(vl[0])
+            values.append(vl[0])
             self.txt.append(category)
             self.n += 1
@@ -655,8 +655,8 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
             )
 
         # Ensure int32
-        self.off = np.array(self.off, dtype=np.int32)
-        self.val = np.array(self.val, dtype=np.int32)
+        self.off = np.array(offsets, dtype=np.int32)
+        self.val = np.array(values, dtype=np.int32)
 
         # Total length
         self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
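As a sanity check on that length formula, a small worked example with a hypothetical label set (not code from the PR): the table stores a 4-byte count, a 4-byte text length, n int32 offsets, n int32 values, and the NUL-padded label text.

labels = {1: "low", 2: "high"}                        # hypothetical value labels
n = len(labels)                                       # 2
text_len = sum(len(s) + 1 for s in labels.values())   # 9: each label +1 NUL pad
total = 4 + 4 + 4 * n + 4 * n + text_len              # 33 bytes, as in self.len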
@@ -868,23 +868,23 @@ def __init__(self):
         # with a label, but the underlying variable is -127 to 100
         # we're going to drop the label and cast to int
         self.DTYPE_MAP = dict(
-            list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
+            list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
             + [
-                (251, np.int8),
-                (252, np.int16),
-                (253, np.int32),
-                (254, np.float32),
-                (255, np.float64),
+                (251, np.dtype(np.int8)),
+                (252, np.dtype(np.int16)),
+                (253, np.dtype(np.int32)),
+                (254, np.dtype(np.float32)),
+                (255, np.dtype(np.float64)),
             ]
         )
         self.DTYPE_MAP_XML = dict(
             [
-                (32768, np.uint8),  # Keys to GSO
-                (65526, np.float64),
-                (65527, np.float32),
-                (65528, np.int32),
-                (65529, np.int16),
-                (65530, np.int8),
+                (32768, np.dtype(np.uint8)),  # Keys to GSO
+                (65526, np.dtype(np.float64)),
+                (65527, np.dtype(np.float32)),
+                (65528, np.dtype(np.int32)),
+                (65529, np.dtype(np.int16)),
+                (65530, np.dtype(np.int8)),
             ]
         )
         # error: Argument 1 to "list" has incompatible type "str";
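Wrapping each mapped value in np.dtype is a typing normalization rather than a behavior change: scalar types and dtype instances compare equal, but only dtype instances give the map a uniform value type. Illustrated with numpy itself (assuming a numpy version of the pandas 1.1 era; not code from the PR):

import numpy as np

assert np.dtype(np.int8) == np.int8             # equal under comparison...
assert isinstance(np.dtype(np.int8), np.dtype)
assert not isinstance(np.int8, np.dtype)        # ...but np.int8 is a type, not a dtype
assert np.dtype("a5") == np.dtype("S5")         # "a5" is a 5-byte bytes dtype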
@@ -1045,9 +1045,10 @@ def __init__(
         self._order_categoricals = order_categoricals
         self._encoding = ""
         self._chunksize = chunksize
-        if self._chunksize is not None and (
-            not isinstance(chunksize, int) or chunksize <= 0
-        ):
+        self._using_iterator = False
+        if self._chunksize is None:
+            self._chunksize = 1
+        elif not isinstance(chunksize, int) or chunksize <= 0:
             raise ValueError("chunksize must be a positive integer when set.")
 
         # State variables for the file
@@ -1057,7 +1058,7 @@ def __init__(
         self._column_selector_set = False
         self._value_labels_read = False
         self._data_read = False
-        self._dtype = None
+        self._dtype: Optional[np.dtype] = None
         self._lines_read = 0
 
         self._native_byteorder = _set_endianness(sys.byteorder)
@@ -1193,7 +1194,7 @@ def _read_new_header(self) -> None:
     # Get data type information, works for versions 117-119.
     def _get_dtypes(
         self, seek_vartypes: int
-    ) -> Tuple[List[Union[int, str]], List[Union[int, np.dtype]]]:
+    ) -> Tuple[List[Union[int, str]], List[Union[str, np.dtype]]]:
 
         self.path_or_buf.seek(seek_vartypes)
         raw_typlist = [
@@ -1518,11 +1519,8 @@ def _read_strls(self) -> None:
             self.GSO[str(v_o)] = decoded_va
 
     def __next__(self) -> DataFrame:
-        if self._chunksize is None:
-            raise ValueError(
-                "chunksize must be set to a positive integer to use as an iterator."
-            )
-        return self.read(nrows=self._chunksize or 1)
+        self._using_iterator = True
+        return self.read(nrows=self._chunksize)
 
     def get_chunk(self, size: Optional[int] = None) -> DataFrame:
         """
@@ -1690,11 +1688,15 @@ def any_startswith(x: str) -> bool:
         convert = False
         for col in data:
             dtype = data[col].dtype
-            if dtype in (np.float16, np.float32):
-                dtype = np.float64
+            if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
+                dtype = np.dtype(np.float64)
                 convert = True
-            elif dtype in (np.int8, np.int16, np.int32):
-                dtype = np.int64
+            elif dtype in (
+                np.dtype(np.int8),
+                np.dtype(np.int16),
+                np.dtype(np.int32),
+            ):
+                dtype = np.dtype(np.int64)
                 convert = True
             retyped_data.append((col, data[col].astype(dtype)))
         if convert:
@@ -1806,14 +1808,14 @@ def _do_convert_categoricals(
                 keys = np.array(list(vl.keys()))
                 column = data[col]
                 key_matches = column.isin(keys)
-                if self._chunksize is not None and key_matches.all():
-                    initial_categories = keys
+                if self._using_iterator and key_matches.all():
+                    initial_categories: Optional[np.ndarray] = keys
                     # If all categories are in the keys and we are iterating,
                     # use the same keys for all chunks. If some are missing
                     # value labels, then we will fall back to the categories
                     # varying across chunks.
                 else:
-                    if self._chunksize is not None:
+                    if self._using_iterator:
                         # warn is using an iterator
                         warnings.warn(
                             categorical_conversion_warning, CategoricalConversionWarning
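The _using_iterator bookkeeping matters because chunks are typically concatenated afterwards, and categorical columns only concatenate losslessly when every chunk carries identical categories. An illustration in plain pandas (not code from the PR):

import pandas as pd

a = pd.Series(pd.Categorical(["lo"], categories=["lo", "hi"]))
b = pd.Series(pd.Categorical(["hi"], categories=["lo", "hi"]))
assert pd.concat([a, b]).dtype == a.dtype  # same categories: stays categorical

c = pd.Series(pd.Categorical(["hi"], categories=["hi"]))
assert pd.concat([a, c]).dtype == object   # differing categories: falls to object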
@@ -2024,7 +2026,7 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
         "ty",
         "%ty",
     ]:
-        return np.float64  # Stata expects doubles for SIFs
+        return np.dtype(np.float64)  # Stata expects doubles for SIFs
     else:
         raise NotImplementedError(f"Format {fmt} not implemented")
 
20 changes: 17 additions & 3 deletions pandas/tests/io/test_stata.py
@@ -1966,9 +1966,6 @@ def test_iterator_errors(dirpath):
         StataReader(dta_file, chunksize=0)
     with pytest.raises(ValueError, match="chunksize must be a positive"):
         StataReader(dta_file, chunksize="apple")
-    with pytest.raises(ValueError, match="chunksize must be set to a positive"):
-        with StataReader(dta_file) as reader:
-            reader.__next__()
 
 
 def test_iterator_value_labels():
@@ -1983,3 +1980,20 @@ def test_iterator_value_labels():
             for i in range(2):
                 tm.assert_index_equal(chunk.dtypes[i].categories, expected)
             tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
+
+
+def test_precision_loss():
+    df = DataFrame(
+        [[sum(2 ** i for i in range(60)), sum(2 ** i for i in range(52))]],
+        columns=["big", "little"],
+    )
+    with tm.ensure_clean() as path:
+        with tm.assert_produces_warning(
+            PossiblePrecisionLoss, match="Column converted from int64 to float64"
+        ):
+            df.to_stata(path, write_index=False)
+        reread = read_stata(path)
+        expected_dt = Series([np.float64, np.float64], index=["big", "little"])
+        tm.assert_series_equal(reread.dtypes, expected_dt)
+        assert reread.loc[0, "little"] == df.loc[0, "little"]
+        assert reread.loc[0, "big"] == float(df.loc[0, "big"])
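A note on the test's constants: sum(2 ** i for i in range(k)) equals 2 ** k - 1, an all-ones integer k bits wide. 52 bits fit a float64 significand exactly, 60 bits do not, so "big" round-trips with precision loss while "little" survives:

assert float(2 ** 52 - 1) == 2 ** 52 - 1   # 52 bits: exact
assert float(2 ** 60 - 1) != 2 ** 60 - 1   # 60 bits: rounded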