Skip to content

Properly propagate exceptions in sas.pyx #47149

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 8 additions & 11 deletions pandas/io/sas/sas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ ctypedef unsigned short uint16_t
# algorithm. It is partially documented here:
#
# https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff):
cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff) except *:

cdef:
uint8_t control_byte, x
Expand Down Expand Up @@ -116,7 +116,7 @@ cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff)
# rdc_decompress decompresses data using the Ross Data Compression algorithm:
#
# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff):
cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff) except *:

cdef:
uint8_t cmd
Expand Down Expand Up @@ -177,17 +177,14 @@ cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff)
rpos += cnt

# short pattern
elif (cmd >= 3) & (cmd <= 15):
else:
ofs = cnt + 3
ofs += <uint16_t>inbuff[ipos] << 4
ipos += 1
for k in range(cmd):
outbuff[rpos + k] = outbuff[rpos - <int>ofs + k]
rpos += cmd

else:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This case is actually impossible, so I removed it; that way nobody is confused about why we aren't testing the `raise ValueError` branch.

raise ValueError("unknown RDC command")

# In py37 cython/clang sees `len(outbuff)` as size_t and not Py_ssize_t
if <Py_ssize_t>len(outbuff) != <Py_ssize_t>result_length:
raise ValueError(f"RDC: {len(outbuff)} != {result_length}\n")
Expand Down Expand Up @@ -231,7 +228,7 @@ cdef class Parser:
int subheader_pointer_length
int current_page_type
bint is_little_endian
const uint8_t[:] (*decompress)(int result_length, const uint8_t[:] inbuff)
const uint8_t[:] (*decompress)(int result_length, const uint8_t[:] inbuff) except *
object parser

def __init__(self, object parser):
Expand Down Expand Up @@ -294,8 +291,8 @@ cdef class Parser:
self.parser._current_row_in_chunk_index = self.current_row_in_chunk_index
self.parser._current_row_in_file_index = self.current_row_in_file_index

cdef bint read_next_page(self):
cdef done
cdef bint read_next_page(self) except? True:
cdef bint done

done = self.parser._read_next_page()
if done:
Expand All @@ -316,7 +313,7 @@ cdef class Parser:
)
self.current_page_subheaders_count = self.parser._current_page_subheaders_count

cdef readline(self):
cdef bint readline(self) except? True:

cdef:
int offset, bit_offset, align_correction
Expand Down Expand Up @@ -385,7 +382,7 @@ cdef class Parser:
else:
raise ValueError(f"unknown page type: {self.current_page_type}")

cdef void process_byte_array_with_data(self, int offset, int length):
cdef void process_byte_array_with_data(self, int offset, int length) except *:

cdef:
Py_ssize_t j
Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/io/sas/test_sas7bdat.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,36 @@ def test_null_date(datapath):
},
)
tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("test_file", ["test2.sas7bdat", "test3.sas7bdat"])
def test_exception_propagation_rdc_rle_decompress(datapath, monkeypatch, test_file):
    """Errors in RLE/RDC decompression should propagate the same error.

    ``np.zeros`` is replaced with a stub that raises whenever it is called
    with a plain ``int`` shape, and the test asserts that this exact
    exception surfaces from ``pd.read_sas`` for each parametrized file.
    """
    orig_np_zeros = np.zeros

    # The stub mirrors np.zeros' calling convention (first parameter named
    # ``shape``, everything else forwarded) so that unrelated callers using
    # keyword arguments, or omitting ``dtype``, keep working while patched.
    def _patched_zeros(shape, *args, **kwargs):
        if isinstance(shape, int):
            # np.zeros() call in {rdc,rle}_decompress
            raise Exception("Test exception")
        # Other calls to np.zeros are passed through untouched
        return orig_np_zeros(shape, *args, **kwargs)

    monkeypatch.setattr(np, "zeros", _patched_zeros)

    # The anchored pattern ensures the original message arrives unmodified.
    with pytest.raises(Exception, match="^Test exception$"):
        pd.read_sas(datapath("io", "sas", "data", test_file))


def test_exception_propagation_rle_decompress(tmp_path, datapath):
    """Illegal control byte in RLE decompressor should raise the correct ValueError.

    A copy of ``test2.sas7bdat`` with a single byte overwritten is written to
    ``tmp_path`` and read back with ``pd.read_sas``.
    """
    # Location of the byte to corrupt, expressed as page start + offset.
    # NOTE(review): both offsets are specific to test2.sas7bdat; presumably
    # they point at an RLE control byte inside a compressed row - confirm
    # if the sample file is ever regenerated.
    page_start = 0x10000
    offset_in_page = 55229
    bad_control_byte = 0x10

    source_path = datapath("io", "sas", "data", "test2.sas7bdat")
    with open(source_path, "rb") as src:
        contents = bytearray(src.read())

    # Overwrite exactly one byte; the rest of the file is left intact.
    contents[page_start + offset_in_page] = bad_control_byte

    corrupted_path = tmp_path / "test2.sas7bdat"
    corrupted_path.write_bytes(contents)

    with pytest.raises(ValueError, match="unknown control byte"):
        pd.read_sas(corrupted_path)