From ac4c2f15489bef8d2157043069eda1d1e98dc5a7 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Fri, 27 May 2022 18:23:49 +0200 Subject: [PATCH] Properly propagate exceptions in sas.pyx --- pandas/io/sas/sas.pyx | 19 +++++++--------- pandas/tests/io/sas/test_sas7bdat.py | 33 ++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index efb8d81c4552d..483bbd70faec9 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -13,7 +13,7 @@ ctypedef unsigned short uint16_t # algorithm. It is partially documented here: # # https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf -cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff): +cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff) except *: cdef: uint8_t control_byte, x @@ -116,7 +116,7 @@ cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff) # rdc_decompress decompresses data using the Ross Data Compression algorithm: # # http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm -cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff): +cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff) except *: cdef: uint8_t cmd @@ -177,7 +177,7 @@ cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff) rpos += cnt # short pattern - elif (cmd >= 3) & (cmd <= 15): + else: ofs = cnt + 3 ofs += inbuff[ipos] << 4 ipos += 1 @@ -185,9 +185,6 @@ cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff) outbuff[rpos + k] = outbuff[rpos - ofs + k] rpos += cmd - else: - raise ValueError("unknown RDC command") - # In py37 cython/clang sees `len(outbuff)` as size_t and not Py_ssize_t if len(outbuff) != result_length: raise ValueError(f"RDC: {len(outbuff)} != {result_length}\n") @@ -231,7 +228,7 @@ cdef class Parser: int subheader_pointer_length int current_page_type bint is_little_endian - const uint8_t[:] (*decompress)(int result_length, const uint8_t[:] inbuff) + const uint8_t[:] (*decompress)(int result_length, const uint8_t[:] inbuff) except * object parser def __init__(self, object parser): @@ -294,8 +291,8 @@ cdef class Parser: self.parser._current_row_in_chunk_index = self.current_row_in_chunk_index self.parser._current_row_in_file_index = self.current_row_in_file_index - cdef bint read_next_page(self): - cdef done + cdef bint read_next_page(self) except? True: + cdef bint done done = self.parser._read_next_page() if done: @@ -316,7 +313,7 @@ cdef class Parser: ) self.current_page_subheaders_count = self.parser._current_page_subheaders_count - cdef readline(self): + cdef bint readline(self) except? True: cdef: int offset, bit_offset, align_correction @@ -385,7 +382,7 @@ cdef class Parser: else: raise ValueError(f"unknown page type: {self.current_page_type}") - cdef void process_byte_array_with_data(self, int offset, int length): + cdef void process_byte_array_with_data(self, int offset, int length) except *: cdef: Py_ssize_t j diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 0dd1fa175fa3f..e797c95958269 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -333,3 +333,36 @@ def test_null_date(datapath): }, ) tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("test_file", ["test2.sas7bdat", "test3.sas7bdat"]) +def test_exception_propagation_rdc_rle_decompress(datapath, monkeypatch, test_file): + """Errors in RLE/RDC decompression should propagate the same error.""" + orig_np_zeros = np.zeros + + def _patched_zeros(size, dtype): + if isinstance(size, int): + # np.zeros() call in {rdc,rle}_decompress + raise Exception("Test exception") + else: + # Other calls to np.zeros + return orig_np_zeros(size, dtype) + + monkeypatch.setattr(np, "zeros", _patched_zeros) + + with pytest.raises(Exception, match="^Test exception$"): + pd.read_sas(datapath("io", "sas", "data", test_file)) + + +def test_exception_propagation_rle_decompress(tmp_path, datapath): + """Illegal control byte in RLE decompressor should raise the correct ValueError.""" + with open(datapath("io", "sas", "data", "test2.sas7bdat"), "rb") as f: + data = bytearray(f.read()) + invalid_control_byte = 0x10 + page_offset = 0x10000 + control_byte_pos = 55229 + data[page_offset + control_byte_pos] = invalid_control_byte + tmp_file = tmp_path / "test2.sas7bdat" + tmp_file.write_bytes(data) + with pytest.raises(ValueError, match="unknown control byte"): + pd.read_sas(tmp_file)