diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4ba9628d8f275..defd7a262f681 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -842,6 +842,7 @@ I/O - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) - :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``. +- Bug in :func:`read_sas` with certain types of compressed SAS7BDAT files (:issue:`35545`) - Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`) - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 483bbd70faec9..2df3e1f7243da 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -198,10 +198,11 @@ cdef enum ColumnTypes: # type the page_data types +assert len(const.page_meta_types) == 2 cdef: - int page_meta_type = const.page_meta_type - int page_mix_types_0 = const.page_mix_types[0] - int page_mix_types_1 = const.page_mix_types[1] + int page_meta_types_0 = const.page_meta_types[0] + int page_meta_types_1 = const.page_meta_types[1] + int page_mix_type = const.page_mix_type int page_data_type = const.page_data_type int subheader_pointers_offset = const.subheader_pointers_offset @@ -332,7 +333,7 @@ cdef class Parser: # Loop until a data row is read while True: - if self.current_page_type == page_meta_type: + if self.current_page_type in (page_meta_types_0, page_meta_types_1): flag = self.current_row_on_page_index >=\ self.current_page_data_subheader_pointers_len if flag: @@ -347,8 +348,7 @@ cdef class Parser: current_subheader_pointer.offset, current_subheader_pointer.length) return False - elif 
(self.current_page_type == page_mix_types_0 or - self.current_page_type == page_mix_types_1): + elif self.current_page_type == page_mix_type: align_correction = ( bit_offset + subheader_pointers_offset @@ -366,7 +366,7 @@ cdef class Parser: if done: return True return False - elif self.current_page_type & page_data_type == page_data_type: + elif self.current_page_type == page_data_type: self.process_byte_array_with_data( bit_offset + subheader_pointers_offset diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 1e071690d35fb..0ed853d619d4e 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -410,11 +410,11 @@ def _parse_metadata(self) -> None: def _process_page_meta(self) -> bool: self._read_page_header() - pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types + pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type] if self._current_page_type in pt: self._process_page_metadata() - is_data_page = self._current_page_type & const.page_data_type - is_mix_page = self._current_page_type in const.page_mix_types + is_data_page = self._current_page_type == const.page_data_type + is_mix_page = self._current_page_type == const.page_mix_type return bool( is_data_page or is_mix_page @@ -424,7 +424,9 @@ def _process_page_meta(self) -> bool: def _read_page_header(self): bit_offset = self._page_bit_offset tx = const.page_type_offset + bit_offset - self._current_page_type = self._read_int(tx, const.page_type_length) + self._current_page_type = ( + self._read_int(tx, const.page_type_length) & const.page_type_mask2 + ) tx = const.block_count_offset + bit_offset self._current_page_block_count = self._read_int(tx, const.block_count_length) tx = const.subheader_count_offset + bit_offset @@ -774,13 +776,13 @@ def _read_next_page(self): raise ValueError(msg) self._read_page_header() - page_type = self._current_page_type - if page_type == const.page_meta_type: + if self._current_page_type in const.page_meta_types: 
self._process_page_metadata() - is_data_page = page_type & const.page_data_type - pt = [const.page_meta_type] + const.page_mix_types - if not is_data_page and self._current_page_type not in pt: + if self._current_page_type not in const.page_meta_types + [ + const.page_data_type, + const.page_mix_type, + ]: return self._read_next_page() return False diff --git a/pandas/io/sas/sas_constants.py b/pandas/io/sas/sas_constants.py index 23b23a1bf09c0..979b2cacbf706 100644 --- a/pandas/io/sas/sas_constants.py +++ b/pandas/io/sas/sas_constants.py @@ -53,12 +53,16 @@ block_count_length = 2 subheader_count_offset = 4 subheader_count_length = 2 -page_meta_type = 0 -page_data_type = 256 -page_amd_type = 1024 -page_metc_type = 16384 -page_comp_type = -28672 -page_mix_types = [512, 640] +page_type_mask = 0x0F00 +# Keep "page_comp_type" bits +page_type_mask2 = 0xF000 | page_type_mask +page_meta_type = 0x0000 +page_data_type = 0x0100 +page_mix_type = 0x0200 +page_amd_type = 0x0400 +page_meta2_type = 0x4000 +page_comp_type = 0x9000 +page_meta_types = [page_meta_type, page_meta2_type] subheader_pointers_offset = 8 truncated_subheader_id = 1 compressed_subheader_id = 4 diff --git a/pandas/tests/io/sas/data/test_meta2_page.sas7bdat b/pandas/tests/io/sas/data/test_meta2_page.sas7bdat new file mode 100644 index 0000000000000..9732cabc212cf Binary files /dev/null and b/pandas/tests/io/sas/data/test_meta2_page.sas7bdat differ diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 5cacd7efb5920..3f150c1a061ee 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -343,6 +343,13 @@ def test_null_date(datapath): tm.assert_frame_equal(df, expected) +def test_meta2_page(datapath): + # GH 35545 + fname = datapath("io", "sas", "data", "test_meta2_page.sas7bdat") + df = pd.read_sas(fname) + assert len(df) == 1000 + + @pytest.mark.parametrize("test_file", ["test2.sas7bdat", "test3.sas7bdat"]) def 
test_exception_propagation_rdc_rle_decompress(datapath, monkeypatch, test_file): """Errors in RLE/RDC decompression should propagate the same error."""