Fix reading some RDC compressed pages #47115

Merged · 6 commits · Jun 9, 2022

1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
@@ -842,6 +842,7 @@ I/O
 - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
 - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
 - :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``.
+- Bug in :func:`read_sas` with certain types of compressed SAS7BDAT files (:issue:`35545`)
 - Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`)
 - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`)

14 changes: 7 additions & 7 deletions pandas/io/sas/sas.pyx
@@ -198,10 +198,11 @@ cdef enum ColumnTypes:


 # type the page_data types
+assert len(const.page_meta_types) == 2
 cdef:
-    int page_meta_type = const.page_meta_type
-    int page_mix_types_0 = const.page_mix_types[0]
-    int page_mix_types_1 = const.page_mix_types[1]
+    int page_meta_types_0 = const.page_meta_types[0]
+    int page_meta_types_1 = const.page_meta_types[1]
+    int page_mix_type = const.page_mix_type
     int page_data_type = const.page_data_type
     int subheader_pointers_offset = const.subheader_pointers_offset

@@ -332,7 +333,7 @@ cdef class Parser:

         # Loop until a data row is read
         while True:
-            if self.current_page_type == page_meta_type:
+            if self.current_page_type in (page_meta_types_0, page_meta_types_1):
                 flag = self.current_row_on_page_index >=\
                     self.current_page_data_subheader_pointers_len
                 if flag:
@@ -347,8 +348,7 @@ cdef class Parser:
                     current_subheader_pointer.offset,
                     current_subheader_pointer.length)
                 return False
-            elif (self.current_page_type == page_mix_types_0 or
-                  self.current_page_type == page_mix_types_1):
+            elif self.current_page_type == page_mix_type:
                 align_correction = (
                     bit_offset
                     + subheader_pointers_offset
@@ -366,7 +366,7 @@ cdef class Parser:
                 if done:
                     return True
                 return False
-            elif self.current_page_type & page_data_type == page_data_type:
+            elif self.current_page_type == page_data_type:
                 self.process_byte_array_with_data(
                     bit_offset
                     + subheader_pointers_offset
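
To make the effect of the Cython changes concrete, here is a plain-Python sketch; it is illustrative only, not the pandas implementation, and the helper names are invented. The constant values mirror the new pandas/io/sas/sas_constants.py further down. The point of the change: a page whose masked type is page_meta2_type (0x4000) used to match neither the single meta comparison nor the mix or data branches, so such pages were effectively skipped; the new predicate treats them like any other meta page.

# Illustrative values copied from the new sas_constants.py below.
page_meta_type = 0x0000
page_data_type = 0x0100
page_mix_type = 0x0200
page_meta2_type = 0x4000
page_meta_types = (page_meta_type, page_meta2_type)


def is_meta_page_old(page_type: int) -> bool:
    # Old check: only the classic meta page type was recognised.
    return page_type == page_meta_type


def is_meta_page_new(page_type: int) -> bool:
    # New check: both meta page types are recognised.
    return page_type in page_meta_types


# A meta2 page, such as those in the GH 35545 test file, is now handled:
assert not is_meta_page_old(page_meta2_type)
assert is_meta_page_new(page_meta2_type)
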
20 changes: 11 additions & 9 deletions pandas/io/sas/sas7bdat.py
@@ -410,11 +410,11 @@ def _parse_metadata(self) -> None:

     def _process_page_meta(self) -> bool:
         self._read_page_header()
-        pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
+        pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type]
         if self._current_page_type in pt:
             self._process_page_metadata()
-        is_data_page = self._current_page_type & const.page_data_type
-        is_mix_page = self._current_page_type in const.page_mix_types
+        is_data_page = self._current_page_type == const.page_data_type
+        is_mix_page = self._current_page_type == const.page_mix_type
         return bool(
             is_data_page
             or is_mix_page
@@ -424,7 +424,9 @@ def _process_page_meta(self) -> bool:
     def _read_page_header(self):
         bit_offset = self._page_bit_offset
         tx = const.page_type_offset + bit_offset
-        self._current_page_type = self._read_int(tx, const.page_type_length)
+        self._current_page_type = (
+            self._read_int(tx, const.page_type_length) & const.page_type_mask2
+        )
         tx = const.block_count_offset + bit_offset
         self._current_page_block_count = self._read_int(tx, const.block_count_length)
         tx = const.subheader_count_offset + bit_offset
@@ -774,13 +776,13 @@ def _read_next_page(self):
             raise ValueError(msg)

         self._read_page_header()
-        page_type = self._current_page_type
-        if page_type == const.page_meta_type:
+        if self._current_page_type in const.page_meta_types:
             self._process_page_metadata()

-        is_data_page = page_type & const.page_data_type
-        pt = [const.page_meta_type] + const.page_mix_types
-        if not is_data_page and self._current_page_type not in pt:
+        if self._current_page_type not in const.page_meta_types + [
+            const.page_data_type,
+            const.page_mix_type,
+        ]:
             return self._read_next_page()

         return False
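
A minimal sketch, assuming only the constants defined in sas_constants.py just below, of what the two sas7bdat.py changes above accomplish together: _read_page_header masks the raw 16-bit page type with page_type_mask2, and _read_next_page then keeps only meta, data and mix pages, recursing past anything else. The helper functions here are illustrative, not pandas API.

# Constants as defined in the new pandas/io/sas/sas_constants.py below.
page_type_mask = 0x0F00
page_type_mask2 = 0xF000 | page_type_mask   # == 0xFF00

page_meta_type = 0x0000
page_data_type = 0x0100
page_mix_type = 0x0200
page_amd_type = 0x0400
page_meta2_type = 0x4000
page_meta_types = [page_meta_type, page_meta2_type]


def normalize_page_type(raw_page_type: int) -> int:
    # Same masking _read_page_header now applies to the value it reads.
    return raw_page_type & page_type_mask2


def keep_page(raw_page_type: int) -> bool:
    # Mirrors the filter at the end of _read_next_page: anything that is
    # not a meta, data or mix page is skipped in favour of the next page.
    page_type = normalize_page_type(raw_page_type)
    return page_type in page_meta_types + [page_data_type, page_mix_type]


assert keep_page(page_meta2_type)      # meta2 pages are no longer skipped
assert not keep_page(page_amd_type)    # amd pages still trigger another read
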
16 changes: 10 additions & 6 deletions pandas/io/sas/sas_constants.py
@@ -53,12 +53,16 @@
 block_count_length = 2
 subheader_count_offset = 4
 subheader_count_length = 2
-page_meta_type = 0
-page_data_type = 256
-page_amd_type = 1024
-page_metc_type = 16384
-page_comp_type = -28672
-page_mix_types = [512, 640]
+page_type_mask = 0x0F00
+# Keep "page_comp_type" bits
+page_type_mask2 = 0xF000 | page_type_mask
+page_meta_type = 0x0000
+page_data_type = 0x0100
+page_mix_type = 0x0200
+page_amd_type = 0x0400
+page_meta2_type = 0x4000
+page_comp_type = 0x9000
+page_meta_types = [page_meta_type, page_meta2_type]
 subheader_pointers_offset = 8
 truncated_subheader_id = 1
 compressed_subheader_id = 4
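
The comment on page_type_mask2 above ("Keep 'page_comp_type' bits") is the key detail: page_type_mask (0x0F00) covers only the second-highest nibble, so on its own it would collapse page_meta2_type (0x4000) and page_comp_type (0x9000) to 0x0000 and make them look like plain meta pages; OR-ing in 0xF000 preserves that top nibble. A quick, purely illustrative check of the arithmetic:

page_type_mask = 0x0F00
page_type_mask2 = 0xF000 | page_type_mask   # == 0xFF00

for name, raw in [("meta", 0x0000), ("data", 0x0100), ("mix", 0x0200),
                  ("amd", 0x0400), ("meta2", 0x4000), ("comp", 0x9000)]:
    # Show each page type under the narrow and the widened mask.
    print(f"{name:>5}: raw={raw:#06x}  "
          f"& page_type_mask -> {raw & page_type_mask:#06x}  "
          f"& page_type_mask2 -> {raw & page_type_mask2:#06x}")

# Only the widened mask keeps meta2 and comp pages distinguishable from meta:
assert (0x4000 & page_type_mask) == 0 and (0x9000 & page_type_mask) == 0
assert (0x4000 & page_type_mask2) == 0x4000
assert (0x9000 & page_type_mask2) == 0x9000
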
pandas/tests/io/sas/data/test_meta2_page.sas7bdat — binary file not shown.
7 changes: 7 additions & 0 deletions pandas/tests/io/sas/test_sas7bdat.py
@@ -343,6 +343,13 @@ def test_null_date(datapath):
     tm.assert_frame_equal(df, expected)


+def test_meta2_page(datapath):
+    # GH 35545
+    fname = datapath("io", "sas", "data", "test_meta2_page.sas7bdat")
+    df = pd.read_sas(fname)
+    assert len(df) == 1000
+
+
 @pytest.mark.parametrize("test_file", ["test2.sas7bdat", "test3.sas7bdat"])
 def test_exception_propagation_rdc_rle_decompress(datapath, monkeypatch, test_file):
     """Errors in RLE/RDC decompression should propagate the same error."""