diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 4ba9628d8f275..defd7a262f681 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -842,6 +842,7 @@ I/O
- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
- Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
- :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``.
+- Bug in :func:`read_sas` with certain types of compressed SAS7BDAT files (:issue:`35545`)
- Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`)
- Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`)
diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx
index 483bbd70faec9..2df3e1f7243da 100644
--- a/pandas/io/sas/sas.pyx
+++ b/pandas/io/sas/sas.pyx
@@ -198,10 +198,11 @@ cdef enum ColumnTypes:
# type the page_data types
+assert len(const.page_meta_types) == 2
cdef:
- int page_meta_type = const.page_meta_type
- int page_mix_types_0 = const.page_mix_types[0]
- int page_mix_types_1 = const.page_mix_types[1]
+ int page_meta_types_0 = const.page_meta_types[0]
+ int page_meta_types_1 = const.page_meta_types[1]
+ int page_mix_type = const.page_mix_type
int page_data_type = const.page_data_type
int subheader_pointers_offset = const.subheader_pointers_offset
@@ -332,7 +333,7 @@ cdef class Parser:
# Loop until a data row is read
while True:
- if self.current_page_type == page_meta_type:
+ if self.current_page_type in (page_meta_types_0, page_meta_types_1):
flag = self.current_row_on_page_index >=\
self.current_page_data_subheader_pointers_len
if flag:
@@ -347,8 +348,7 @@ cdef class Parser:
current_subheader_pointer.offset,
current_subheader_pointer.length)
return False
- elif (self.current_page_type == page_mix_types_0 or
- self.current_page_type == page_mix_types_1):
+ elif self.current_page_type == page_mix_type:
align_correction = (
bit_offset
+ subheader_pointers_offset
@@ -366,7 +366,7 @@ cdef class Parser:
if done:
return True
return False
- elif self.current_page_type & page_data_type == page_data_type:
+ elif self.current_page_type == page_data_type:
self.process_byte_array_with_data(
bit_offset
+ subheader_pointers_offset
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index 1e071690d35fb..0ed853d619d4e 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -410,11 +410,11 @@ def _parse_metadata(self) -> None:
def _process_page_meta(self) -> bool:
self._read_page_header()
- pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
+ pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type]
if self._current_page_type in pt:
self._process_page_metadata()
- is_data_page = self._current_page_type & const.page_data_type
- is_mix_page = self._current_page_type in const.page_mix_types
+ is_data_page = self._current_page_type == const.page_data_type
+ is_mix_page = self._current_page_type == const.page_mix_type
return bool(
is_data_page
or is_mix_page
@@ -424,7 +424,9 @@ def _process_page_meta(self) -> bool:
def _read_page_header(self):
bit_offset = self._page_bit_offset
tx = const.page_type_offset + bit_offset
- self._current_page_type = self._read_int(tx, const.page_type_length)
+ self._current_page_type = (
+ self._read_int(tx, const.page_type_length) & const.page_type_mask2
+ )
tx = const.block_count_offset + bit_offset
self._current_page_block_count = self._read_int(tx, const.block_count_length)
tx = const.subheader_count_offset + bit_offset
@@ -774,13 +776,13 @@ def _read_next_page(self):
raise ValueError(msg)
self._read_page_header()
- page_type = self._current_page_type
- if page_type == const.page_meta_type:
+ if self._current_page_type in const.page_meta_types:
self._process_page_metadata()
- is_data_page = page_type & const.page_data_type
- pt = [const.page_meta_type] + const.page_mix_types
- if not is_data_page and self._current_page_type not in pt:
+ if self._current_page_type not in const.page_meta_types + [
+ const.page_data_type,
+ const.page_mix_type,
+ ]:
return self._read_next_page()
return False
diff --git a/pandas/io/sas/sas_constants.py b/pandas/io/sas/sas_constants.py
index 23b23a1bf09c0..979b2cacbf706 100644
--- a/pandas/io/sas/sas_constants.py
+++ b/pandas/io/sas/sas_constants.py
@@ -53,12 +53,16 @@
block_count_length = 2
subheader_count_offset = 4
subheader_count_length = 2
-page_meta_type = 0
-page_data_type = 256
-page_amd_type = 1024
-page_metc_type = 16384
-page_comp_type = -28672
-page_mix_types = [512, 640]
+page_type_mask = 0x0F00
+# Keep "page_comp_type" bits
+page_type_mask2 = 0xF000 | page_type_mask
+page_meta_type = 0x0000
+page_data_type = 0x0100
+page_mix_type = 0x0200
+page_amd_type = 0x0400
+page_meta2_type = 0x4000
+page_comp_type = 0x9000
+page_meta_types = [page_meta_type, page_meta2_type]
subheader_pointers_offset = 8
truncated_subheader_id = 1
compressed_subheader_id = 4
diff --git a/pandas/tests/io/sas/data/test_meta2_page.sas7bdat b/pandas/tests/io/sas/data/test_meta2_page.sas7bdat
new file mode 100644
index 0000000000000..9732cabc212cf
Binary files /dev/null and b/pandas/tests/io/sas/data/test_meta2_page.sas7bdat differ
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
index 5cacd7efb5920..3f150c1a061ee 100644
--- a/pandas/tests/io/sas/test_sas7bdat.py
+++ b/pandas/tests/io/sas/test_sas7bdat.py
@@ -343,6 +343,13 @@ def test_null_date(datapath):
tm.assert_frame_equal(df, expected)
+def test_meta2_page(datapath):
+ # GH 35545
+ fname = datapath("io", "sas", "data", "test_meta2_page.sas7bdat")
+ df = pd.read_sas(fname)
+ assert len(df) == 1000
+
+
@pytest.mark.parametrize("test_file", ["test2.sas7bdat", "test3.sas7bdat"])
def test_exception_propagation_rdc_rle_decompress(datapath, monkeypatch, test_file):
"""Errors in RLE/RDC decompression should propagate the same error."""