Skip to content

Commit 2db05d5

Browse files
authored
Fix reading some RDC compressed pages (#47115)
1 parent d2ffd98 commit 2db05d5

File tree

6 files changed

+36
-22
lines changed

6 files changed

+36
-22
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,7 @@ I/O
844844
- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
845845
- Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
846846
- :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when the ``border`` keyword is set to ``False``.
847+
- Bug in :func:`read_sas` when reading certain types of compressed SAS7BDAT files (:issue:`35545`)
847848
- Bug in :func:`read_sas` where ``None`` was returned rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`)
848849
- Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`)
849850

pandas/io/sas/sas.pyx

+7-7
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,11 @@ cdef enum ColumnTypes:
198198

199199

200200
# type the page_data types
201+
assert len(const.page_meta_types) == 2
201202
cdef:
202-
int page_meta_type = const.page_meta_type
203-
int page_mix_types_0 = const.page_mix_types[0]
204-
int page_mix_types_1 = const.page_mix_types[1]
203+
int page_meta_types_0 = const.page_meta_types[0]
204+
int page_meta_types_1 = const.page_meta_types[1]
205+
int page_mix_type = const.page_mix_type
205206
int page_data_type = const.page_data_type
206207
int subheader_pointers_offset = const.subheader_pointers_offset
207208

@@ -332,7 +333,7 @@ cdef class Parser:
332333

333334
# Loop until a data row is read
334335
while True:
335-
if self.current_page_type == page_meta_type:
336+
if self.current_page_type in (page_meta_types_0, page_meta_types_1):
336337
flag = self.current_row_on_page_index >=\
337338
self.current_page_data_subheader_pointers_len
338339
if flag:
@@ -347,8 +348,7 @@ cdef class Parser:
347348
current_subheader_pointer.offset,
348349
current_subheader_pointer.length)
349350
return False
350-
elif (self.current_page_type == page_mix_types_0 or
351-
self.current_page_type == page_mix_types_1):
351+
elif self.current_page_type == page_mix_type:
352352
align_correction = (
353353
bit_offset
354354
+ subheader_pointers_offset
@@ -366,7 +366,7 @@ cdef class Parser:
366366
if done:
367367
return True
368368
return False
369-
elif self.current_page_type & page_data_type == page_data_type:
369+
elif self.current_page_type == page_data_type:
370370
self.process_byte_array_with_data(
371371
bit_offset
372372
+ subheader_pointers_offset

pandas/io/sas/sas7bdat.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -410,11 +410,11 @@ def _parse_metadata(self) -> None:
410410

411411
def _process_page_meta(self) -> bool:
412412
self._read_page_header()
413-
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
413+
pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type]
414414
if self._current_page_type in pt:
415415
self._process_page_metadata()
416-
is_data_page = self._current_page_type & const.page_data_type
417-
is_mix_page = self._current_page_type in const.page_mix_types
416+
is_data_page = self._current_page_type == const.page_data_type
417+
is_mix_page = self._current_page_type == const.page_mix_type
418418
return bool(
419419
is_data_page
420420
or is_mix_page
@@ -424,7 +424,9 @@ def _process_page_meta(self) -> bool:
424424
def _read_page_header(self):
425425
bit_offset = self._page_bit_offset
426426
tx = const.page_type_offset + bit_offset
427-
self._current_page_type = self._read_int(tx, const.page_type_length)
427+
self._current_page_type = (
428+
self._read_int(tx, const.page_type_length) & const.page_type_mask2
429+
)
428430
tx = const.block_count_offset + bit_offset
429431
self._current_page_block_count = self._read_int(tx, const.block_count_length)
430432
tx = const.subheader_count_offset + bit_offset
@@ -774,13 +776,13 @@ def _read_next_page(self):
774776
raise ValueError(msg)
775777

776778
self._read_page_header()
777-
page_type = self._current_page_type
778-
if page_type == const.page_meta_type:
779+
if self._current_page_type in const.page_meta_types:
779780
self._process_page_metadata()
780781

781-
is_data_page = page_type & const.page_data_type
782-
pt = [const.page_meta_type] + const.page_mix_types
783-
if not is_data_page and self._current_page_type not in pt:
782+
if self._current_page_type not in const.page_meta_types + [
783+
const.page_data_type,
784+
const.page_mix_type,
785+
]:
784786
return self._read_next_page()
785787

786788
return False

pandas/io/sas/sas_constants.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,16 @@
5353
block_count_length = 2
5454
subheader_count_offset = 4
5555
subheader_count_length = 2
56-
page_meta_type = 0
57-
page_data_type = 256
58-
page_amd_type = 1024
59-
page_metc_type = 16384
60-
page_comp_type = -28672
61-
page_mix_types = [512, 640]
56+
page_type_mask = 0x0F00
57+
# Like page_type_mask, but also keeps the top four bits (0xF000), where the "page_comp_type" bits live
58+
page_type_mask2 = 0xF000 | page_type_mask
59+
page_meta_type = 0x0000
60+
page_data_type = 0x0100
61+
page_mix_type = 0x0200
62+
page_amd_type = 0x0400
63+
page_meta2_type = 0x4000
64+
page_comp_type = 0x9000
65+
page_meta_types = [page_meta_type, page_meta2_type]
6266
subheader_pointers_offset = 8
6367
truncated_subheader_id = 1
6468
compressed_subheader_id = 4
Binary file not shown.

pandas/tests/io/sas/test_sas7bdat.py

+7
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,13 @@ def test_null_date(datapath):
343343
tm.assert_frame_equal(df, expected)
344344

345345

346+
def test_meta2_page(datapath):
347+
# GH 35545
348+
fname = datapath("io", "sas", "data", "test_meta2_page.sas7bdat")
349+
df = pd.read_sas(fname)
350+
assert len(df) == 1000
351+
352+
346353
@pytest.mark.parametrize("test_file", ["test2.sas7bdat", "test3.sas7bdat"])
347354
def test_exception_propagation_rdc_rle_decompress(datapath, monkeypatch, test_file):
348355
"""Errors in RLE/RDC decompression should propagate the same error."""

0 commit comments

Comments
 (0)