Skip to content

Commit d32365d

Browse files
committed
Fix reading some RDC compressed pages
1 parent c9ce063 commit d32365d

File tree

5 files changed

+36
-22
lines changed

5 files changed

+36
-22
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,7 @@ I/O
814814
- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
815815
- Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
816816
- :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when the ``border`` keyword is set to ``False``.
817+
- Bug in :func:`read_sas` when reading certain types of compressed SAS7BDAT files (:issue:`35545`)
817818
-
818819

819820
Period

pandas/io/sas/sas.pyx

+7-7
Original file line numberDiff line numberDiff line change
@@ -201,10 +201,11 @@ cdef enum ColumnTypes:
201201

202202

203203
# type the page_data types
204+
assert len(const.page_meta_types) == 2
204205
cdef:
205-
int page_meta_type = const.page_meta_type
206-
int page_mix_types_0 = const.page_mix_types[0]
207-
int page_mix_types_1 = const.page_mix_types[1]
206+
int page_meta_types_0 = const.page_meta_types[0]
207+
int page_meta_types_1 = const.page_meta_types[1]
208+
int page_mix_type = const.page_mix_type
208209
int page_data_type = const.page_data_type
209210
int subheader_pointers_offset = const.subheader_pointers_offset
210211

@@ -335,7 +336,7 @@ cdef class Parser:
335336

336337
# Loop until a data row is read
337338
while True:
338-
if self.current_page_type == page_meta_type:
339+
if self.current_page_type in (page_meta_types_0, page_meta_types_1):
339340
flag = self.current_row_on_page_index >=\
340341
self.current_page_data_subheader_pointers_len
341342
if flag:
@@ -350,8 +351,7 @@ cdef class Parser:
350351
current_subheader_pointer.offset,
351352
current_subheader_pointer.length)
352353
return False
353-
elif (self.current_page_type == page_mix_types_0 or
354-
self.current_page_type == page_mix_types_1):
354+
elif self.current_page_type == page_mix_type:
355355
align_correction = (
356356
bit_offset
357357
+ subheader_pointers_offset
@@ -369,7 +369,7 @@ cdef class Parser:
369369
if done:
370370
return True
371371
return False
372-
elif self.current_page_type & page_data_type == page_data_type:
372+
elif self.current_page_type == page_data_type:
373373
self.process_byte_array_with_data(
374374
bit_offset
375375
+ subheader_pointers_offset

pandas/io/sas/sas7bdat.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -410,11 +410,11 @@ def _parse_metadata(self) -> None:
410410

411411
def _process_page_meta(self) -> bool:
412412
self._read_page_header()
413-
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
413+
pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type]
414414
if self._current_page_type in pt:
415415
self._process_page_metadata()
416-
is_data_page = self._current_page_type & const.page_data_type
417-
is_mix_page = self._current_page_type in const.page_mix_types
416+
is_data_page = self._current_page_type == const.page_data_type
417+
is_mix_page = self._current_page_type == const.page_mix_type
418418
return bool(
419419
is_data_page
420420
or is_mix_page
@@ -424,7 +424,9 @@ def _process_page_meta(self) -> bool:
424424
def _read_page_header(self):
425425
bit_offset = self._page_bit_offset
426426
tx = const.page_type_offset + bit_offset
427-
self._current_page_type = self._read_int(tx, const.page_type_length)
427+
self._current_page_type = (
428+
self._read_int(tx, const.page_type_length) & const.page_type_mask2
429+
)
428430
tx = const.block_count_offset + bit_offset
429431
self._current_page_block_count = self._read_int(tx, const.block_count_length)
430432
tx = const.subheader_count_offset + bit_offset
@@ -774,13 +776,13 @@ def _read_next_page(self):
774776
raise ValueError(msg)
775777

776778
self._read_page_header()
777-
page_type = self._current_page_type
778-
if page_type == const.page_meta_type:
779+
if self._current_page_type in const.page_meta_types:
779780
self._process_page_metadata()
780781

781-
is_data_page = page_type & const.page_data_type
782-
pt = [const.page_meta_type] + const.page_mix_types
783-
if not is_data_page and self._current_page_type not in pt:
782+
if self._current_page_type not in const.page_meta_types + [
783+
const.page_data_type,
784+
const.page_mix_type,
785+
]:
784786
return self._read_next_page()
785787

786788
return False

pandas/io/sas/sas_constants.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,16 @@
5353
block_count_length = 2
5454
subheader_count_offset = 4
5555
subheader_count_length = 2
56-
page_meta_type = 0
57-
page_data_type = 256
58-
page_amd_type = 1024
59-
page_metc_type = 16384
60-
page_comp_type = -28672
61-
page_mix_types = [512, 640]
56+
page_type_mask = 0x0F00
57+
# Also keep the high-nibble bits (0xF000) so that the "page_meta2_type" and "page_comp_type" values survive masking
58+
page_type_mask2 = 0xF000 | page_type_mask
59+
page_meta_type = 0x0000
60+
page_data_type = 0x0100
61+
page_mix_type = 0x0200
62+
page_amd_type = 0x0400
63+
page_meta2_type = 0x4000
64+
page_comp_type = 0x9000
65+
page_meta_types = [page_meta_type, page_meta2_type]
6266
subheader_pointers_offset = 8
6367
truncated_subheader_id = 1
6468
compressed_subheader_id = 4

pandas/tests/io/sas/test_sas7bdat.py

+7
Original file line numberDiff line numberDiff line change
@@ -333,3 +333,10 @@ def test_null_date(datapath):
333333
},
334334
)
335335
tm.assert_frame_equal(df, expected)
336+
337+
338+
def test_meta2_page(datapath):
339+
# GH 35545
340+
fname = datapath("io", "sas", "data", "test_meta2_page.sas7bdat")
341+
df = pd.read_sas(fname)
342+
assert len(df) == 1000

0 commit comments

Comments
 (0)