Fix reading some RDC compressed pages #47115

Merged · 6 commits · Jun 9, 2022

1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
@@ -842,6 +842,7 @@ I/O
 - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
 - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
 - :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``.
+- Bug in :func:`read_sas` with certain types of compressed SAS7BDAT files (:issue:`35545`)
 - Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`)
 - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`)

14 changes: 7 additions & 7 deletions pandas/io/sas/sas.pyx
@@ -198,10 +198,11 @@ cdef enum ColumnTypes:


 # type the page_data types
+assert len(const.page_meta_types) == 2
 cdef:
-    int page_meta_type = const.page_meta_type
-    int page_mix_types_0 = const.page_mix_types[0]
-    int page_mix_types_1 = const.page_mix_types[1]
+    int page_meta_types_0 = const.page_meta_types[0]
+    int page_meta_types_1 = const.page_meta_types[1]
+    int page_mix_type = const.page_mix_type
     int page_data_type = const.page_data_type
     int subheader_pointers_offset = const.subheader_pointers_offset

@@ -332,7 +333,7 @@ cdef class Parser:

         # Loop until a data row is read
         while True:
-            if self.current_page_type == page_meta_type:
+            if self.current_page_type in (page_meta_types_0, page_meta_types_1):
                 flag = self.current_row_on_page_index >=\
                     self.current_page_data_subheader_pointers_len
                 if flag:
@@ -347,8 +348,7 @@ cdef class Parser:
                     current_subheader_pointer.offset,
                     current_subheader_pointer.length)
                 return False
-            elif (self.current_page_type == page_mix_types_0 or
-                  self.current_page_type == page_mix_types_1):
+            elif self.current_page_type == page_mix_type:
                 align_correction = (
                     bit_offset
                     + subheader_pointers_offset
@@ -366,7 +366,7 @@ cdef class Parser:
                 if done:
                     return True
                 return False
-            elif self.current_page_type & page_data_type == page_data_type:
+            elif self.current_page_type == page_data_type:
                 self.process_byte_array_with_data(
                     bit_offset
                     + subheader_pointers_offset
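
To make the effect of the Cython changes concrete, here is a plain-Python sketch; it is illustrative only, not the pandas implementation, and the helper names are invented. The constant values mirror the new pandas/io/sas/sas_constants.py further down. The point of the change: a page whose masked type is page_meta2_type (0x4000) used to match neither the single meta comparison nor the mix or data branches, so such pages were effectively skipped; the new predicate treats them like any other meta page.

# Illustrative values copied from the new sas_constants.py below.
page_meta_type = 0x0000
page_data_type = 0x0100
page_mix_type = 0x0200
page_meta2_type = 0x4000
page_meta_types = (page_meta_type, page_meta2_type)


def is_meta_page_old(page_type: int) -> bool:
    # Old check: only the classic meta page type was recognised.
    return page_type == page_meta_type


def is_meta_page_new(page_type: int) -> bool:
    # New check: both meta page types are recognised.
    return page_type in page_meta_types


# A meta2 page, such as those in the GH 35545 test file, is now handled:
assert not is_meta_page_old(page_meta2_type)
assert is_meta_page_new(page_meta2_type)
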
20 changes: 11 additions & 9 deletions pandas/io/sas/sas7bdat.py
@@ -410,11 +410,11 @@ def _parse_metadata(self) -> None:

     def _process_page_meta(self) -> bool:
         self._read_page_header()
-        pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
+        pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type]
         if self._current_page_type in pt:
             self._process_page_metadata()
-        is_data_page = self._current_page_type & const.page_data_type
-        is_mix_page = self._current_page_type in const.page_mix_types
+        is_data_page = self._current_page_type == const.page_data_type
+        is_mix_page = self._current_page_type == const.page_mix_type
         return bool(
             is_data_page
             or is_mix_page
@@ -424,7 +424,9 @@ def _process_page_meta(self) -> bool:
     def _read_page_header(self):
         bit_offset = self._page_bit_offset
         tx = const.page_type_offset + bit_offset
-        self._current_page_type = self._read_int(tx, const.page_type_length)
+        self._current_page_type = (
+            self._read_int(tx, const.page_type_length) & const.page_type_mask2
+        )
         tx = const.block_count_offset + bit_offset
         self._current_page_block_count = self._read_int(tx, const.block_count_length)
         tx = const.subheader_count_offset + bit_offset
@@ -774,13 +776,13 @@ def _read_next_page(self):
             raise ValueError(msg)

         self._read_page_header()
-        page_type = self._current_page_type
-        if page_type == const.page_meta_type:
+        if self._current_page_type in const.page_meta_types:
             self._process_page_metadata()

-        is_data_page = page_type & const.page_data_type
-        pt = [const.page_meta_type] + const.page_mix_types
-        if not is_data_page and self._current_page_type not in pt:
+        if self._current_page_type not in const.page_meta_types + [
+            const.page_data_type,
+            const.page_mix_type,
+        ]:
             return self._read_next_page()

         return False
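
A minimal sketch, assuming only the constants defined in sas_constants.py just below, of what the two sas7bdat.py changes above accomplish together: _read_page_header masks the raw 16-bit page type with page_type_mask2, and _read_next_page then keeps only meta, data and mix pages, recursing past anything else. The helper functions here are illustrative, not pandas API.

# Constants as defined in the new pandas/io/sas/sas_constants.py below.
page_type_mask = 0x0F00
page_type_mask2 = 0xF000 | page_type_mask   # == 0xFF00

page_meta_type = 0x0000
page_data_type = 0x0100
page_mix_type = 0x0200
page_amd_type = 0x0400
page_meta2_type = 0x4000
page_meta_types = [page_meta_type, page_meta2_type]


def normalize_page_type(raw_page_type: int) -> int:
    # Same masking _read_page_header now applies to the value it reads.
    return raw_page_type & page_type_mask2


def keep_page(raw_page_type: int) -> bool:
    # Mirrors the filter at the end of _read_next_page: anything that is
    # not a meta, data or mix page is skipped in favour of the next page.
    page_type = normalize_page_type(raw_page_type)
    return page_type in page_meta_types + [page_data_type, page_mix_type]


assert keep_page(page_meta2_type)      # meta2 pages are no longer skipped
assert not keep_page(page_amd_type)    # amd pages still trigger another read
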
16 changes: 10 additions & 6 deletions pandas/io/sas/sas_constants.py
@@ -53,12 +53,16 @@
 block_count_length = 2
 subheader_count_offset = 4
 subheader_count_length = 2
-page_meta_type = 0
-page_data_type = 256
-page_amd_type = 1024
-page_metc_type = 16384
-page_comp_type = -28672
-page_mix_types = [512, 640]
+page_type_mask = 0x0F00
+# Keep "page_comp_type" bits
+page_type_mask2 = 0xF000 | page_type_mask
+page_meta_type = 0x0000
+page_data_type = 0x0100
+page_mix_type = 0x0200
+page_amd_type = 0x0400
+page_meta2_type = 0x4000
+page_comp_type = 0x9000
+page_meta_types = [page_meta_type, page_meta2_type]
 subheader_pointers_offset = 8
 truncated_subheader_id = 1
 compressed_subheader_id = 4
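
The comment on page_type_mask2 above ("Keep 'page_comp_type' bits") is the key detail: page_type_mask (0x0F00) covers only the second-highest nibble, so on its own it would collapse page_meta2_type (0x4000) and page_comp_type (0x9000) to 0x0000 and make them look like plain meta pages; OR-ing in 0xF000 preserves that top nibble. A quick, purely illustrative check of the arithmetic:

page_type_mask = 0x0F00
page_type_mask2 = 0xF000 | page_type_mask   # == 0xFF00

for name, raw in [("meta", 0x0000), ("data", 0x0100), ("mix", 0x0200),
                  ("amd", 0x0400), ("meta2", 0x4000), ("comp", 0x9000)]:
    # Show each page type under the narrow and the widened mask.
    print(f"{name:>5}: raw={raw:#06x}  "
          f"& page_type_mask -> {raw & page_type_mask:#06x}  "
          f"& page_type_mask2 -> {raw & page_type_mask2:#06x}")

# Only the widened mask keeps meta2 and comp pages distinguishable from meta:
assert (0x4000 & page_type_mask) == 0 and (0x9000 & page_type_mask) == 0
assert (0x4000 & page_type_mask2) == 0x4000
assert (0x9000 & page_type_mask2) == 0x9000
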
pandas/tests/io/sas/data/test_meta2_page.sas7bdat — binary file not shown.
7 changes: 7 additions & 0 deletions pandas/tests/io/sas/test_sas7bdat.py
@@ -343,6 +343,13 @@ def test_null_date(datapath):
     tm.assert_frame_equal(df, expected)


+def test_meta2_page(datapath):
+    # GH 35545
+    fname = datapath("io", "sas", "data", "test_meta2_page.sas7bdat")
+    df = pd.read_sas(fname)
+    assert len(df) == 1000
+
+
 @pytest.mark.parametrize("test_file", ["test2.sas7bdat", "test3.sas7bdat"])
 def test_exception_propagation_rdc_rle_decompress(datapath, monkeypatch, test_file):
     """Errors in RLE/RDC decompression should propagate the same error."""