From 7669f6ed6a6889aa3a42989fe72243db3e82e458 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Tue, 31 May 2022 19:05:02 +0200 Subject: [PATCH 1/6] Fix reading SAS7BDAT files with zero rows (#47116) * Fix reading SAS7BDAT files with zero rows * Add missing file * Update test_sas7bdat.py --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4ba9628d8f275..bb854f0a033e0 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -844,6 +844,7 @@ I/O - :meth:`to_html` now excludes the ``border`` attribute from ```` elements when ``border`` keyword is set to ``False``. - Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`) - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`) +- Period ^^^^^^ From 93db2f3331277d5b76c78a41decfdd70249af34a Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 9 Jun 2022 15:18:12 +0200 Subject: [PATCH 2/6] Review feedback --- doc/source/whatsnew/v1.5.0.rst | 2 + pandas/io/sas/sas7bdat.py | 110 ++++++++++++++------------------- 2 files changed, 49 insertions(+), 63 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index bb854f0a033e0..3d8c767308633 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -844,6 +844,8 @@ I/O - :meth:`to_html` now excludes the ``border`` attribute from ``
`` elements when ``border`` keyword is set to ``False``. - Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`) - Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`) +- Bug in :func:`read_sas` with RLE-compressed SAS7BDAT files that contain 0x40 control bytes (:issue:`31243`) +- Bug in :func:`read_sas` that scrambled column names (:issue:`31243`) - Period diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 1e071690d35fb..3e50b1d9a56be 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -180,9 +180,9 @@ def __init__( self.default_encoding = "latin-1" self.compression = b"" - self.column_names_strings: list[str] = [] - self.column_names: list[str] = [] - self.column_formats: list[str] = [] + self.column_names_raw: list[bytes] = [] + self.column_names: list[str | bytes] = [] + self.column_formats: list[str | bytes] = [] self.columns: list[_Column] = [] self._current_page_data_subheader_pointers: list[_SubheaderPointer] = [] @@ -274,17 +274,13 @@ def _get_properties(self) -> None: else: self.platform = "unknown" - buf = self._read_bytes(const.dataset_offset, const.dataset_length) - self.name = buf.rstrip(b"\x00 ") - if self.convert_header_text: - self.name = self.name.decode(self.encoding or self.default_encoding) + self.name = self._read_and_convert_header_text( + const.dataset_offset, const.dataset_length + ) - buf = self._read_bytes(const.file_type_offset, const.file_type_length) - self.file_type = buf.rstrip(b"\x00 ") - if self.convert_header_text: - self.file_type = self.file_type.decode( - self.encoding or self.default_encoding - ) + self.file_type = self._read_and_convert_header_text( + const.file_type_offset, const.file_type_length + ) # Timestamp is epoch 01/01/1960 epoch = datetime(1960, 1, 1) @@ -316,46 +312,25 @@ def _get_properties(self) -> None: const.page_count_offset + align1, const.page_count_length ) - buf = self._read_bytes( + self.sas_release_offset = self._read_and_convert_header_text( const.sas_release_offset + total_align, const.sas_release_length ) - self.sas_release = buf.rstrip(b"\x00 ") - if self.convert_header_text: - self.sas_release = self.sas_release.decode( - self.encoding or self.default_encoding - ) - buf = self._read_bytes( + self.server_type = self._read_and_convert_header_text( const.sas_server_type_offset + total_align, const.sas_server_type_length ) - self.server_type = buf.rstrip(b"\x00 ") - if self.convert_header_text: - self.server_type = self.server_type.decode( - self.encoding or self.default_encoding - ) - buf = self._read_bytes( + self.os_version = self._read_and_convert_header_text( const.os_version_number_offset + total_align, const.os_version_number_length ) - self.os_version = buf.rstrip(b"\x00 ") - if self.convert_header_text: - self.os_version = self.os_version.decode( - self.encoding or self.default_encoding - ) - buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length) - buf = buf.rstrip(b"\x00 ") - if len(buf) > 0: - self.os_name = buf.decode(self.encoding or self.default_encoding) - else: - buf = self._read_bytes( + self.os_name = self._read_and_convert_header_text( + const.os_name_offset + total_align, const.os_name_length + ) + if not self.os_name: + self.os_name = self._read_and_convert_header_text( const.os_maker_offset + total_align, const.os_maker_length ) - self.os_name = buf.rstrip(b"\x00 ") - if self.convert_header_text: - self.os_name = self.os_name.decode( - self.encoding or self.default_encoding - ) def __next__(self): da = self.read(nrows=self.chunksize or 1) @@ -398,6 +373,9 @@ def _read_bytes(self, offset: int, length: int): raise ValueError("The cached page is too small.") return self._cached_page[offset : offset + length] + def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes: + return self._convert_header_text(self._read_bytes(offset, length)) + def _parse_metadata(self) -> None: done = False while not done: @@ -570,12 +548,9 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None: buf = self._read_bytes(offset, text_block_size) cname_raw = buf[0:text_block_size].rstrip(b"\x00 ") - cname = cname_raw - if self.convert_header_text: - cname = cname.decode(self.encoding or self.default_encoding) - self.column_names_strings.append(cname) + self.column_names_raw.append(cname_raw) - if len(self.column_names_strings) == 1: + if len(self.column_names_raw) == 1: compression_literal = b"" for cl in const.compression_literals: if cl in cname_raw: @@ -609,11 +584,8 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None: offset1 += 4 buf = self._read_bytes(offset1, self._lcs) self.creator_proc = buf[0 : self._lcp] - if self.convert_header_text: - if hasattr(self, "creator_proc"): - self.creator_proc = self.creator_proc.decode( - self.encoding or self.default_encoding - ) + if hasattr(self, "creator_proc"): + self.creator_proc = self._convert_header_text(self.creator_proc) def _process_columnname_subheader(self, offset: int, length: int) -> None: int_len = self._int_length @@ -644,8 +616,9 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None: ) col_len = self._read_int(col_name_length, const.column_name_length_length) - name_str = self.column_names_strings[idx] - self.column_names.append(name_str[col_offset : col_offset + col_len]) + name_raw = self.column_names_raw[idx] + cname = name_raw[col_offset : col_offset + col_len] + self.column_names.append(self._convert_header_text(cname)) def _process_columnattributes_subheader(self, offset: int, length: int) -> None: int_len = self._int_length @@ -693,7 +666,7 @@ def _process_format_subheader(self, offset: int, length: int) -> None: x = self._read_int( text_subheader_format, const.column_format_text_subheader_index_length ) - format_idx = min(x, len(self.column_names_strings) - 1) + format_idx = min(x, len(self.column_names_raw) - 1) format_start = self._read_int( col_format_offset, const.column_format_offset_length @@ -703,15 +676,19 @@ def _process_format_subheader(self, offset: int, length: int) -> None: label_idx = self._read_int( text_subheader_label, const.column_label_text_subheader_index_length ) - label_idx = min(label_idx, len(self.column_names_strings) - 1) + label_idx = min(label_idx, len(self.column_names_raw) - 1) label_start = self._read_int(col_label_offset, const.column_label_offset_length) label_len = self._read_int(col_label_len, const.column_label_length_length) - label_names = self.column_names_strings[label_idx] - column_label = label_names[label_start : label_start + label_len] - format_names = self.column_names_strings[format_idx] - column_format = format_names[format_start : format_start + format_len] + label_names = self.column_names_raw[label_idx] + column_label = self._convert_header_text( + label_names[label_start : label_start + label_len] + ) + format_names = self.column_names_raw[format_idx] + column_format = self._convert_header_text( + format_names[format_start : format_start + format_len] + ) current_column_number = len(self.columns) col = _Column( @@ -809,9 +786,7 @@ def _chunk_to_dataframe(self) -> DataFrame: elif self._column_types[j] == b"s": rslt[name] = pd.Series(self._string_chunk[js, :], index=ix) if self.convert_text and (self.encoding is not None): - rslt[name] = rslt[name].str.decode( - self.encoding or self.default_encoding - ) + rslt[name] = self._decode_string(rslt[name].str) if self.blank_missing: ii = rslt[name].str.len() == 0 rslt[name][ii] = np.nan @@ -822,3 +797,12 @@ def _chunk_to_dataframe(self) -> DataFrame: df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False) return df + + def _decode_string(self, b): + return b.decode(self.encoding or self.default_encoding) + + def _convert_header_text(self, b: bytes) -> str | bytes: + if self.convert_header_text: + return self._decode_string(b) + else: + return b From f9f98e8ee5e9655d9d18bb44b48a34dee6f1b667 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 9 Jun 2022 15:33:18 +0200 Subject: [PATCH 3/6] Add back untested .rstrip --- pandas/io/sas/sas7bdat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 3e50b1d9a56be..894f9482178a9 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -374,7 +374,7 @@ def _read_bytes(self, offset: int, length: int): return self._cached_page[offset : offset + length] def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes: - return self._convert_header_text(self._read_bytes(offset, length)) + return self._convert_header_text(self._read_bytes(offset, length).rstrip(b"\x00 ")) def _parse_metadata(self) -> None: done = False From dda849eef73bf0e7be181b6c8e6bc5516b3f0b2e Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 9 Jun 2022 15:35:02 +0200 Subject: [PATCH 4/6] Fix lint --- pandas/io/sas/sas7bdat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 894f9482178a9..1f375da2bad17 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -374,7 +374,9 @@ def _read_bytes(self, offset: int, length: int): return self._cached_page[offset : offset + length] def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes: - return self._convert_header_text(self._read_bytes(offset, length).rstrip(b"\x00 ")) + return self._convert_header_text( + self._read_bytes(offset, length).rstrip(b"\x00 ") + ) def _parse_metadata(self) -> None: done = False From 25fac4beae9c236f50cef9ef3923e85b043998cf Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Fri, 10 Jun 2022 12:16:01 +0200 Subject: [PATCH 5/6] Add tests --- pandas/tests/io/sas/data/0x40controlbyte.csv | 2 ++ .../tests/io/sas/data/0x40controlbyte.sas7bdat | Bin 0 -> 196608 bytes pandas/tests/io/sas/test_sas7bdat.py | 9 +++++++++ 3 files changed, 11 insertions(+) create mode 100644 pandas/tests/io/sas/data/0x40controlbyte.csv create mode 100644 pandas/tests/io/sas/data/0x40controlbyte.sas7bdat diff --git a/pandas/tests/io/sas/data/0x40controlbyte.csv b/pandas/tests/io/sas/data/0x40controlbyte.csv new file mode 100644 index 0000000000000..e81f5cc3904b7 --- /dev/null +++ b/pandas/tests/io/sas/data/0x40controlbyte.csv @@ -0,0 +1,2 @@ +long_string_field1,long_string_field2,long_string_field3 +00000000000000000000000000000000000000000000000000,11111111111111111111111111111111111111111111111111,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa diff --git a/pandas/tests/io/sas/data/0x40controlbyte.sas7bdat b/pandas/tests/io/sas/data/0x40controlbyte.sas7bdat new file mode 100644 index 0000000000000000000000000000000000000000..013542e282e2f5bffde3a418f2f27073caa8e2fe GIT binary patch literal 196608 zcmeIzPiS04901_i-Lys`adNA+}yf-ty`OUnU?``rz zD9hE~E}i~r-`?w2J{}Gae%JnR|6`+jI|oCjQ>(XY{j^4HL%dUayb}(E&}{Tm7Q2hh z`R5nr=1z2v9bcGh4l4a62Q+58OWk~KKK#rJpMCM{%fEC}T})|yJyX{C<7=T_1j^gf zt?|k6_OYpj<@4{al;JSZnrd|>o;>{2WNY$ptKC{RR5Vwnr%p{xmeKsVZ}0l!YP!mi zUh9X^`PIC&VSm*meF6js5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5*Bj|Hxc&ad8Ud%JI>=tvmJ znm^s{^$#^-ywF~%Q>>kP$n9gTJJsl|SJoSFau0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7?jP5TUbqKfPyy^A9tX-UWxZXc|>Yt5yslR?&TxzY^4?Ynd zE!sz8eJ0lZgz!(Cbg>MD#qQ!OuRizELbFn4yGz~0rMcth-a7y0sdwLdXF03hSbqK7 znRWjyR@-M`LATsXap{P&fY-My;+rc*zerM$Gi zYCpY?6yH@@FZJ)l#FUrLS?$L^k+rzY!0J|9Z(u?kxVg=|t=8Eo<9}u)-)p0?;Tu`W zL+uFuaAo|+_`xD_ImXj_Non=isIHs-2TY~J{x7K)_%tq*ekZB!r)z8~aBXycb<>1x z4yN~+-ec-N6z8Y@o-O*rmG`^N$8RHTIHdnE`Iluln=w+{_Z9cC;=c7VL)rdncYOiM zdFh;GWaELZ5Ag2K`zZguT`9hQ>( Date: Fri, 10 Jun 2022 14:48:49 +0200 Subject: [PATCH 6/6] Fix rebase --- pandas/io/sas/sas.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 483bbd70faec9..80065f3069c27 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -38,8 +38,7 @@ cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff) ipos += 1 elif control_byte == 0x40: # not documented - nbytes = end_of_first_byte * 16 - nbytes += (inbuff[ipos]) + nbytes = (inbuff[ipos] & 0xFF) + 18 + end_of_first_byte * 256 ipos += 1 for _ in range(nbytes): result[rpos] = inbuff[ipos]