Fix SAS 0x40 control byte handling and column name parsing

jonashaag · jonashaag · commit 050ec0afa2df · 2022-05-28T00:40:02.000+02:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -745,6 +745,8 @@ I/O
 - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
 - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
 - :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``.
+- Bug in :func:`read_sas` with certain types of compressed SAS7BDAT files (:issue:`31243`)
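+- Bug in :func:`read_sas` that scrambled column names when the header text contained multi-byte characters, because byte offsets were applied to already-decoded text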
 -
 
 Period
diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx
@@ -38,8 +38,7 @@ cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff)
                 ipos += 1
         elif control_byte == 0x40:
             # not documented
-            nbytes = end_of_first_byte * 16
-            nbytes += <int>(inbuff[ipos])
+            nbytes = (inbuff[ipos] & 0xFF) + 18 + end_of_first_byte * 256
             ipos += 1
             for _ in range(nbytes):
                 result[rpos] = inbuff[ipos]
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
@@ -180,9 +180,9 @@ def __init__(
 
         self.default_encoding = "latin-1"
         self.compression = b""
-        self.column_names_strings: list[str] = []
-        self.column_names: list[str] = []
-        self.column_formats: list[str] = []
+        self.column_names_raw: list[bytes] = []
+        self.column_names: list[str | bytes] = []
+        self.column_formats: list[str | bytes] = []
         self.columns: list[_Column] = []
 
         self._current_page_data_subheader_pointers: list[_SubheaderPointer] = []
@@ -570,12 +570,9 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
 
         buf = self._read_bytes(offset, text_block_size)
         cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
-        cname = cname_raw
-        if self.convert_header_text:
-            cname = cname.decode(self.encoding or self.default_encoding)
-        self.column_names_strings.append(cname)
+        self.column_names_raw.append(cname_raw)
 
-        if len(self.column_names_strings) == 1:
+        if len(self.column_names_raw) == 1:
             compression_literal = b""
             for cl in const.compression_literals:
                 if cl in cname_raw:
@@ -644,8 +641,14 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None:
             )
             col_len = self._read_int(col_name_length, const.column_name_length_length)
 
-            name_str = self.column_names_strings[idx]
-            self.column_names.append(name_str[col_offset : col_offset + col_len])
+            name_raw = self.column_names_raw[idx]
+            cname = name_raw[col_offset : col_offset + col_len]
+            if self.convert_header_text:
+                self.column_names.append(
+                    cname.decode(self.encoding or self.default_encoding)
+                )
+            else:
+                self.column_names.append(cname)
 
     def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
         int_len = self._int_length
@@ -693,7 +696,7 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
         x = self._read_int(
             text_subheader_format, const.column_format_text_subheader_index_length
         )
-        format_idx = min(x, len(self.column_names_strings) - 1)
+        format_idx = min(x, len(self.column_names_raw) - 1)
 
         format_start = self._read_int(
             col_format_offset, const.column_format_offset_length
@@ -703,15 +706,29 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
         label_idx = self._read_int(
             text_subheader_label, const.column_label_text_subheader_index_length
         )
-        label_idx = min(label_idx, len(self.column_names_strings) - 1)
+        label_idx = min(label_idx, len(self.column_names_raw) - 1)
 
         label_start = self._read_int(col_label_offset, const.column_label_offset_length)
         label_len = self._read_int(col_label_len, const.column_label_length_length)
 
-        label_names = self.column_names_strings[label_idx]
-        column_label = label_names[label_start : label_start + label_len]
-        format_names = self.column_names_strings[format_idx]
-        column_format = format_names[format_start : format_start + format_len]
+        label_names = self.column_names_raw[label_idx]
+        column_label_bytes = label_names[label_start : label_start + label_len]
+        column_label: str | bytes
+        if self.convert_header_text:
+            column_label = column_label_bytes.decode(
+                self.encoding or self.default_encoding
+            )
+        else:
+            column_label = column_label_bytes
+        format_names = self.column_names_raw[format_idx]
+        column_format_bytes = format_names[format_start : format_start + format_len]
+        column_format: str | bytes
+        if self.convert_header_text:
+            column_format = column_format_bytes.decode(
+                self.encoding or self.default_encoding
+            )
+        else:
+            column_format = column_format_bytes
         current_column_number = len(self.columns)
 
         col = _Column(
diff --git a/pandas/tests/io/sas/data/0x40controlbyte.csv b/pandas/tests/io/sas/data/0x40controlbyte.csv
@@ -0,0 +1,2 @@
+long_string_field1,long_string_field2,long_string_field3
+00000000000000000000000000000000000000000000000000,11111111111111111111111111111111111111111111111111,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
diff --git a/pandas/tests/io/sas/data/0x40controlbyte.sas7bdat b/pandas/tests/io/sas/data/0x40controlbyte.sas7bdat
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
@@ -333,3 +333,12 @@ def test_null_date(datapath):
         },
     )
     tm.assert_frame_equal(df, expected)
+
+
+def test_0x40_control_byte(datapath):
+    # GH 31243
+    fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat")
+    df = pd.read_sas(fname, encoding="ascii")
+    fname = datapath("io", "sas", "data", "0x40controlbyte.csv")
+    df0 = pd.read_csv(fname, dtype="object")
+    tm.assert_frame_equal(df, df0)

Original file line number	Diff line number	Diff line change
`@@ -745,6 +745,8 @@ I/O`
`745`	`745`	- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
`746`	`746`	- Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
`747`	`747`	- :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``.
	`748`	+- Bug in :func:`read_sas` with certain types of compressed SAS7BDAT files (:issue:`31243`)
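	`749`	+- Bug in :func:`read_sas` that scrambled column names when the header text contained multi-byte characters, because byte offsets were applied to already-decoded text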
`748`	`750`	`-`
`749`	`751`
`750`	`752`	`Period`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+long_string_field1,long_string_field2,long_string_field3`
	`2`	`+00000000000000000000000000000000000000000000000000,11111111111111111111111111111111111111111111111111,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa`