Skip to content

Commit 050ec0a

Browse files
committed
Fix SAS 0x40 control byte handling and column name parsing
1 parent 4f92db3 commit 050ec0a

File tree

6 files changed

+47
-18
lines changed

6 files changed

+47
-18
lines changed

doc/source/whatsnew/v1.5.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,8 @@ I/O
745745
- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
746746
- Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
747747
- :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``.
748+
- Bug in :func:`read_sas` with certain types of compressed SAS7BDAT files (:issue:`31243`)
749+
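- Bug in :func:`read_sas` that scrambled column names when the header text was decoded before being sliced by byte offsets (e.g. with multi-byte encodings)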
748750
-
749751

750752
Period

pandas/io/sas/sas.pyx

+1-2
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,7 @@ cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff)
3838
ipos += 1
3939
elif control_byte == 0x40:
4040
# not documented
41-
nbytes = end_of_first_byte * 16
42-
nbytes += <int>(inbuff[ipos])
41+
nbytes = (inbuff[ipos] & 0xFF) + 18 + end_of_first_byte * 256
4342
ipos += 1
4443
for _ in range(nbytes):
4544
result[rpos] = inbuff[ipos]

pandas/io/sas/sas7bdat.py

+33-16
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,9 @@ def __init__(
180180

181181
self.default_encoding = "latin-1"
182182
self.compression = b""
183-
self.column_names_strings: list[str] = []
184-
self.column_names: list[str] = []
185-
self.column_formats: list[str] = []
183+
self.column_names_raw: list[bytes] = []
184+
self.column_names: list[str | bytes] = []
185+
self.column_formats: list[str | bytes] = []
186186
self.columns: list[_Column] = []
187187

188188
self._current_page_data_subheader_pointers: list[_SubheaderPointer] = []
@@ -570,12 +570,9 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
570570

571571
buf = self._read_bytes(offset, text_block_size)
572572
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
573-
cname = cname_raw
574-
if self.convert_header_text:
575-
cname = cname.decode(self.encoding or self.default_encoding)
576-
self.column_names_strings.append(cname)
573+
self.column_names_raw.append(cname_raw)
577574

578-
if len(self.column_names_strings) == 1:
575+
if len(self.column_names_raw) == 1:
579576
compression_literal = b""
580577
for cl in const.compression_literals:
581578
if cl in cname_raw:
@@ -644,8 +641,14 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None:
644641
)
645642
col_len = self._read_int(col_name_length, const.column_name_length_length)
646643

647-
name_str = self.column_names_strings[idx]
648-
self.column_names.append(name_str[col_offset : col_offset + col_len])
644+
name_raw = self.column_names_raw[idx]
645+
cname = name_raw[col_offset : col_offset + col_len]
646+
if self.convert_header_text:
647+
self.column_names.append(
648+
cname.decode(self.encoding or self.default_encoding)
649+
)
650+
else:
651+
self.column_names.append(cname)
649652

650653
def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
651654
int_len = self._int_length
@@ -693,7 +696,7 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
693696
x = self._read_int(
694697
text_subheader_format, const.column_format_text_subheader_index_length
695698
)
696-
format_idx = min(x, len(self.column_names_strings) - 1)
699+
format_idx = min(x, len(self.column_names_raw) - 1)
697700

698701
format_start = self._read_int(
699702
col_format_offset, const.column_format_offset_length
@@ -703,15 +706,29 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
703706
label_idx = self._read_int(
704707
text_subheader_label, const.column_label_text_subheader_index_length
705708
)
706-
label_idx = min(label_idx, len(self.column_names_strings) - 1)
709+
label_idx = min(label_idx, len(self.column_names_raw) - 1)
707710

708711
label_start = self._read_int(col_label_offset, const.column_label_offset_length)
709712
label_len = self._read_int(col_label_len, const.column_label_length_length)
710713

711-
label_names = self.column_names_strings[label_idx]
712-
column_label = label_names[label_start : label_start + label_len]
713-
format_names = self.column_names_strings[format_idx]
714-
column_format = format_names[format_start : format_start + format_len]
714+
label_names = self.column_names_raw[label_idx]
715+
column_label_bytes = label_names[label_start : label_start + label_len]
716+
column_label: str | bytes
717+
if self.convert_header_text:
718+
column_label = column_label_bytes.decode(
719+
self.encoding or self.default_encoding
720+
)
721+
else:
722+
column_label = column_label_bytes
723+
format_names = self.column_names_raw[format_idx]
724+
column_format_bytes = format_names[format_start : format_start + format_len]
725+
column_format: str | bytes
726+
if self.convert_header_text:
727+
column_format = column_format_bytes.decode(
728+
self.encoding or self.default_encoding
729+
)
730+
else:
731+
column_format = column_format_bytes
715732
current_column_number = len(self.columns)
716733

717734
col = _Column(
pandas/tests/io/sas/data/0x40controlbyte.csv

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
long_string_field1,long_string_field2,long_string_field3
2+
00000000000000000000000000000000000000000000000000,11111111111111111111111111111111111111111111111111,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
pandas/tests/io/sas/data/0x40controlbyte.sas7bdat

Binary file not shown.

pandas/tests/io/sas/test_sas7bdat.py

+9
Original file line numberDiff line numberDiff line change
@@ -333,3 +333,12 @@ def test_null_date(datapath):
333333
},
334334
)
335335
tm.assert_frame_equal(df, expected)
336+
337+
338+
def test_0x40_control_byte(datapath):
339+
# GH 31243
340+
fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat")
341+
df = pd.read_sas(fname, encoding="ascii")
342+
fname = datapath("io", "sas", "data", "0x40controlbyte.csv")
343+
df0 = pd.read_csv(fname, dtype="object")
344+
tm.assert_frame_equal(df, df0)

0 commit comments

Comments
 (0)