Skip to content

Commit 5e3d0ed

Browse files
authored
Fix SAS 0x40 control byte handling and column name parsing (#47113)
* Fix reading SAS7BDAT files with zero rows (#47116)
* Fix reading SAS7BDAT files with zero rows
* Add missing file
* Update test_sas7bdat.py
* Review feedback
* Add back untested .rstrip
* Fix lint
* Add tests
* Fix rebase
1 parent d580826 commit 5e3d0ed

File tree

6 files changed

+64
-65
lines changed

6 files changed

+64
-65
lines changed

doc/source/whatsnew/v1.5.0.rst

+3
Original file line number | Diff line number | Diff line change
@@ -892,6 +892,9 @@ I/O
892892
- Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`)
893893
- Bug in :class:`StataWriterUTF8` where some valid characters were removed from variable names (:issue:`47276`)
894894
- Bug in :meth:`DataFrame.to_excel` when writing an empty dataframe with :class:`MultiIndex` (:issue:`19543`)
895+
- Bug in :func:`read_sas` with RLE-compressed SAS7BDAT files that contain 0x40 control bytes (:issue:`31243`)
896+
- Bug in :func:`read_sas` that scrambled column names (:issue:`31243`)
897+
-
895898

896899
Period
897900
^^^^^^

pandas/io/sas/sas.pyx

+1-2
Original file line number | Diff line number | Diff line change
@@ -38,8 +38,7 @@ cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff)
3838
ipos += 1
3939
elif control_byte == 0x40:
4040
# not documented
41-
nbytes = end_of_first_byte * 16
42-
nbytes += <int>(inbuff[ipos])
41+
nbytes = (inbuff[ipos] & 0xFF) + 18 + end_of_first_byte * 256
4342
ipos += 1
4443
for _ in range(nbytes):
4544
result[rpos] = inbuff[ipos]

pandas/io/sas/sas7bdat.py

+49-63
Original file line number | Diff line number | Diff line change
@@ -182,9 +182,9 @@ def __init__(
182182

183183
self.default_encoding = "latin-1"
184184
self.compression = b""
185-
self.column_names_strings: list[str] = []
186-
self.column_names: list[str] = []
187-
self.column_formats: list[str] = []
185+
self.column_names_raw: list[bytes] = []
186+
self.column_names: list[str | bytes] = []
187+
self.column_formats: list[str | bytes] = []
188188
self.columns: list[_Column] = []
189189

190190
self._current_page_data_subheader_pointers: list[_SubheaderPointer] = []
@@ -278,17 +278,13 @@ def _get_properties(self) -> None:
278278
else:
279279
self.platform = "unknown"
280280

281-
buf = self._read_bytes(const.dataset_offset, const.dataset_length)
282-
self.name = buf.rstrip(b"\x00 ")
283-
if self.convert_header_text:
284-
self.name = self.name.decode(self.encoding or self.default_encoding)
281+
self.name = self._read_and_convert_header_text(
282+
const.dataset_offset, const.dataset_length
283+
)
285284

286-
buf = self._read_bytes(const.file_type_offset, const.file_type_length)
287-
self.file_type = buf.rstrip(b"\x00 ")
288-
if self.convert_header_text:
289-
self.file_type = self.file_type.decode(
290-
self.encoding or self.default_encoding
291-
)
285+
self.file_type = self._read_and_convert_header_text(
286+
const.file_type_offset, const.file_type_length
287+
)
292288

293289
# Timestamp is epoch 01/01/1960
294290
epoch = datetime(1960, 1, 1)
@@ -320,46 +316,25 @@ def _get_properties(self) -> None:
320316
const.page_count_offset + align1, const.page_count_length
321317
)
322318

323-
buf = self._read_bytes(
319+
self.sas_release_offset = self._read_and_convert_header_text(
324320
const.sas_release_offset + total_align, const.sas_release_length
325321
)
326-
self.sas_release = buf.rstrip(b"\x00 ")
327-
if self.convert_header_text:
328-
self.sas_release = self.sas_release.decode(
329-
self.encoding or self.default_encoding
330-
)
331322

332-
buf = self._read_bytes(
323+
self.server_type = self._read_and_convert_header_text(
333324
const.sas_server_type_offset + total_align, const.sas_server_type_length
334325
)
335-
self.server_type = buf.rstrip(b"\x00 ")
336-
if self.convert_header_text:
337-
self.server_type = self.server_type.decode(
338-
self.encoding or self.default_encoding
339-
)
340326

341-
buf = self._read_bytes(
327+
self.os_version = self._read_and_convert_header_text(
342328
const.os_version_number_offset + total_align, const.os_version_number_length
343329
)
344-
self.os_version = buf.rstrip(b"\x00 ")
345-
if self.convert_header_text:
346-
self.os_version = self.os_version.decode(
347-
self.encoding or self.default_encoding
348-
)
349330

350-
buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length)
351-
buf = buf.rstrip(b"\x00 ")
352-
if len(buf) > 0:
353-
self.os_name = buf.decode(self.encoding or self.default_encoding)
354-
else:
355-
buf = self._read_bytes(
331+
self.os_name = self._read_and_convert_header_text(
332+
const.os_name_offset + total_align, const.os_name_length
333+
)
334+
if not self.os_name:
335+
self.os_name = self._read_and_convert_header_text(
356336
const.os_maker_offset + total_align, const.os_maker_length
357337
)
358-
self.os_name = buf.rstrip(b"\x00 ")
359-
if self.convert_header_text:
360-
self.os_name = self.os_name.decode(
361-
self.encoding or self.default_encoding
362-
)
363338

364339
def __next__(self) -> DataFrame:
365340
da = self.read(nrows=self.chunksize or 1)
@@ -402,6 +377,11 @@ def _read_bytes(self, offset: int, length: int):
402377
raise ValueError("The cached page is too small.")
403378
return self._cached_page[offset : offset + length]
404379

380+
def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
381+
return self._convert_header_text(
382+
self._read_bytes(offset, length).rstrip(b"\x00 ")
383+
)
384+
405385
def _parse_metadata(self) -> None:
406386
done = False
407387
while not done:
@@ -576,12 +556,9 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
576556

577557
buf = self._read_bytes(offset, text_block_size)
578558
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
579-
cname = cname_raw
580-
if self.convert_header_text:
581-
cname = cname.decode(self.encoding or self.default_encoding)
582-
self.column_names_strings.append(cname)
559+
self.column_names_raw.append(cname_raw)
583560

584-
if len(self.column_names_strings) == 1:
561+
if len(self.column_names_raw) == 1:
585562
compression_literal = b""
586563
for cl in const.compression_literals:
587564
if cl in cname_raw:
@@ -615,11 +592,8 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
615592
offset1 += 4
616593
buf = self._read_bytes(offset1, self._lcs)
617594
self.creator_proc = buf[0 : self._lcp]
618-
if self.convert_header_text:
619-
if hasattr(self, "creator_proc"):
620-
self.creator_proc = self.creator_proc.decode(
621-
self.encoding or self.default_encoding
622-
)
595+
if hasattr(self, "creator_proc"):
596+
self.creator_proc = self._convert_header_text(self.creator_proc)
623597

624598
def _process_columnname_subheader(self, offset: int, length: int) -> None:
625599
int_len = self._int_length
@@ -650,8 +624,9 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None:
650624
)
651625
col_len = self._read_int(col_name_length, const.column_name_length_length)
652626

653-
name_str = self.column_names_strings[idx]
654-
self.column_names.append(name_str[col_offset : col_offset + col_len])
627+
name_raw = self.column_names_raw[idx]
628+
cname = name_raw[col_offset : col_offset + col_len]
629+
self.column_names.append(self._convert_header_text(cname))
655630

656631
def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
657632
int_len = self._int_length
@@ -699,7 +674,7 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
699674
x = self._read_int(
700675
text_subheader_format, const.column_format_text_subheader_index_length
701676
)
702-
format_idx = min(x, len(self.column_names_strings) - 1)
677+
format_idx = min(x, len(self.column_names_raw) - 1)
703678

704679
format_start = self._read_int(
705680
col_format_offset, const.column_format_offset_length
@@ -709,15 +684,19 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
709684
label_idx = self._read_int(
710685
text_subheader_label, const.column_label_text_subheader_index_length
711686
)
712-
label_idx = min(label_idx, len(self.column_names_strings) - 1)
687+
label_idx = min(label_idx, len(self.column_names_raw) - 1)
713688

714689
label_start = self._read_int(col_label_offset, const.column_label_offset_length)
715690
label_len = self._read_int(col_label_len, const.column_label_length_length)
716691

717-
label_names = self.column_names_strings[label_idx]
718-
column_label = label_names[label_start : label_start + label_len]
719-
format_names = self.column_names_strings[format_idx]
720-
column_format = format_names[format_start : format_start + format_len]
692+
label_names = self.column_names_raw[label_idx]
693+
column_label = self._convert_header_text(
694+
label_names[label_start : label_start + label_len]
695+
)
696+
format_names = self.column_names_raw[format_idx]
697+
column_format = self._convert_header_text(
698+
format_names[format_start : format_start + format_len]
699+
)
721700
current_column_number = len(self.columns)
722701

723702
col = _Column(
@@ -815,9 +794,7 @@ def _chunk_to_dataframe(self) -> DataFrame:
815794
elif self._column_types[j] == b"s":
816795
rslt[name] = pd.Series(self._string_chunk[js, :], index=ix)
817796
if self.convert_text and (self.encoding is not None):
818-
rslt[name] = rslt[name].str.decode(
819-
self.encoding or self.default_encoding
820-
)
797+
rslt[name] = self._decode_string(rslt[name].str)
821798
if self.blank_missing:
822799
ii = rslt[name].str.len() == 0
823800
rslt[name][ii] = np.nan
@@ -828,3 +805,12 @@ def _chunk_to_dataframe(self) -> DataFrame:
828805

829806
df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False)
830807
return df
808+
809+
def _decode_string(self, b):
810+
return b.decode(self.encoding or self.default_encoding)
811+
812+
def _convert_header_text(self, b: bytes) -> str | bytes:
813+
if self.convert_header_text:
814+
return self._decode_string(b)
815+
else:
816+
return b
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
long_string_field1,long_string_field2,long_string_field3
2+
00000000000000000000000000000000000000000000000000,11111111111111111111111111111111111111111111111111,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
Binary file not shown.

pandas/tests/io/sas/test_sas7bdat.py

+9
Original file line number | Diff line number | Diff line change
@@ -381,3 +381,12 @@ def test_exception_propagation_rle_decompress(tmp_path, datapath):
381381
tmp_file.write_bytes(data)
382382
with pytest.raises(ValueError, match="unknown control byte"):
383383
pd.read_sas(tmp_file)
384+
385+
386+
def test_0x40_control_byte(datapath):
387+
# GH 31243
388+
fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat")
389+
df = pd.read_sas(fname, encoding="ascii")
390+
fname = datapath("io", "sas", "data", "0x40controlbyte.csv")
391+
df0 = pd.read_csv(fname, dtype="object")
392+
tm.assert_frame_equal(df, df0)

0 commit comments

Comments
 (0)