Skip to content

Commit 93db2f3

Browse files
committed
Review feedback
1 parent 7669f6e commit 93db2f3

File tree

2 files changed

+49
-63
lines changed

2 files changed

+49
-63
lines changed

doc/source/whatsnew/v1.5.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,8 @@ I/O
844844
- :meth:`to_html` now excludes the ``border`` attribute from ``<table>`` elements when ``border`` keyword is set to ``False``.
845845
- Bug in :func:`read_sas` returned ``None`` rather than an empty DataFrame for SAS7BDAT files with zero rows (:issue:`18198`)
846846
- Bug in :class:`StataWriter` where value labels were always written with default encoding (:issue:`46750`)
847+
- Bug in :func:`read_sas` with RLE-compressed SAS7BDAT files that contain 0x40 control bytes (:issue:`31243`)
848+
- Bug in :func:`read_sas` that scrambled column names (:issue:`31243`)
847849
-
848850

849851
Period

pandas/io/sas/sas7bdat.py

+47-63
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,9 @@ def __init__(
180180

181181
self.default_encoding = "latin-1"
182182
self.compression = b""
183-
self.column_names_strings: list[str] = []
184-
self.column_names: list[str] = []
185-
self.column_formats: list[str] = []
183+
self.column_names_raw: list[bytes] = []
184+
self.column_names: list[str | bytes] = []
185+
self.column_formats: list[str | bytes] = []
186186
self.columns: list[_Column] = []
187187

188188
self._current_page_data_subheader_pointers: list[_SubheaderPointer] = []
@@ -274,17 +274,13 @@ def _get_properties(self) -> None:
274274
else:
275275
self.platform = "unknown"
276276

277-
buf = self._read_bytes(const.dataset_offset, const.dataset_length)
278-
self.name = buf.rstrip(b"\x00 ")
279-
if self.convert_header_text:
280-
self.name = self.name.decode(self.encoding or self.default_encoding)
277+
self.name = self._read_and_convert_header_text(
278+
const.dataset_offset, const.dataset_length
279+
)
281280

282-
buf = self._read_bytes(const.file_type_offset, const.file_type_length)
283-
self.file_type = buf.rstrip(b"\x00 ")
284-
if self.convert_header_text:
285-
self.file_type = self.file_type.decode(
286-
self.encoding or self.default_encoding
287-
)
281+
self.file_type = self._read_and_convert_header_text(
282+
const.file_type_offset, const.file_type_length
283+
)
288284

289285
# Timestamp is epoch 01/01/1960
290286
epoch = datetime(1960, 1, 1)
@@ -316,46 +312,25 @@ def _get_properties(self) -> None:
316312
const.page_count_offset + align1, const.page_count_length
317313
)
318314

319-
buf = self._read_bytes(
315+
# Keep the release string on ``sas_release``, the attribute this reader
# exposed before the header-text refactor. ``sas_release_offset`` is the
# name of the ``const`` offset being read, not of the header field.
self.sas_release = self._read_and_convert_header_text(
    const.sas_release_offset + total_align, const.sas_release_length
)
322-
self.sas_release = buf.rstrip(b"\x00 ")
323-
if self.convert_header_text:
324-
self.sas_release = self.sas_release.decode(
325-
self.encoding or self.default_encoding
326-
)
327318

328-
buf = self._read_bytes(
319+
self.server_type = self._read_and_convert_header_text(
329320
const.sas_server_type_offset + total_align, const.sas_server_type_length
330321
)
331-
self.server_type = buf.rstrip(b"\x00 ")
332-
if self.convert_header_text:
333-
self.server_type = self.server_type.decode(
334-
self.encoding or self.default_encoding
335-
)
336322

337-
buf = self._read_bytes(
323+
self.os_version = self._read_and_convert_header_text(
338324
const.os_version_number_offset + total_align, const.os_version_number_length
339325
)
340-
self.os_version = buf.rstrip(b"\x00 ")
341-
if self.convert_header_text:
342-
self.os_version = self.os_version.decode(
343-
self.encoding or self.default_encoding
344-
)
345326

346-
buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length)
347-
buf = buf.rstrip(b"\x00 ")
348-
if len(buf) > 0:
349-
self.os_name = buf.decode(self.encoding or self.default_encoding)
350-
else:
351-
buf = self._read_bytes(
327+
self.os_name = self._read_and_convert_header_text(
328+
const.os_name_offset + total_align, const.os_name_length
329+
)
330+
if not self.os_name:
331+
self.os_name = self._read_and_convert_header_text(
352332
const.os_maker_offset + total_align, const.os_maker_length
353333
)
354-
self.os_name = buf.rstrip(b"\x00 ")
355-
if self.convert_header_text:
356-
self.os_name = self.os_name.decode(
357-
self.encoding or self.default_encoding
358-
)
359334

360335
def __next__(self):
361336
da = self.read(nrows=self.chunksize or 1)
@@ -398,6 +373,9 @@ def _read_bytes(self, offset: int, length: int):
398373
raise ValueError("The cached page is too small.")
399374
return self._cached_page[offset : offset + length]
400375

376+
def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
    """
    Read a fixed-width header field and return its text content.

    Parameters
    ----------
    offset : int
        Position of the field, passed through to ``_read_bytes``.
    length : int
        Width of the field in bytes, passed through to ``_read_bytes``.

    Returns
    -------
    str or bytes
        The field with trailing NUL and space padding removed, decoded
        when ``convert_header_text`` is set and returned as raw bytes
        otherwise.
    """
    # Header fields are fixed-width and padded with NULs/spaces. Strip the
    # padding before converting so that a blank field becomes an empty
    # (falsy) value, which the ``os_name`` fallback check relies on.
    raw = self._read_bytes(offset, length).rstrip(b"\x00 ")
    return self._convert_header_text(raw)
378+
401379
def _parse_metadata(self) -> None:
402380
done = False
403381
while not done:
@@ -570,12 +548,9 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
570548

571549
buf = self._read_bytes(offset, text_block_size)
572550
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
573-
cname = cname_raw
574-
if self.convert_header_text:
575-
cname = cname.decode(self.encoding or self.default_encoding)
576-
self.column_names_strings.append(cname)
551+
self.column_names_raw.append(cname_raw)
577552

578-
if len(self.column_names_strings) == 1:
553+
if len(self.column_names_raw) == 1:
579554
compression_literal = b""
580555
for cl in const.compression_literals:
581556
if cl in cname_raw:
@@ -609,11 +584,8 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
609584
offset1 += 4
610585
buf = self._read_bytes(offset1, self._lcs)
611586
self.creator_proc = buf[0 : self._lcp]
612-
if self.convert_header_text:
613-
if hasattr(self, "creator_proc"):
614-
self.creator_proc = self.creator_proc.decode(
615-
self.encoding or self.default_encoding
616-
)
587+
if hasattr(self, "creator_proc"):
588+
self.creator_proc = self._convert_header_text(self.creator_proc)
617589

618590
def _process_columnname_subheader(self, offset: int, length: int) -> None:
619591
int_len = self._int_length
@@ -644,8 +616,9 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None:
644616
)
645617
col_len = self._read_int(col_name_length, const.column_name_length_length)
646618

647-
name_str = self.column_names_strings[idx]
648-
self.column_names.append(name_str[col_offset : col_offset + col_len])
619+
name_raw = self.column_names_raw[idx]
620+
cname = name_raw[col_offset : col_offset + col_len]
621+
self.column_names.append(self._convert_header_text(cname))
649622

650623
def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
651624
int_len = self._int_length
@@ -693,7 +666,7 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
693666
x = self._read_int(
694667
text_subheader_format, const.column_format_text_subheader_index_length
695668
)
696-
format_idx = min(x, len(self.column_names_strings) - 1)
669+
format_idx = min(x, len(self.column_names_raw) - 1)
697670

698671
format_start = self._read_int(
699672
col_format_offset, const.column_format_offset_length
@@ -703,15 +676,19 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
703676
label_idx = self._read_int(
704677
text_subheader_label, const.column_label_text_subheader_index_length
705678
)
706-
label_idx = min(label_idx, len(self.column_names_strings) - 1)
679+
label_idx = min(label_idx, len(self.column_names_raw) - 1)
707680

708681
label_start = self._read_int(col_label_offset, const.column_label_offset_length)
709682
label_len = self._read_int(col_label_len, const.column_label_length_length)
710683

711-
label_names = self.column_names_strings[label_idx]
712-
column_label = label_names[label_start : label_start + label_len]
713-
format_names = self.column_names_strings[format_idx]
714-
column_format = format_names[format_start : format_start + format_len]
684+
label_names = self.column_names_raw[label_idx]
685+
column_label = self._convert_header_text(
686+
label_names[label_start : label_start + label_len]
687+
)
688+
format_names = self.column_names_raw[format_idx]
689+
column_format = self._convert_header_text(
690+
format_names[format_start : format_start + format_len]
691+
)
715692
current_column_number = len(self.columns)
716693

717694
col = _Column(
@@ -809,9 +786,7 @@ def _chunk_to_dataframe(self) -> DataFrame:
809786
elif self._column_types[j] == b"s":
810787
rslt[name] = pd.Series(self._string_chunk[js, :], index=ix)
811788
if self.convert_text and (self.encoding is not None):
812-
rslt[name] = rslt[name].str.decode(
813-
self.encoding or self.default_encoding
814-
)
789+
rslt[name] = self._decode_string(rslt[name].str)
815790
if self.blank_missing:
816791
ii = rslt[name].str.len() == 0
817792
rslt[name][ii] = np.nan
@@ -822,3 +797,12 @@ def _chunk_to_dataframe(self) -> DataFrame:
822797

823798
df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False)
824799
return df
800+
801+
def _decode_string(self, b):
    """
    Decode ``b`` with the reader's encoding.

    ``b`` is any object exposing ``decode(encoding)``: header bytes, or
    the ``.str`` accessor handed over by ``_chunk_to_dataframe``. The
    codec is ``self.encoding`` when it is truthy and
    ``self.default_encoding`` otherwise.
    """
    codec = self.encoding if self.encoding else self.default_encoding
    return b.decode(codec)
803+
804+
def _convert_header_text(self, b: bytes) -> str | bytes:
    """
    Return header bytes as text when header conversion is enabled.

    When ``self.convert_header_text`` is truthy, ``b`` is passed to
    ``self._decode_string`` and the decoded result is returned;
    otherwise ``b`` is handed back unchanged.
    """
    return self._decode_string(b) if self.convert_header_text else b

0 commit comments

Comments
 (0)