@@ -182,9 +182,9 @@ def __init__(
182
182
183
183
self .default_encoding = "latin-1"
184
184
self .compression = b""
185
- self .column_names_strings : list [str ] = []
186
- self .column_names : list [str ] = []
187
- self .column_formats : list [str ] = []
185
+ self .column_names_raw : list [bytes ] = []
186
+ self .column_names : list [str | bytes ] = []
187
+ self .column_formats : list [str | bytes ] = []
188
188
self .columns : list [_Column ] = []
189
189
190
190
self ._current_page_data_subheader_pointers : list [_SubheaderPointer ] = []
@@ -278,17 +278,13 @@ def _get_properties(self) -> None:
278
278
else :
279
279
self .platform = "unknown"
280
280
281
- buf = self ._read_bytes (const .dataset_offset , const .dataset_length )
282
- self .name = buf .rstrip (b"\x00 " )
283
- if self .convert_header_text :
284
- self .name = self .name .decode (self .encoding or self .default_encoding )
281
+ self .name = self ._read_and_convert_header_text (
282
+ const .dataset_offset , const .dataset_length
283
+ )
285
284
286
- buf = self ._read_bytes (const .file_type_offset , const .file_type_length )
287
- self .file_type = buf .rstrip (b"\x00 " )
288
- if self .convert_header_text :
289
- self .file_type = self .file_type .decode (
290
- self .encoding or self .default_encoding
291
- )
285
+ self .file_type = self ._read_and_convert_header_text (
286
+ const .file_type_offset , const .file_type_length
287
+ )
292
288
293
289
# Timestamp is epoch 01/01/1960
294
290
epoch = datetime (1960 , 1 , 1 )
@@ -320,46 +316,25 @@ def _get_properties(self) -> None:
320
316
const .page_count_offset + align1 , const .page_count_length
321
317
)
322
318
323
- buf = self ._read_bytes (
319
# SAS release text from the file header. It is stored on ``sas_release``;
# ``const.sas_release_offset`` is only the position it is read from.
self.sas_release = self._read_and_convert_header_text(
    const.sas_release_offset + total_align, const.sas_release_length
)
326
- self .sas_release = buf .rstrip (b"\x00 " )
327
- if self .convert_header_text :
328
- self .sas_release = self .sas_release .decode (
329
- self .encoding or self .default_encoding
330
- )
331
322
332
- buf = self ._read_bytes (
323
+ self . server_type = self ._read_and_convert_header_text (
333
324
const .sas_server_type_offset + total_align , const .sas_server_type_length
334
325
)
335
- self .server_type = buf .rstrip (b"\x00 " )
336
- if self .convert_header_text :
337
- self .server_type = self .server_type .decode (
338
- self .encoding or self .default_encoding
339
- )
340
326
341
- buf = self ._read_bytes (
327
+ self . os_version = self ._read_and_convert_header_text (
342
328
const .os_version_number_offset + total_align , const .os_version_number_length
343
329
)
344
- self .os_version = buf .rstrip (b"\x00 " )
345
- if self .convert_header_text :
346
- self .os_version = self .os_version .decode (
347
- self .encoding or self .default_encoding
348
- )
349
330
350
- buf = self ._read_bytes (const .os_name_offset + total_align , const .os_name_length )
351
- buf = buf .rstrip (b"\x00 " )
352
- if len (buf ) > 0 :
353
- self .os_name = buf .decode (self .encoding or self .default_encoding )
354
- else :
355
- buf = self ._read_bytes (
331
+ self .os_name = self ._read_and_convert_header_text (
332
+ const .os_name_offset + total_align , const .os_name_length
333
+ )
334
+ if not self .os_name :
335
+ self .os_name = self ._read_and_convert_header_text (
356
336
const .os_maker_offset + total_align , const .os_maker_length
357
337
)
358
- self .os_name = buf .rstrip (b"\x00 " )
359
- if self .convert_header_text :
360
- self .os_name = self .os_name .decode (
361
- self .encoding or self .default_encoding
362
- )
363
338
364
339
def __next__ (self ) -> DataFrame :
365
340
da = self .read (nrows = self .chunksize or 1 )
@@ -402,6 +377,11 @@ def _read_bytes(self, offset: int, length: int):
402
377
raise ValueError ("The cached page is too small." )
403
378
return self ._cached_page [offset : offset + length ]
404
379
380
def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
    """
    Read a header text field and convert it via ``_convert_header_text``.

    The ``length`` bytes at ``offset`` are fetched with ``_read_bytes``,
    trailing NUL bytes and spaces are stripped, and the result is handed
    to ``_convert_header_text``.
    """
    raw = self._read_bytes(offset, length)
    trimmed = raw.rstrip(b"\x00 ")
    return self._convert_header_text(trimmed)
384
+
405
385
def _parse_metadata (self ) -> None :
406
386
done = False
407
387
while not done :
@@ -576,12 +556,9 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
576
556
577
557
buf = self ._read_bytes (offset , text_block_size )
578
558
cname_raw = buf [0 :text_block_size ].rstrip (b"\x00 " )
579
- cname = cname_raw
580
- if self .convert_header_text :
581
- cname = cname .decode (self .encoding or self .default_encoding )
582
- self .column_names_strings .append (cname )
559
+ self .column_names_raw .append (cname_raw )
583
560
584
- if len (self .column_names_strings ) == 1 :
561
+ if len (self .column_names_raw ) == 1 :
585
562
compression_literal = b""
586
563
for cl in const .compression_literals :
587
564
if cl in cname_raw :
@@ -615,11 +592,8 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
615
592
offset1 += 4
616
593
buf = self ._read_bytes (offset1 , self ._lcs )
617
594
self .creator_proc = buf [0 : self ._lcp ]
618
- if self .convert_header_text :
619
- if hasattr (self , "creator_proc" ):
620
- self .creator_proc = self .creator_proc .decode (
621
- self .encoding or self .default_encoding
622
- )
595
+ if hasattr (self , "creator_proc" ):
596
+ self .creator_proc = self ._convert_header_text (self .creator_proc )
623
597
624
598
def _process_columnname_subheader (self , offset : int , length : int ) -> None :
625
599
int_len = self ._int_length
@@ -650,8 +624,9 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None:
650
624
)
651
625
col_len = self ._read_int (col_name_length , const .column_name_length_length )
652
626
653
- name_str = self .column_names_strings [idx ]
654
- self .column_names .append (name_str [col_offset : col_offset + col_len ])
627
+ name_raw = self .column_names_raw [idx ]
628
+ cname = name_raw [col_offset : col_offset + col_len ]
629
+ self .column_names .append (self ._convert_header_text (cname ))
655
630
656
631
def _process_columnattributes_subheader (self , offset : int , length : int ) -> None :
657
632
int_len = self ._int_length
@@ -699,7 +674,7 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
699
674
x = self ._read_int (
700
675
text_subheader_format , const .column_format_text_subheader_index_length
701
676
)
702
- format_idx = min (x , len (self .column_names_strings ) - 1 )
677
+ format_idx = min (x , len (self .column_names_raw ) - 1 )
703
678
704
679
format_start = self ._read_int (
705
680
col_format_offset , const .column_format_offset_length
@@ -709,15 +684,19 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
709
684
label_idx = self ._read_int (
710
685
text_subheader_label , const .column_label_text_subheader_index_length
711
686
)
712
- label_idx = min (label_idx , len (self .column_names_strings ) - 1 )
687
+ label_idx = min (label_idx , len (self .column_names_raw ) - 1 )
713
688
714
689
label_start = self ._read_int (col_label_offset , const .column_label_offset_length )
715
690
label_len = self ._read_int (col_label_len , const .column_label_length_length )
716
691
717
- label_names = self .column_names_strings [label_idx ]
718
- column_label = label_names [label_start : label_start + label_len ]
719
- format_names = self .column_names_strings [format_idx ]
720
- column_format = format_names [format_start : format_start + format_len ]
692
+ label_names = self .column_names_raw [label_idx ]
693
+ column_label = self ._convert_header_text (
694
+ label_names [label_start : label_start + label_len ]
695
+ )
696
+ format_names = self .column_names_raw [format_idx ]
697
+ column_format = self ._convert_header_text (
698
+ format_names [format_start : format_start + format_len ]
699
+ )
721
700
current_column_number = len (self .columns )
722
701
723
702
col = _Column (
@@ -815,9 +794,7 @@ def _chunk_to_dataframe(self) -> DataFrame:
815
794
elif self ._column_types [j ] == b"s" :
816
795
rslt [name ] = pd .Series (self ._string_chunk [js , :], index = ix )
817
796
if self .convert_text and (self .encoding is not None ):
818
- rslt [name ] = rslt [name ].str .decode (
819
- self .encoding or self .default_encoding
820
- )
797
+ rslt [name ] = self ._decode_string (rslt [name ].str )
821
798
if self .blank_missing :
822
799
ii = rslt [name ].str .len () == 0
823
800
rslt [name ][ii ] = np .nan
@@ -828,3 +805,12 @@ def _chunk_to_dataframe(self) -> DataFrame:
828
805
829
806
df = DataFrame (rslt , columns = self .column_names , index = ix , copy = False )
830
807
return df
808
+
809
def _decode_string(self, b):
    """
    Decode ``b`` using ``self.encoding``, or ``self.default_encoding``
    when no encoding is set.

    ``b`` only needs to expose a ``decode(encoding)`` method.
    """
    codec = self.encoding or self.default_encoding
    return b.decode(codec)
811
+
812
def _convert_header_text(self, b: bytes) -> str | bytes:
    """
    Return ``b`` decoded via ``_decode_string`` when the
    ``convert_header_text`` flag is set; otherwise return ``b`` unchanged.
    """
    if not self.convert_header_text:
        return b
    return self._decode_string(b)
0 commit comments