@@ -180,9 +180,9 @@ def __init__(
180
180
181
181
self .default_encoding = "latin-1"
182
182
self .compression = b""
183
- self .column_names_strings : list [str ] = []
184
- self .column_names : list [str ] = []
185
- self .column_formats : list [str ] = []
183
+ self .column_names_raw : list [bytes ] = []
184
+ self .column_names : list [str | bytes ] = []
185
+ self .column_formats : list [str | bytes ] = []
186
186
self .columns : list [_Column ] = []
187
187
188
188
self ._current_page_data_subheader_pointers : list [_SubheaderPointer ] = []
@@ -274,17 +274,13 @@ def _get_properties(self) -> None:
274
274
else :
275
275
self .platform = "unknown"
276
276
277
- buf = self ._read_bytes (const .dataset_offset , const .dataset_length )
278
- self .name = buf .rstrip (b"\x00 " )
279
- if self .convert_header_text :
280
- self .name = self .name .decode (self .encoding or self .default_encoding )
277
+ self .name = self ._read_and_convert_header_text (
278
+ const .dataset_offset , const .dataset_length
279
+ )
281
280
282
- buf = self ._read_bytes (const .file_type_offset , const .file_type_length )
283
- self .file_type = buf .rstrip (b"\x00 " )
284
- if self .convert_header_text :
285
- self .file_type = self .file_type .decode (
286
- self .encoding or self .default_encoding
287
- )
281
+ self .file_type = self ._read_and_convert_header_text (
282
+ const .file_type_offset , const .file_type_length
283
+ )
288
284
289
285
# Timestamp is epoch 01/01/1960
290
286
epoch = datetime (1960 , 1 , 1 )
@@ -316,46 +312,25 @@ def _get_properties(self) -> None:
316
312
const .page_count_offset + align1 , const .page_count_length
317
313
)
318
314
319
- buf = self ._read_bytes (
315
# Header field holding the SAS release text. It is stored on
# ``sas_release``; ``const.sas_release_offset`` is only the position of
# the field inside the header and must not be used as the attribute name.
self.sas_release = self._read_and_convert_header_text(
    const.sas_release_offset + total_align, const.sas_release_length
)
322
- self .sas_release = buf .rstrip (b"\x00 " )
323
- if self .convert_header_text :
324
- self .sas_release = self .sas_release .decode (
325
- self .encoding or self .default_encoding
326
- )
327
318
328
- buf = self ._read_bytes (
319
+ self . server_type = self ._read_and_convert_header_text (
329
320
const .sas_server_type_offset + total_align , const .sas_server_type_length
330
321
)
331
- self .server_type = buf .rstrip (b"\x00 " )
332
- if self .convert_header_text :
333
- self .server_type = self .server_type .decode (
334
- self .encoding or self .default_encoding
335
- )
336
322
337
- buf = self ._read_bytes (
323
+ self . os_version = self ._read_and_convert_header_text (
338
324
const .os_version_number_offset + total_align , const .os_version_number_length
339
325
)
340
- self .os_version = buf .rstrip (b"\x00 " )
341
- if self .convert_header_text :
342
- self .os_version = self .os_version .decode (
343
- self .encoding or self .default_encoding
344
- )
345
326
346
- buf = self ._read_bytes (const .os_name_offset + total_align , const .os_name_length )
347
- buf = buf .rstrip (b"\x00 " )
348
- if len (buf ) > 0 :
349
- self .os_name = buf .decode (self .encoding or self .default_encoding )
350
- else :
351
- buf = self ._read_bytes (
327
+ self .os_name = self ._read_and_convert_header_text (
328
+ const .os_name_offset + total_align , const .os_name_length
329
+ )
330
+ if not self .os_name :
331
+ self .os_name = self ._read_and_convert_header_text (
352
332
const .os_maker_offset + total_align , const .os_maker_length
353
333
)
354
- self .os_name = buf .rstrip (b"\x00 " )
355
- if self .convert_header_text :
356
- self .os_name = self .os_name .decode (
357
- self .encoding or self .default_encoding
358
- )
359
334
360
335
def __next__ (self ):
361
336
da = self .read (nrows = self .chunksize or 1 )
@@ -398,6 +373,9 @@ def _read_bytes(self, offset: int, length: int):
398
373
raise ValueError ("The cached page is too small." )
399
374
return self ._cached_page [offset : offset + length ]
400
375
376
def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
    """Fetch a header field and run it through ``_convert_header_text``.

    Parameters
    ----------
    offset : int
        Start position handed to ``_read_bytes``.
    length : int
        Number of bytes handed to ``_read_bytes``.

    Returns
    -------
    str or bytes
        Whatever ``_convert_header_text`` returns for the bytes read.
    """
    raw = self._read_bytes(offset, length)
    return self._convert_header_text(raw)
378
+
401
379
def _parse_metadata (self ) -> None :
402
380
done = False
403
381
while not done :
@@ -570,12 +548,9 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
570
548
571
549
buf = self ._read_bytes (offset , text_block_size )
572
550
cname_raw = buf [0 :text_block_size ].rstrip (b"\x00 " )
573
- cname = cname_raw
574
- if self .convert_header_text :
575
- cname = cname .decode (self .encoding or self .default_encoding )
576
- self .column_names_strings .append (cname )
551
+ self .column_names_raw .append (cname_raw )
577
552
578
- if len (self .column_names_strings ) == 1 :
553
+ if len (self .column_names_raw ) == 1 :
579
554
compression_literal = b""
580
555
for cl in const .compression_literals :
581
556
if cl in cname_raw :
@@ -609,11 +584,8 @@ def _process_columntext_subheader(self, offset: int, length: int) -> None:
609
584
offset1 += 4
610
585
buf = self ._read_bytes (offset1 , self ._lcs )
611
586
self .creator_proc = buf [0 : self ._lcp ]
612
- if self .convert_header_text :
613
- if hasattr (self , "creator_proc" ):
614
- self .creator_proc = self .creator_proc .decode (
615
- self .encoding or self .default_encoding
616
- )
587
+ if hasattr (self , "creator_proc" ):
588
+ self .creator_proc = self ._convert_header_text (self .creator_proc )
617
589
618
590
def _process_columnname_subheader (self , offset : int , length : int ) -> None :
619
591
int_len = self ._int_length
@@ -644,8 +616,9 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None:
644
616
)
645
617
col_len = self ._read_int (col_name_length , const .column_name_length_length )
646
618
647
- name_str = self .column_names_strings [idx ]
648
- self .column_names .append (name_str [col_offset : col_offset + col_len ])
619
+ name_raw = self .column_names_raw [idx ]
620
+ cname = name_raw [col_offset : col_offset + col_len ]
621
+ self .column_names .append (self ._convert_header_text (cname ))
649
622
650
623
def _process_columnattributes_subheader (self , offset : int , length : int ) -> None :
651
624
int_len = self ._int_length
@@ -693,7 +666,7 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
693
666
x = self ._read_int (
694
667
text_subheader_format , const .column_format_text_subheader_index_length
695
668
)
696
- format_idx = min (x , len (self .column_names_strings ) - 1 )
669
+ format_idx = min (x , len (self .column_names_raw ) - 1 )
697
670
698
671
format_start = self ._read_int (
699
672
col_format_offset , const .column_format_offset_length
@@ -703,15 +676,19 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
703
676
label_idx = self ._read_int (
704
677
text_subheader_label , const .column_label_text_subheader_index_length
705
678
)
706
- label_idx = min (label_idx , len (self .column_names_strings ) - 1 )
679
+ label_idx = min (label_idx , len (self .column_names_raw ) - 1 )
707
680
708
681
label_start = self ._read_int (col_label_offset , const .column_label_offset_length )
709
682
label_len = self ._read_int (col_label_len , const .column_label_length_length )
710
683
711
- label_names = self .column_names_strings [label_idx ]
712
- column_label = label_names [label_start : label_start + label_len ]
713
- format_names = self .column_names_strings [format_idx ]
714
- column_format = format_names [format_start : format_start + format_len ]
684
+ label_names = self .column_names_raw [label_idx ]
685
+ column_label = self ._convert_header_text (
686
+ label_names [label_start : label_start + label_len ]
687
+ )
688
+ format_names = self .column_names_raw [format_idx ]
689
+ column_format = self ._convert_header_text (
690
+ format_names [format_start : format_start + format_len ]
691
+ )
715
692
current_column_number = len (self .columns )
716
693
717
694
col = _Column (
@@ -809,9 +786,7 @@ def _chunk_to_dataframe(self) -> DataFrame:
809
786
elif self ._column_types [j ] == b"s" :
810
787
rslt [name ] = pd .Series (self ._string_chunk [js , :], index = ix )
811
788
if self .convert_text and (self .encoding is not None ):
812
- rslt [name ] = rslt [name ].str .decode (
813
- self .encoding or self .default_encoding
814
- )
789
+ rslt [name ] = self ._decode_string (rslt [name ].str )
815
790
if self .blank_missing :
816
791
ii = rslt [name ].str .len () == 0
817
792
rslt [name ][ii ] = np .nan
@@ -822,3 +797,12 @@ def _chunk_to_dataframe(self) -> DataFrame:
822
797
823
798
df = DataFrame (rslt , columns = self .column_names , index = ix , copy = False )
824
799
return df
800
+
801
def _decode_string(self, b):
    """Decode ``b`` with the reader's encoding.

    ``self.encoding`` is used when it is truthy; otherwise
    ``self.default_encoding`` is used. ``b`` only needs to expose a
    ``decode(encoding)`` method.
    """
    codec = self.encoding or self.default_encoding
    return b.decode(codec)
803
+
804
def _convert_header_text(self, b: bytes) -> str | bytes:
    """Decode header bytes when ``self.convert_header_text`` is set.

    Returns ``b`` unchanged when the flag is falsy; otherwise returns the
    result of ``self._decode_string(b)``.
    """
    if not self.convert_header_text:
        return b
    return self._decode_string(b)
0 commit comments