42
42
)
43
43
44
44
from pandas .io .common import get_handle
45
- from pandas .io .sas ._sas import Parser
45
+ from pandas .io .sas ._sas import (
46
+ Parser ,
47
+ get_subheader_index ,
48
+ )
46
49
import pandas .io .sas .sas_constants as const
47
50
from pandas .io .sas .sasreader import ReaderBase
48
51
@@ -87,19 +90,6 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
87
90
return s_series
88
91
89
92
90
- class _SubheaderPointer :
91
- offset : int
92
- length : int
93
- compression : int
94
- ptype : int
95
-
96
- def __init__ (self , offset : int , length : int , compression : int , ptype : int ) -> None :
97
- self .offset = offset
98
- self .length = length
99
- self .compression = compression
100
- self .ptype = ptype
101
-
102
-
103
93
class _Column :
104
94
col_id : int
105
95
name : str | bytes
@@ -189,7 +179,7 @@ def __init__(
189
179
self .column_formats : list [str | bytes ] = []
190
180
self .columns : list [_Column ] = []
191
181
192
- self ._current_page_data_subheader_pointers : list [_SubheaderPointer ] = []
182
+ self ._current_page_data_subheader_pointers : list [tuple [ int , int ] ] = []
193
183
self ._cached_page = None
194
184
self ._column_data_lengths : list [int ] = []
195
185
self ._column_data_offsets : list [int ] = []
@@ -205,6 +195,19 @@ def __init__(
205
195
206
196
self ._path_or_buf = self .handles .handle
207
197
198
+ # Same order as const.SASIndex
199
+ self ._subheader_processors = [
200
+ self ._process_rowsize_subheader ,
201
+ self ._process_columnsize_subheader ,
202
+ self ._process_subheader_counts ,
203
+ self ._process_columntext_subheader ,
204
+ self ._process_columnname_subheader ,
205
+ self ._process_columnattributes_subheader ,
206
+ self ._process_format_subheader ,
207
+ self ._process_columnlist_subheader ,
208
+ None , # Data
209
+ ]
210
+
208
211
try :
209
212
self ._get_properties ()
210
213
self ._parse_metadata ()
@@ -426,89 +429,47 @@ def _process_page_metadata(self) -> None:
426
429
bit_offset = self ._page_bit_offset
427
430
428
431
for i in range (self ._current_page_subheaders_count ):
429
- pointer = self ._process_subheader_pointers (
430
- const .subheader_pointers_offset + bit_offset , i
431
- )
432
- if pointer .length == 0 :
433
- continue
434
- if pointer .compression == const .truncated_subheader_id :
435
- continue
436
- subheader_signature = self ._read_subheader_signature (pointer .offset )
437
- subheader_index = self ._get_subheader_index (
438
- subheader_signature , pointer .compression , pointer .ptype
439
- )
440
- self ._process_subheader (subheader_index , pointer )
441
-
442
- def _get_subheader_index (self , signature : bytes , compression , ptype ) -> int :
443
- # TODO: return here could be made an enum
444
- index = const .subheader_signature_to_index .get (signature )
445
- if index is None :
446
- f1 = (compression == const .compressed_subheader_id ) or (compression == 0 )
447
- f2 = ptype == const .compressed_subheader_type
448
- if (self .compression != b"" ) and f1 and f2 :
449
- index = const .SASIndex .data_subheader_index
450
- else :
451
- self .close ()
452
- raise ValueError ("Unknown subheader signature" )
453
- return index
454
-
455
- def _process_subheader_pointers (
456
- self , offset : int , subheader_pointer_index : int
457
- ) -> _SubheaderPointer :
458
-
459
- subheader_pointer_length = self ._subheader_pointer_length
460
- total_offset = offset + subheader_pointer_length * subheader_pointer_index
432
+ offset = const .subheader_pointers_offset + bit_offset
433
+ total_offset = offset + self ._subheader_pointer_length * i
461
434
462
- subheader_offset = self ._read_int (total_offset , self ._int_length )
463
- total_offset += self ._int_length
435
+ subheader_offset = self ._read_int (total_offset , self ._int_length )
436
+ total_offset += self ._int_length
464
437
465
- subheader_length = self ._read_int (total_offset , self ._int_length )
466
- total_offset += self ._int_length
438
+ subheader_length = self ._read_int (total_offset , self ._int_length )
439
+ total_offset += self ._int_length
467
440
468
- subheader_compression = self ._read_int (total_offset , 1 )
469
- total_offset += 1
470
-
471
- subheader_type = self ._read_int (total_offset , 1 )
472
-
473
- x = _SubheaderPointer (
474
- subheader_offset , subheader_length , subheader_compression , subheader_type
475
- )
441
+ subheader_compression = self ._read_int (total_offset , 1 )
442
+ total_offset += 1
476
443
477
- return x
444
+ subheader_type = self . _read_int ( total_offset , 1 )
478
445
479
- def _read_subheader_signature (self , offset : int ) -> bytes :
480
- subheader_signature = self ._read_bytes (offset , self ._int_length )
481
- return subheader_signature
482
-
483
- def _process_subheader (
484
- self , subheader_index : int , pointer : _SubheaderPointer
485
- ) -> None :
486
- offset = pointer .offset
487
- length = pointer .length
488
-
489
- if subheader_index == const .SASIndex .row_size_index :
490
- processor = self ._process_rowsize_subheader
491
- elif subheader_index == const .SASIndex .column_size_index :
492
- processor = self ._process_columnsize_subheader
493
- elif subheader_index == const .SASIndex .column_text_index :
494
- processor = self ._process_columntext_subheader
495
- elif subheader_index == const .SASIndex .column_name_index :
496
- processor = self ._process_columnname_subheader
497
- elif subheader_index == const .SASIndex .column_attributes_index :
498
- processor = self ._process_columnattributes_subheader
499
- elif subheader_index == const .SASIndex .format_and_label_index :
500
- processor = self ._process_format_subheader
501
- elif subheader_index == const .SASIndex .column_list_index :
502
- processor = self ._process_columnlist_subheader
503
- elif subheader_index == const .SASIndex .subheader_counts_index :
504
- processor = self ._process_subheader_counts
505
- elif subheader_index == const .SASIndex .data_subheader_index :
506
- self ._current_page_data_subheader_pointers .append (pointer )
507
- return
508
- else :
509
- raise ValueError ("unknown subheader index" )
446
+ if (
447
+ subheader_length == 0
448
+ or subheader_compression == const .truncated_subheader_id
449
+ ):
450
+ continue
510
451
511
- processor (offset , length )
452
+ subheader_signature = self ._read_bytes (subheader_offset , self ._int_length )
453
+ subheader_index = get_subheader_index (subheader_signature )
454
+ subheader_processor = self ._subheader_processors [subheader_index ]
455
+
456
+ if subheader_processor is None :
457
+ f1 = (
458
+ subheader_compression == const .compressed_subheader_id
459
+ or subheader_compression == 0
460
+ )
461
+ f2 = subheader_type == const .compressed_subheader_type
462
+ if self .compression and f1 and f2 :
463
+ self ._current_page_data_subheader_pointers .append (
464
+ (subheader_offset , subheader_length )
465
+ )
466
+ else :
467
+ self .close ()
468
+ raise ValueError (
469
+ f"Unknown subheader signature { subheader_signature } "
470
+ )
471
+ else :
472
+ subheader_processor (subheader_offset , subheader_length )
512
473
513
474
def _process_rowsize_subheader (self , offset : int , length : int ) -> None :
514
475
@@ -523,10 +484,12 @@ def _process_rowsize_subheader(self, offset: int, length: int) -> None:
523
484
lcp_offset += 378
524
485
525
486
self .row_length = self ._read_int (
526
- offset + const .row_length_offset_multiplier * int_len , int_len
487
+ offset + const .row_length_offset_multiplier * int_len ,
488
+ int_len ,
527
489
)
528
490
self .row_count = self ._read_int (
529
- offset + const .row_count_offset_multiplier * int_len , int_len
491
+ offset + const .row_count_offset_multiplier * int_len ,
492
+ int_len ,
530
493
)
531
494
self .col_count_p1 = self ._read_int (
532
495
offset + const .col_count_p1_multiplier * int_len , int_len
0 commit comments