42
42
)
43
43
44
44
from pandas .io .common import get_handle
45
- from pandas .io .sas ._sas import Parser
45
+ from pandas .io .sas ._sas import (
46
+ Parser ,
47
+ get_subheader_index ,
48
+ )
46
49
import pandas .io .sas .sas_constants as const
47
50
from pandas .io .sas .sasreader import ReaderBase
48
51
@@ -87,19 +90,6 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
87
90
return s_series
88
91
89
92
90
- class _SubheaderPointer :
91
- offset : int
92
- length : int
93
- compression : int
94
- ptype : int
95
-
96
- def __init__ (self , offset : int , length : int , compression : int , ptype : int ) -> None :
97
- self .offset = offset
98
- self .length = length
99
- self .compression = compression
100
- self .ptype = ptype
101
-
102
-
103
93
class _Column :
104
94
col_id : int
105
95
name : str | bytes
@@ -187,7 +177,7 @@ def __init__(
187
177
self .column_formats : list [str | bytes ] = []
188
178
self .columns : list [_Column ] = []
189
179
190
- self ._current_page_data_subheader_pointers : list [_SubheaderPointer ] = []
180
+ self ._current_page_data_subheader_pointers : list [tuple [ int , int ] ] = []
191
181
self ._cached_page = None
192
182
self ._column_data_lengths : list [int ] = []
193
183
self ._column_data_offsets : list [int ] = []
@@ -203,6 +193,19 @@ def __init__(
203
193
204
194
self ._path_or_buf = self .handles .handle
205
195
196
+ # Same order as const.SASIndex
197
+ self ._subheader_processors = [
198
+ self ._process_rowsize_subheader ,
199
+ self ._process_columnsize_subheader ,
200
+ self ._process_subheader_counts ,
201
+ self ._process_columntext_subheader ,
202
+ self ._process_columnname_subheader ,
203
+ self ._process_columnattributes_subheader ,
204
+ self ._process_format_subheader ,
205
+ self ._process_columnlist_subheader ,
206
+ None , # Data
207
+ ]
208
+
206
209
try :
207
210
self ._get_properties ()
208
211
self ._parse_metadata ()
@@ -422,89 +425,47 @@ def _process_page_metadata(self) -> None:
422
425
bit_offset = self ._page_bit_offset
423
426
424
427
for i in range (self ._current_page_subheaders_count ):
425
- pointer = self ._process_subheader_pointers (
426
- const .subheader_pointers_offset + bit_offset , i
427
- )
428
- if pointer .length == 0 :
429
- continue
430
- if pointer .compression == const .truncated_subheader_id :
431
- continue
432
- subheader_signature = self ._read_subheader_signature (pointer .offset )
433
- subheader_index = self ._get_subheader_index (
434
- subheader_signature , pointer .compression , pointer .ptype
435
- )
436
- self ._process_subheader (subheader_index , pointer )
437
-
438
- def _get_subheader_index (self , signature : bytes , compression , ptype ) -> int :
439
- # TODO: return here could be made an enum
440
- index = const .subheader_signature_to_index .get (signature )
441
- if index is None :
442
- f1 = (compression == const .compressed_subheader_id ) or (compression == 0 )
443
- f2 = ptype == const .compressed_subheader_type
444
- if (self .compression != b"" ) and f1 and f2 :
445
- index = const .SASIndex .data_subheader_index
446
- else :
447
- self .close ()
448
- raise ValueError ("Unknown subheader signature" )
449
- return index
450
-
451
- def _process_subheader_pointers (
452
- self , offset : int , subheader_pointer_index : int
453
- ) -> _SubheaderPointer :
454
-
455
- subheader_pointer_length = self ._subheader_pointer_length
456
- total_offset = offset + subheader_pointer_length * subheader_pointer_index
428
+ offset = const .subheader_pointers_offset + bit_offset
429
+ total_offset = offset + self ._subheader_pointer_length * i
457
430
458
- subheader_offset = self ._read_int (total_offset , self ._int_length )
459
- total_offset += self ._int_length
431
+ subheader_offset = self ._read_int (total_offset , self ._int_length )
432
+ total_offset += self ._int_length
460
433
461
- subheader_length = self ._read_int (total_offset , self ._int_length )
462
- total_offset += self ._int_length
434
+ subheader_length = self ._read_int (total_offset , self ._int_length )
435
+ total_offset += self ._int_length
463
436
464
- subheader_compression = self ._read_int (total_offset , 1 )
465
- total_offset += 1
466
-
467
- subheader_type = self ._read_int (total_offset , 1 )
468
-
469
- x = _SubheaderPointer (
470
- subheader_offset , subheader_length , subheader_compression , subheader_type
471
- )
437
+ subheader_compression = self ._read_int (total_offset , 1 )
438
+ total_offset += 1
472
439
473
- return x
440
+ subheader_type = self . _read_int ( total_offset , 1 )
474
441
475
- def _read_subheader_signature (self , offset : int ) -> bytes :
476
- subheader_signature = self ._read_bytes (offset , self ._int_length )
477
- return subheader_signature
478
-
479
- def _process_subheader (
480
- self , subheader_index : int , pointer : _SubheaderPointer
481
- ) -> None :
482
- offset = pointer .offset
483
- length = pointer .length
484
-
485
- if subheader_index == const .SASIndex .row_size_index :
486
- processor = self ._process_rowsize_subheader
487
- elif subheader_index == const .SASIndex .column_size_index :
488
- processor = self ._process_columnsize_subheader
489
- elif subheader_index == const .SASIndex .column_text_index :
490
- processor = self ._process_columntext_subheader
491
- elif subheader_index == const .SASIndex .column_name_index :
492
- processor = self ._process_columnname_subheader
493
- elif subheader_index == const .SASIndex .column_attributes_index :
494
- processor = self ._process_columnattributes_subheader
495
- elif subheader_index == const .SASIndex .format_and_label_index :
496
- processor = self ._process_format_subheader
497
- elif subheader_index == const .SASIndex .column_list_index :
498
- processor = self ._process_columnlist_subheader
499
- elif subheader_index == const .SASIndex .subheader_counts_index :
500
- processor = self ._process_subheader_counts
501
- elif subheader_index == const .SASIndex .data_subheader_index :
502
- self ._current_page_data_subheader_pointers .append (pointer )
503
- return
504
- else :
505
- raise ValueError ("unknown subheader index" )
442
+ if (
443
+ subheader_length == 0
444
+ or subheader_compression == const .truncated_subheader_id
445
+ ):
446
+ continue
506
447
507
- processor (offset , length )
448
+ subheader_signature = self ._read_bytes (subheader_offset , self ._int_length )
449
+ subheader_index = get_subheader_index (subheader_signature )
450
+ subheader_processor = self ._subheader_processors [subheader_index ]
451
+
452
+ if subheader_processor is None :
453
+ f1 = (
454
+ subheader_compression == const .compressed_subheader_id
455
+ or subheader_compression == 0
456
+ )
457
+ f2 = subheader_type == const .compressed_subheader_type
458
+ if self .compression and f1 and f2 :
459
+ self ._current_page_data_subheader_pointers .append (
460
+ (subheader_offset , subheader_length )
461
+ )
462
+ else :
463
+ self .close ()
464
+ raise ValueError (
465
+ f"Unknown subheader signature { subheader_signature } "
466
+ )
467
+ else :
468
+ subheader_processor (subheader_offset , subheader_length )
508
469
509
470
def _process_rowsize_subheader (self , offset : int , length : int ) -> None :
510
471
@@ -519,10 +480,12 @@ def _process_rowsize_subheader(self, offset: int, length: int) -> None:
519
480
lcp_offset += 378
520
481
521
482
self .row_length = self ._read_int (
522
- offset + const .row_length_offset_multiplier * int_len , int_len
483
+ offset + const .row_length_offset_multiplier * int_len ,
484
+ int_len ,
523
485
)
524
486
self .row_count = self ._read_int (
525
- offset + const .row_count_offset_multiplier * int_len , int_len
487
+ offset + const .row_count_offset_multiplier * int_len ,
488
+ int_len ,
526
489
)
527
490
self .col_count_p1 = self ._read_int (
528
491
offset + const .col_count_p1_multiplier * int_len , int_len
0 commit comments