20
20
datetime ,
21
21
timedelta ,
22
22
)
23
- import struct
23
+ import sys
24
24
from typing import cast
25
25
26
26
import numpy as np
42
42
)
43
43
44
44
from pandas .io .common import get_handle
45
+ from pandas .io .sas ._byteswap import (
46
+ read_double_with_byteswap ,
47
+ read_float_with_byteswap ,
48
+ read_uint16_with_byteswap ,
49
+ read_uint32_with_byteswap ,
50
+ read_uint64_with_byteswap ,
51
+ )
45
52
from pandas .io .sas ._sas import (
46
53
Parser ,
47
54
get_subheader_index ,
@@ -263,8 +270,10 @@ def _get_properties(self) -> None:
263
270
buf = self ._read_bytes (const .endianness_offset , const .endianness_length )
264
271
if buf == b"\x01 " :
265
272
self .byte_order = "<"
273
+ self .need_byteswap = sys .byteorder == "big"
266
274
else :
267
275
self .byte_order = ">"
276
+ self .need_byteswap = sys .byteorder == "little"
268
277
269
278
# Get encoding information
270
279
buf = self ._read_bytes (const .encoding_offset , const .encoding_length )[0 ]
@@ -286,7 +295,7 @@ def _get_properties(self) -> None:
286
295
)
287
296
self .date_modified = epoch + pd .to_timedelta (x , unit = "s" )
288
297
289
- self .header_length = self ._read_int (
298
+ self .header_length = self ._read_uint (
290
299
const .header_size_offset + align1 , const .header_size_length
291
300
)
292
301
@@ -298,7 +307,7 @@ def _get_properties(self) -> None:
298
307
if len (self ._cached_page ) != self .header_length : # type: ignore[arg-type]
299
308
raise ValueError ("The SAS7BDAT file appears to be truncated." )
300
309
301
- self ._page_length = self ._read_int (
310
+ self ._page_length = self ._read_uint (
302
311
const .page_size_offset + align1 , const .page_size_length
303
312
)
304
313
@@ -311,37 +320,46 @@ def __next__(self) -> DataFrame:
311
320
312
321
# Read a single float of the given width (4 or 8).
313
322
def _read_float (self , offset : int , width : int ):
314
- if width not in (4 , 8 ):
323
+ assert self ._cached_page is not None
324
+ if width == 4 :
325
+ return read_float_with_byteswap (
326
+ self ._cached_page , offset , self .need_byteswap
327
+ )
328
+ elif width == 8 :
329
+ return read_double_with_byteswap (
330
+ self ._cached_page , offset , self .need_byteswap
331
+ )
332
+ else :
315
333
self .close ()
316
334
raise ValueError ("invalid float width" )
317
- buf = self ._read_bytes (offset , width )
318
- fd = "f" if width == 4 else "d"
319
- return struct .unpack (self .byte_order + fd , buf )[0 ]
320
335
321
- # Read a single signed integer of the given width (1, 2, 4 or 8).
322
- def _read_int (self , offset : int , width : int ) -> int :
323
- if width not in (1 , 2 , 4 , 8 ):
336
+ # Read a single unsigned integer of the given width (1, 2, 4 or 8).
337
+ def _read_uint (self , offset : int , width : int ) -> int :
338
+ assert self ._cached_page is not None
339
+ if width == 1 :
340
+ return self ._read_bytes (offset , 1 )[0 ]
341
+ elif width == 2 :
342
+ return read_uint16_with_byteswap (
343
+ self ._cached_page , offset , self .need_byteswap
344
+ )
345
+ elif width == 4 :
346
+ return read_uint32_with_byteswap (
347
+ self ._cached_page , offset , self .need_byteswap
348
+ )
349
+ elif width == 8 :
350
+ return read_uint64_with_byteswap (
351
+ self ._cached_page , offset , self .need_byteswap
352
+ )
353
+ else :
324
354
self .close ()
325
355
raise ValueError ("invalid int width" )
326
- buf = self ._read_bytes (offset , width )
327
- it = {1 : "b" , 2 : "h" , 4 : "l" , 8 : "q" }[width ]
328
- iv = struct .unpack (self .byte_order + it , buf )[0 ]
329
- return iv
330
356
331
357
def _read_bytes (self , offset : int , length : int ):
332
- if self ._cached_page is None :
333
- self ._path_or_buf .seek (offset )
334
- buf = self ._path_or_buf .read (length )
335
- if len (buf ) < length :
336
- self .close ()
337
- msg = f"Unable to read { length :d} bytes from file position { offset :d} ."
338
- raise ValueError (msg )
339
- return buf
340
- else :
341
- if offset + length > len (self ._cached_page ):
342
- self .close ()
343
- raise ValueError ("The cached page is too small." )
344
- return self ._cached_page [offset : offset + length ]
358
+ assert self ._cached_page is not None
359
+ if offset + length > len (self ._cached_page ):
360
+ self .close ()
361
+ raise ValueError ("The cached page is too small." )
362
+ return self ._cached_page [offset : offset + length ]
345
363
346
364
def _read_and_convert_header_text (self , offset : int , length : int ) -> str | bytes :
347
365
return self ._convert_header_text (
@@ -375,12 +393,12 @@ def _read_page_header(self) -> None:
375
393
bit_offset = self ._page_bit_offset
376
394
tx = const .page_type_offset + bit_offset
377
395
self ._current_page_type = (
378
- self ._read_int (tx , const .page_type_length ) & const .page_type_mask2
396
+ self ._read_uint (tx , const .page_type_length ) & const .page_type_mask2
379
397
)
380
398
tx = const .block_count_offset + bit_offset
381
- self ._current_page_block_count = self ._read_int (tx , const .block_count_length )
399
+ self ._current_page_block_count = self ._read_uint (tx , const .block_count_length )
382
400
tx = const .subheader_count_offset + bit_offset
383
- self ._current_page_subheaders_count = self ._read_int (
401
+ self ._current_page_subheaders_count = self ._read_uint (
384
402
tx , const .subheader_count_length
385
403
)
386
404
@@ -391,16 +409,16 @@ def _process_page_metadata(self) -> None:
391
409
offset = const .subheader_pointers_offset + bit_offset
392
410
total_offset = offset + self ._subheader_pointer_length * i
393
411
394
- subheader_offset = self ._read_int (total_offset , self ._int_length )
412
+ subheader_offset = self ._read_uint (total_offset , self ._int_length )
395
413
total_offset += self ._int_length
396
414
397
- subheader_length = self ._read_int (total_offset , self ._int_length )
415
+ subheader_length = self ._read_uint (total_offset , self ._int_length )
398
416
total_offset += self ._int_length
399
417
400
- subheader_compression = self ._read_int (total_offset , 1 )
418
+ subheader_compression = self ._read_uint (total_offset , 1 )
401
419
total_offset += 1
402
420
403
- subheader_type = self ._read_int (total_offset , 1 )
421
+ subheader_type = self ._read_uint (total_offset , 1 )
404
422
405
423
if (
406
424
subheader_length == 0
@@ -442,29 +460,29 @@ def _process_rowsize_subheader(self, offset: int, length: int) -> None:
442
460
lcs_offset += 354
443
461
lcp_offset += 378
444
462
445
- self .row_length = self ._read_int (
463
+ self .row_length = self ._read_uint (
446
464
offset + const .row_length_offset_multiplier * int_len ,
447
465
int_len ,
448
466
)
449
- self .row_count = self ._read_int (
467
+ self .row_count = self ._read_uint (
450
468
offset + const .row_count_offset_multiplier * int_len ,
451
469
int_len ,
452
470
)
453
- self .col_count_p1 = self ._read_int (
471
+ self .col_count_p1 = self ._read_uint (
454
472
offset + const .col_count_p1_multiplier * int_len , int_len
455
473
)
456
- self .col_count_p2 = self ._read_int (
474
+ self .col_count_p2 = self ._read_uint (
457
475
offset + const .col_count_p2_multiplier * int_len , int_len
458
476
)
459
477
mx = const .row_count_on_mix_page_offset_multiplier * int_len
460
- self ._mix_page_row_count = self ._read_int (offset + mx , int_len )
461
- self ._lcs = self ._read_int (lcs_offset , 2 )
462
- self ._lcp = self ._read_int (lcp_offset , 2 )
478
+ self ._mix_page_row_count = self ._read_uint (offset + mx , int_len )
479
+ self ._lcs = self ._read_uint (lcs_offset , 2 )
480
+ self ._lcp = self ._read_uint (lcp_offset , 2 )
463
481
464
482
def _process_columnsize_subheader (self , offset : int , length : int ) -> None :
465
483
int_len = self ._int_length
466
484
offset += int_len
467
- self .column_count = self ._read_int (offset , int_len )
485
+ self .column_count = self ._read_uint (offset , int_len )
468
486
if self .col_count_p1 + self .col_count_p2 != self .column_count :
469
487
print (
470
488
f"Warning: column count mismatch ({ self .col_count_p1 } + "
@@ -478,7 +496,7 @@ def _process_subheader_counts(self, offset: int, length: int) -> None:
478
496
def _process_columntext_subheader (self , offset : int , length : int ) -> None :
479
497
480
498
offset += self ._int_length
481
- text_block_size = self ._read_int (offset , const .text_block_size_length )
499
+ text_block_size = self ._read_uint (offset , const .text_block_size_length )
482
500
483
501
buf = self ._read_bytes (offset , text_block_size )
484
502
cname_raw = buf [0 :text_block_size ].rstrip (b"\x00 " )
@@ -542,13 +560,13 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None:
542
560
+ const .column_name_length_offset
543
561
)
544
562
545
- idx = self ._read_int (
563
+ idx = self ._read_uint (
546
564
text_subheader , const .column_name_text_subheader_length
547
565
)
548
- col_offset = self ._read_int (
566
+ col_offset = self ._read_uint (
549
567
col_name_offset , const .column_name_offset_length
550
568
)
551
- col_len = self ._read_int (col_name_length , const .column_name_length_length )
569
+ col_len = self ._read_uint (col_name_length , const .column_name_length_length )
552
570
553
571
name_raw = self .column_names_raw [idx ]
554
572
cname = name_raw [col_offset : col_offset + col_len ]
@@ -571,13 +589,13 @@ def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
571
589
offset + 2 * int_len + const .column_type_offset + i * (int_len + 8 )
572
590
)
573
591
574
- x = self ._read_int (col_data_offset , int_len )
592
+ x = self ._read_uint (col_data_offset , int_len )
575
593
self ._column_data_offsets .append (x )
576
594
577
- x = self ._read_int (col_data_len , const .column_data_length_length )
595
+ x = self ._read_uint (col_data_len , const .column_data_length_length )
578
596
self ._column_data_lengths .append (x )
579
597
580
- x = self ._read_int (col_types , const .column_type_length )
598
+ x = self ._read_uint (col_types , const .column_type_length )
581
599
self ._column_types .append (b"d" if x == 1 else b"s" )
582
600
583
601
def _process_columnlist_subheader (self , offset : int , length : int ) -> None :
@@ -597,23 +615,25 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
597
615
col_label_offset = offset + const .column_label_offset_offset + 3 * int_len
598
616
col_label_len = offset + const .column_label_length_offset + 3 * int_len
599
617
600
- x = self ._read_int (
618
+ x = self ._read_uint (
601
619
text_subheader_format , const .column_format_text_subheader_index_length
602
620
)
603
621
format_idx = min (x , len (self .column_names_raw ) - 1 )
604
622
605
- format_start = self ._read_int (
623
+ format_start = self ._read_uint (
606
624
col_format_offset , const .column_format_offset_length
607
625
)
608
- format_len = self ._read_int (col_format_len , const .column_format_length_length )
626
+ format_len = self ._read_uint (col_format_len , const .column_format_length_length )
609
627
610
- label_idx = self ._read_int (
628
+ label_idx = self ._read_uint (
611
629
text_subheader_label , const .column_label_text_subheader_index_length
612
630
)
613
631
label_idx = min (label_idx , len (self .column_names_raw ) - 1 )
614
632
615
- label_start = self ._read_int (col_label_offset , const .column_label_offset_length )
616
- label_len = self ._read_int (col_label_len , const .column_label_length_length )
633
+ label_start = self ._read_uint (
634
+ col_label_offset , const .column_label_offset_length
635
+ )
636
+ label_len = self ._read_uint (col_label_len , const .column_label_length_length )
617
637
618
638
label_names = self .column_names_raw [label_idx ]
619
639
column_label = self ._convert_header_text (
0 commit comments