13
13
Reference for binary data compression:
14
14
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
15
15
"""
16
+ from __future__ import annotations
17
+
16
18
from collections import abc
17
19
from datetime import (
18
20
datetime ,
34
36
)
35
37
36
38
import pandas as pd
37
- from pandas import isna
39
+ from pandas import (
40
+ DataFrame ,
41
+ isna ,
42
+ )
38
43
39
44
from pandas .io .common import get_handle
40
45
from pandas .io .sas ._sas import Parser
@@ -150,6 +155,9 @@ class SAS7BDATReader(ReaderBase, abc.Iterator):
150
155
bytes.
151
156
"""
152
157
158
+ _int_length : int
159
+ _cached_page : bytes | None
160
+
153
161
def __init__ (
154
162
self ,
155
163
path_or_buf ,
@@ -198,29 +206,29 @@ def __init__(
198
206
self .close ()
199
207
raise
200
208
201
def column_data_lengths(self) -> np.ndarray:
    """Return a numpy int64 array of the column data lengths."""
    # Always hand back an int64 ndarray built from self._column_data_lengths.
    return np.asarray(self._column_data_lengths, dtype=np.int64)
204
212
205
def column_data_offsets(self) -> np.ndarray:
    """Return a numpy int64 array of the column offsets."""
    # Always hand back an int64 ndarray built from self._column_data_offsets.
    return np.asarray(self._column_data_offsets, dtype=np.int64)
208
216
209
def column_types(self) -> np.ndarray:
    """
    Return a numpy character array of the column types:
       s (string) or d (double)
    """
    # "S1" stores each entry of self._column_types as a single byte.
    return np.asarray(self._column_types, dtype=np.dtype("S1"))
215
223
216
def close(self) -> None:
    """Close the I/O handles held in ``self.handles``."""
    self.handles.close()
218
226
219
- def _get_properties (self ):
227
+ def _get_properties (self ) -> None :
220
228
221
229
# Check magic number
222
230
self ._path_or_buf .seek (0 )
223
- self ._cached_page = self ._path_or_buf .read (288 )
231
+ self ._cached_page = cast ( bytes , self ._path_or_buf .read (288 ) )
224
232
if self ._cached_page [0 : len (const .magic )] != const .magic :
225
233
raise ValueError ("magic number mismatch (not a SAS file?)" )
226
234
@@ -294,9 +302,11 @@ def _get_properties(self):
294
302
)
295
303
296
304
# Read the rest of the header into cached_page.
297
- buf = self ._path_or_buf .read (self .header_length - 288 )
305
+ buf = cast ( bytes , self ._path_or_buf .read (self .header_length - 288 ) )
298
306
self ._cached_page += buf
299
- if len (self ._cached_page ) != self .header_length :
307
+ # error: Argument 1 to "len" has incompatible type "Optional[bytes]";
308
+ # expected "Sized"
309
+ if len (self ._cached_page ) != self .header_length : # type: ignore[arg-type]
300
310
raise ValueError ("The SAS7BDAT file appears to be truncated." )
301
311
302
312
self ._page_length = self ._read_int (
@@ -355,7 +365,7 @@ def __next__(self):
355
365
return da
356
366
357
367
# Read a single float of the given width (4 or 8).
358
- def _read_float (self , offset , width ):
368
+ def _read_float (self , offset : int , width : int ):
359
369
if width not in (4 , 8 ):
360
370
self .close ()
361
371
raise ValueError ("invalid float width" )
@@ -388,24 +398,24 @@ def _read_bytes(self, offset: int, length: int):
388
398
raise ValueError ("The cached page is too small." )
389
399
return self ._cached_page [offset : offset + length ]
390
400
391
def _parse_metadata(self) -> None:
    """
    Read pages of ``self._page_length`` bytes from ``self._path_or_buf``
    into ``self._cached_page`` and pass each one to
    ``self._process_page_meta``.

    The loop stops when ``_process_page_meta`` returns a truthy value or
    when a read returns no data.

    Raises
    ------
    ValueError
        If a read returns some data but fewer than ``self._page_length``
        bytes.
    """
    done = False
    while not done:
        # cast() only narrows the static type of read(); no runtime effect.
        self._cached_page = cast(bytes, self._path_or_buf.read(self._page_length))
        if len(self._cached_page) <= 0:
            # Nothing left to read.
            break
        if len(self._cached_page) != self._page_length:
            raise ValueError("Failed to read a meta data page from the SAS file.")
        done = self._process_page_meta()
400
410
401
- def _process_page_meta (self ):
411
+ def _process_page_meta (self ) -> bool :
402
412
self ._read_page_header ()
403
413
pt = [const .page_meta_type , const .page_amd_type ] + const .page_mix_types
404
414
if self ._current_page_type in pt :
405
415
self ._process_page_metadata ()
406
416
is_data_page = self ._current_page_type & const .page_data_type
407
417
is_mix_page = self ._current_page_type in const .page_mix_types
408
- return (
418
+ return bool (
409
419
is_data_page
410
420
or is_mix_page
411
421
or self ._current_page_data_subheader_pointers != []
@@ -422,7 +432,7 @@ def _read_page_header(self):
422
432
tx , const .subheader_count_length
423
433
)
424
434
425
- def _process_page_metadata (self ):
435
+ def _process_page_metadata (self ) -> None :
426
436
bit_offset = self ._page_bit_offset
427
437
428
438
for i in range (self ._current_page_subheaders_count ):
@@ -439,7 +449,8 @@ def _process_page_metadata(self):
439
449
)
440
450
self ._process_subheader (subheader_index , pointer )
441
451
442
- def _get_subheader_index (self , signature , compression , ptype ):
452
+ def _get_subheader_index (self , signature : bytes , compression , ptype ) -> int :
453
+ # TODO: return here could be made an enum
443
454
index = const .subheader_signature_to_index .get (signature )
444
455
if index is None :
445
456
f1 = (compression == const .compressed_subheader_id ) or (compression == 0 )
@@ -451,7 +462,9 @@ def _get_subheader_index(self, signature, compression, ptype):
451
462
raise ValueError ("Unknown subheader signature" )
452
463
return index
453
464
454
- def _process_subheader_pointers (self , offset : int , subheader_pointer_index : int ):
465
+ def _process_subheader_pointers (
466
+ self , offset : int , subheader_pointer_index : int
467
+ ) -> _SubheaderPointer :
455
468
456
469
subheader_pointer_length = self ._subheader_pointer_length
457
470
total_offset = offset + subheader_pointer_length * subheader_pointer_index
@@ -473,11 +486,13 @@ def _process_subheader_pointers(self, offset: int, subheader_pointer_index: int)
473
486
474
487
return x
475
488
476
def _read_subheader_signature(self, offset: int) -> bytes:
    """
    Return the subheader signature: the ``self._int_length`` bytes that
    ``self._read_bytes`` yields starting at ``offset``.
    """
    return self._read_bytes(offset, self._int_length)
479
492
480
- def _process_subheader (self , subheader_index , pointer ):
493
+ def _process_subheader (
494
+ self , subheader_index : int , pointer : _SubheaderPointer
495
+ ) -> None :
481
496
offset = pointer .offset
482
497
length = pointer .length
483
498
@@ -505,7 +520,7 @@ def _process_subheader(self, subheader_index, pointer):
505
520
506
521
processor (offset , length )
507
522
508
- def _process_rowsize_subheader (self , offset , length ) :
523
+ def _process_rowsize_subheader (self , offset : int , length : int ) -> None :
509
524
510
525
int_len = self ._int_length
511
526
lcs_offset = offset
@@ -534,7 +549,7 @@ def _process_rowsize_subheader(self, offset, length):
534
549
self ._lcs = self ._read_int (lcs_offset , 2 )
535
550
self ._lcp = self ._read_int (lcp_offset , 2 )
536
551
537
- def _process_columnsize_subheader (self , offset , length ) :
552
+ def _process_columnsize_subheader (self , offset : int , length : int ) -> None :
538
553
int_len = self ._int_length
539
554
offset += int_len
540
555
self .column_count = self ._read_int (offset , int_len )
@@ -545,10 +560,10 @@ def _process_columnsize_subheader(self, offset, length):
545
560
)
546
561
547
562
# Unknown purpose
548
def _process_subheader_counts(self, offset: int, length: int) -> None:
    """
    Intentional no-op: the purpose of this subheader is unknown, so
    ``offset`` and ``length`` are accepted and ignored.
    """
    pass
550
565
551
- def _process_columntext_subheader (self , offset , length ) :
566
+ def _process_columntext_subheader (self , offset : int , length : int ) -> None :
552
567
553
568
offset += self ._int_length
554
569
text_block_size = self ._read_int (offset , const .text_block_size_length )
@@ -600,7 +615,7 @@ def _process_columntext_subheader(self, offset, length):
600
615
self .encoding or self .default_encoding
601
616
)
602
617
603
- def _process_columnname_subheader (self , offset , length ) :
618
+ def _process_columnname_subheader (self , offset : int , length : int ) -> None :
604
619
int_len = self ._int_length
605
620
offset += int_len
606
621
column_name_pointers_count = (length - 2 * int_len - 12 ) // 8
@@ -632,7 +647,7 @@ def _process_columnname_subheader(self, offset, length):
632
647
name_str = self .column_names_strings [idx ]
633
648
self .column_names .append (name_str [col_offset : col_offset + col_len ])
634
649
635
- def _process_columnattributes_subheader (self , offset , length ) :
650
+ def _process_columnattributes_subheader (self , offset : int , length : int ) -> None :
636
651
int_len = self ._int_length
637
652
column_attributes_vectors_count = (length - 2 * int_len - 12 ) // (int_len + 8 )
638
653
for i in range (column_attributes_vectors_count ):
@@ -658,11 +673,11 @@ def _process_columnattributes_subheader(self, offset, length):
658
673
x = self ._read_int (col_types , const .column_type_length )
659
674
self ._column_types .append (b"d" if x == 1 else b"s" )
660
675
661
def _process_columnlist_subheader(self, offset: int, length: int) -> None:
    """
    Intentional no-op: ``offset`` and ``length`` are accepted and ignored.
    """
    # unknown purpose
    pass
664
679
665
- def _process_format_subheader (self , offset , length ) :
680
+ def _process_format_subheader (self , offset : int , length : int ) -> None :
666
681
int_len = self ._int_length
667
682
text_subheader_format = (
668
683
offset + const .column_format_text_subheader_index_offset + 3 * int_len
@@ -711,7 +726,7 @@ def _process_format_subheader(self, offset, length):
711
726
self .column_formats .append (column_format )
712
727
self .columns .append (col )
713
728
714
- def read (self , nrows = None ):
729
+ def read (self , nrows : int | None = None ) -> DataFrame | None :
715
730
716
731
if (nrows is None ) and (self .chunksize is not None ):
717
732
nrows = self .chunksize
@@ -747,7 +762,7 @@ def read(self, nrows=None):
747
762
748
763
def _read_next_page (self ):
749
764
self ._current_page_data_subheader_pointers = []
750
- self ._cached_page = self ._path_or_buf .read (self ._page_length )
765
+ self ._cached_page = cast ( bytes , self ._path_or_buf .read (self ._page_length ) )
751
766
if len (self ._cached_page ) <= 0 :
752
767
return True
753
768
elif len (self ._cached_page ) != self ._page_length :
@@ -770,12 +785,12 @@ def _read_next_page(self):
770
785
771
786
return False
772
787
773
- def _chunk_to_dataframe (self ):
788
+ def _chunk_to_dataframe (self ) -> DataFrame :
774
789
775
790
n = self ._current_row_in_chunk_index
776
791
m = self ._current_row_in_file_index
777
792
ix = range (m - n , m )
778
- rslt = pd . DataFrame (index = ix )
793
+ rslt = DataFrame (index = ix )
779
794
780
795
js , jb = 0 , 0
781
796
for j in range (self .column_count ):
0 commit comments