Commit 44f3831

jbrockmendel authored and vladu committed
TYP: io.sas (pandas-dev#40524)
1 parent c83d6e7 commit 44f3831

1 file changed: +46 -31 lines
pandas/io/sas/sas7bdat.py (+46 -31)

@@ -13,6 +13,8 @@
 Reference for binary data compression:
 http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
 """
+from __future__ import annotations
+
 from collections import abc
 from datetime import (
     datetime,
@@ -34,7 +36,10 @@
 )

 import pandas as pd
-from pandas import isna
+from pandas import (
+    DataFrame,
+    isna,
+)

 from pandas.io.common import get_handle
 from pandas.io.sas._sas import Parser
@@ -150,6 +155,9 @@ class SAS7BDATReader(ReaderBase, abc.Iterator):
         bytes.
     """

+    _int_length: int
+    _cached_page: bytes | None
+
     def __init__(
         self,
         path_or_buf,
@@ -198,29 +206,29 @@ def __init__(
             self.close()
             raise

-    def column_data_lengths(self):
+    def column_data_lengths(self) -> np.ndarray:
         """Return a numpy int64 array of the column data lengths"""
         return np.asarray(self._column_data_lengths, dtype=np.int64)

-    def column_data_offsets(self):
+    def column_data_offsets(self) -> np.ndarray:
         """Return a numpy int64 array of the column offsets"""
         return np.asarray(self._column_data_offsets, dtype=np.int64)

-    def column_types(self):
+    def column_types(self) -> np.ndarray:
         """
         Returns a numpy character array of the column types:
         s (string) or d (double)
         """
         return np.asarray(self._column_types, dtype=np.dtype("S1"))

-    def close(self):
+    def close(self) -> None:
         self.handles.close()

-    def _get_properties(self):
+    def _get_properties(self) -> None:

         # Check magic number
         self._path_or_buf.seek(0)
-        self._cached_page = self._path_or_buf.read(288)
+        self._cached_page = cast(bytes, self._path_or_buf.read(288))
         if self._cached_page[0 : len(const.magic)] != const.magic:
             raise ValueError("magic number mismatch (not a SAS file?)")

@@ -294,9 +302,11 @@ def _get_properties(self):
             )

         # Read the rest of the header into cached_page.
-        buf = self._path_or_buf.read(self.header_length - 288)
+        buf = cast(bytes, self._path_or_buf.read(self.header_length - 288))
         self._cached_page += buf
-        if len(self._cached_page) != self.header_length:
+        # error: Argument 1 to "len" has incompatible type "Optional[bytes]";
+        # expected "Sized"
+        if len(self._cached_page) != self.header_length:  # type: ignore[arg-type]
             raise ValueError("The SAS7BDAT file appears to be truncated.")

         self._page_length = self._read_int(
@@ -355,7 +365,7 @@ def __next__(self):
         return da

     # Read a single float of the given width (4 or 8).
-    def _read_float(self, offset, width):
+    def _read_float(self, offset: int, width: int):
         if width not in (4, 8):
             self.close()
             raise ValueError("invalid float width")
@@ -388,24 +398,24 @@ def _read_bytes(self, offset: int, length: int):
                 raise ValueError("The cached page is too small.")
             return self._cached_page[offset : offset + length]

-    def _parse_metadata(self):
+    def _parse_metadata(self) -> None:
         done = False
         while not done:
-            self._cached_page = self._path_or_buf.read(self._page_length)
+            self._cached_page = cast(bytes, self._path_or_buf.read(self._page_length))
             if len(self._cached_page) <= 0:
                 break
             if len(self._cached_page) != self._page_length:
                 raise ValueError("Failed to read a meta data page from the SAS file.")
             done = self._process_page_meta()

-    def _process_page_meta(self):
+    def _process_page_meta(self) -> bool:
         self._read_page_header()
         pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
         if self._current_page_type in pt:
             self._process_page_metadata()
         is_data_page = self._current_page_type & const.page_data_type
         is_mix_page = self._current_page_type in const.page_mix_types
-        return (
+        return bool(
             is_data_page
             or is_mix_page
             or self._current_page_data_subheader_pointers != []
@@ -422,7 +432,7 @@ def _read_page_header(self):
             tx, const.subheader_count_length
         )

-    def _process_page_metadata(self):
+    def _process_page_metadata(self) -> None:
         bit_offset = self._page_bit_offset

         for i in range(self._current_page_subheaders_count):
@@ -439,7 +449,8 @@ def _process_page_metadata(self):
             )
             self._process_subheader(subheader_index, pointer)

-    def _get_subheader_index(self, signature, compression, ptype):
+    def _get_subheader_index(self, signature: bytes, compression, ptype) -> int:
+        # TODO: return here could be made an enum
         index = const.subheader_signature_to_index.get(signature)
         if index is None:
             f1 = (compression == const.compressed_subheader_id) or (compression == 0)
@@ -451,7 +462,9 @@ def _get_subheader_index(self, signature, compression, ptype):
                 raise ValueError("Unknown subheader signature")
         return index

-    def _process_subheader_pointers(self, offset: int, subheader_pointer_index: int):
+    def _process_subheader_pointers(
+        self, offset: int, subheader_pointer_index: int
+    ) -> _SubheaderPointer:

         subheader_pointer_length = self._subheader_pointer_length
         total_offset = offset + subheader_pointer_length * subheader_pointer_index
@@ -473,11 +486,13 @@ def _process_subheader_pointers(self, offset: int, subheader_pointer_index: int)

         return x

-    def _read_subheader_signature(self, offset):
+    def _read_subheader_signature(self, offset: int) -> bytes:
         subheader_signature = self._read_bytes(offset, self._int_length)
         return subheader_signature

-    def _process_subheader(self, subheader_index, pointer):
+    def _process_subheader(
+        self, subheader_index: int, pointer: _SubheaderPointer
+    ) -> None:
         offset = pointer.offset
         length = pointer.length

@@ -505,7 +520,7 @@ def _process_subheader(self, subheader_index, pointer):

         processor(offset, length)

-    def _process_rowsize_subheader(self, offset, length):
+    def _process_rowsize_subheader(self, offset: int, length: int) -> None:

         int_len = self._int_length
         lcs_offset = offset
@@ -534,7 +549,7 @@ def _process_rowsize_subheader(self, offset, length):
         self._lcs = self._read_int(lcs_offset, 2)
         self._lcp = self._read_int(lcp_offset, 2)

-    def _process_columnsize_subheader(self, offset, length):
+    def _process_columnsize_subheader(self, offset: int, length: int) -> None:
         int_len = self._int_length
         offset += int_len
         self.column_count = self._read_int(offset, int_len)
@@ -545,10 +560,10 @@ def _process_columnsize_subheader(self, offset, length):
             )

     # Unknown purpose
-    def _process_subheader_counts(self, offset, length):
+    def _process_subheader_counts(self, offset: int, length: int) -> None:
         pass

-    def _process_columntext_subheader(self, offset, length):
+    def _process_columntext_subheader(self, offset: int, length: int) -> None:

         offset += self._int_length
         text_block_size = self._read_int(offset, const.text_block_size_length)
@@ -600,7 +615,7 @@ def _process_columntext_subheader(self, offset, length):
                         self.encoding or self.default_encoding
                     )

-    def _process_columnname_subheader(self, offset, length):
+    def _process_columnname_subheader(self, offset: int, length: int) -> None:
         int_len = self._int_length
         offset += int_len
         column_name_pointers_count = (length - 2 * int_len - 12) // 8
@@ -632,7 +647,7 @@ def _process_columnname_subheader(self, offset, length):
             name_str = self.column_names_strings[idx]
             self.column_names.append(name_str[col_offset : col_offset + col_len])

-    def _process_columnattributes_subheader(self, offset, length):
+    def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
         int_len = self._int_length
         column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
         for i in range(column_attributes_vectors_count):
@@ -658,11 +673,11 @@ def _process_columnattributes_subheader(self, offset, length):
             x = self._read_int(col_types, const.column_type_length)
             self._column_types.append(b"d" if x == 1 else b"s")

-    def _process_columnlist_subheader(self, offset, length):
+    def _process_columnlist_subheader(self, offset: int, length: int) -> None:
         # unknown purpose
         pass

-    def _process_format_subheader(self, offset, length):
+    def _process_format_subheader(self, offset: int, length: int) -> None:
         int_len = self._int_length
         text_subheader_format = (
             offset + const.column_format_text_subheader_index_offset + 3 * int_len
@@ -711,7 +726,7 @@ def _process_format_subheader(self, offset, length):
         self.column_formats.append(column_format)
         self.columns.append(col)

-    def read(self, nrows=None):
+    def read(self, nrows: int | None = None) -> DataFrame | None:

         if (nrows is None) and (self.chunksize is not None):
             nrows = self.chunksize
@@ -747,7 +762,7 @@ def read(self, nrows=None):

     def _read_next_page(self):
         self._current_page_data_subheader_pointers = []
-        self._cached_page = self._path_or_buf.read(self._page_length)
+        self._cached_page = cast(bytes, self._path_or_buf.read(self._page_length))
         if len(self._cached_page) <= 0:
             return True
         elif len(self._cached_page) != self._page_length:
@@ -770,12 +785,12 @@ def _read_next_page(self):

         return False

-    def _chunk_to_dataframe(self):
+    def _chunk_to_dataframe(self) -> DataFrame:

         n = self._current_row_in_chunk_index
         m = self._current_row_in_file_index
         ix = range(m - n, m)
-        rslt = pd.DataFrame(index=ix)
+        rslt = DataFrame(index=ix)

         js, jb = 0, 0
         for j in range(self.column_count):