Skip to content

Commit 39fc318

Browse files
authored
SAS7BDAT parser: Improve subheader lookup performance (#47656)
* SAS7BDAT parser: Improve subheader lookup performance
* Fix ssize_t type
* Update _sas.pyi
* Lint
1 parent 9f94480 commit 39fc318

File tree

3 files changed

+117
-105
lines changed

3 files changed

+117
-105
lines changed

pandas/io/sas/_sas.pyi

+2
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ from pandas.io.sas.sas7bdat import SAS7BDATReader
33
class Parser:
44
def __init__(self, parser: SAS7BDATReader) -> None: ...
55
def read(self, nrows: int) -> None: ...
6+
7+
def get_subheader_index(signature: bytes) -> int: ...

pandas/io/sas/sas.pyx

+58-11
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ from libc.stdint cimport (
66
int64_t,
77
uint8_t,
88
uint16_t,
9+
uint32_t,
10+
uint64_t,
911
)
1012
from libc.stdlib cimport (
1113
calloc,
@@ -17,6 +19,9 @@ import numpy as np
1719
import pandas.io.sas.sas_constants as const
1820

1921

22+
cdef object np_nan = np.nan
23+
24+
2025
cdef struct Buffer:
2126
# Convenience wrapper for uint8_t data to allow fast and safe reads and writes.
2227
# We use this as a replacement for np.array(..., dtype=np.uint8) because it's
@@ -53,9 +58,6 @@ cdef inline buf_free(Buffer buf):
5358
if buf.data != NULL:
5459
free(buf.data)
5560

56-
57-
cdef object np_nan = np.nan
58-
5961
# rle_decompress decompresses data using a Run Length Encoding
6062
# algorithm. It is partially documented here:
6163
#
@@ -231,7 +233,7 @@ cdef enum ColumnTypes:
231233
column_type_string = 2
232234

233235

234-
# type the page_data types
236+
# Const aliases
235237
assert len(const.page_meta_types) == 2
236238
cdef:
237239
int page_meta_types_0 = const.page_meta_types[0]
@@ -240,6 +242,53 @@ cdef:
240242
int page_data_type = const.page_data_type
241243
int subheader_pointers_offset = const.subheader_pointers_offset
242244

245+
# Copy of subheader_signature_to_index that allows for much faster lookups.
246+
# Lookups are done in get_subheader_index. The C structures are initialized
247+
# in _init_subheader_signatures().
248+
uint32_t subheader_signatures_32bit[13]
249+
int subheader_indices_32bit[13]
250+
uint64_t subheader_signatures_64bit[17]
251+
int subheader_indices_64bit[17]
252+
int data_subheader_index = const.SASIndex.data_subheader_index
253+
254+
255+
def _init_subheader_signatures():
    """
    Fill the C lookup tables that back get_subheader_index.

    Every entry of const.subheader_signature_to_index is sorted by the
    length of its signature (4 or 8 bytes). The raw signature bytes are
    reinterpreted as a single unsigned integer and stored, next to the
    matching index, in the module-level 32-bit or 64-bit arrays.
    """
    signature_map = const.subheader_signature_to_index

    # Split the known signatures by width in one pass over the mapping.
    short_entries = []
    long_entries = []
    for raw_sig, sig_index in signature_map.items():
        if len(raw_sig) == 4:
            short_entries.append((raw_sig, sig_index))
        elif len(raw_sig) == 8:
            long_entries.append((raw_sig, sig_index))

    # The C arrays are declared with exactly 13 and 17 slots, and no
    # signature of any other width may exist.
    assert len(short_entries) == 13
    assert len(long_entries) == 17
    assert len(signature_map) == 13 + 17

    for slot, (raw_sig, sig_index) in enumerate(short_entries):
        subheader_signatures_32bit[slot] = (<uint32_t *><char *>raw_sig)[0]
        subheader_indices_32bit[slot] = sig_index

    for slot, (raw_sig, sig_index) in enumerate(long_entries):
        subheader_signatures_64bit[slot] = (<uint64_t *><char *>raw_sig)[0]
        subheader_indices_64bit[slot] = sig_index
267+
268+
269+
_init_subheader_signatures()
270+
271+
272+
def get_subheader_index(bytes signature):
    """
    Return the subheader index registered for a raw subheader signature.

    Fast replacement for a lookup in const.subheader_signature_to_index:
    the signature bytes are reinterpreted as one unsigned integer and
    compared against the C tables filled by _init_subheader_signatures().

    Parameters
    ----------
    signature : bytes
        Raw subheader signature, exactly 4 or 8 bytes long.

    Returns
    -------
    int
        Index stored for the matching known signature. Unlike ``dict.get``,
        an unknown signature does not yield ``None``: data_subheader_index
        is returned instead, so callers have to validate that fallback
        themselves.

    Raises
    ------
    ValueError
        If ``signature`` is neither 4 nor 8 bytes long.
    """
    cdef:
        uint32_t sig32
        uint64_t sig64
        Py_ssize_t i
        Py_ssize_t sig_len = len(signature)

    if sig_len == 4:
        sig32 = (<uint32_t *><char *>signature)[0]
        for i in range(len(subheader_signatures_32bit)):
            if subheader_signatures_32bit[i] == sig32:
                return subheader_indices_32bit[i]
    elif sig_len == 8:
        sig64 = (<uint64_t *><char *>signature)[0]
        for i in range(len(subheader_signatures_64bit)):
            if subheader_signatures_64bit[i] == sig64:
                return subheader_indices_64bit[i]
    else:
        # Checked explicitly rather than with ``assert``: assertions can be
        # disabled, and the pointer reads above would then run past the end
        # of a shorter buffer.
        raise ValueError(
            f"Subheader signature must be 4 or 8 bytes long, got {sig_len}"
        )

    return data_subheader_index
291+
243292

244293
cdef class Parser:
245294

@@ -355,7 +404,7 @@ cdef class Parser:
355404
cdef bint readline(self) except? True:
356405

357406
cdef:
358-
int offset, bit_offset, align_correction
407+
int offset, length, bit_offset, align_correction
359408
int subheader_pointer_length, mn
360409
bint done, flag
361410

@@ -379,12 +428,10 @@ cdef class Parser:
379428
if done:
380429
return True
381430
continue
382-
current_subheader_pointer = (
383-
self.parser._current_page_data_subheader_pointers[
384-
self.current_row_on_page_index])
385-
self.process_byte_array_with_data(
386-
current_subheader_pointer.offset,
387-
current_subheader_pointer.length)
431+
offset, length = self.parser._current_page_data_subheader_pointers[
432+
self.current_row_on_page_index
433+
]
434+
self.process_byte_array_with_data(offset, length)
388435
return False
389436
elif self.current_page_type == page_mix_type:
390437
align_correction = (

pandas/io/sas/sas7bdat.py

+57-94
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,10 @@
4242
)
4343

4444
from pandas.io.common import get_handle
45-
from pandas.io.sas._sas import Parser
45+
from pandas.io.sas._sas import (
46+
Parser,
47+
get_subheader_index,
48+
)
4649
import pandas.io.sas.sas_constants as const
4750
from pandas.io.sas.sasreader import ReaderBase
4851

@@ -87,19 +90,6 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
8790
return s_series
8891

8992

90-
class _SubheaderPointer:
91-
offset: int
92-
length: int
93-
compression: int
94-
ptype: int
95-
96-
def __init__(self, offset: int, length: int, compression: int, ptype: int) -> None:
97-
self.offset = offset
98-
self.length = length
99-
self.compression = compression
100-
self.ptype = ptype
101-
102-
10393
class _Column:
10494
col_id: int
10595
name: str | bytes
@@ -189,7 +179,7 @@ def __init__(
189179
self.column_formats: list[str | bytes] = []
190180
self.columns: list[_Column] = []
191181

192-
self._current_page_data_subheader_pointers: list[_SubheaderPointer] = []
182+
self._current_page_data_subheader_pointers: list[tuple[int, int]] = []
193183
self._cached_page = None
194184
self._column_data_lengths: list[int] = []
195185
self._column_data_offsets: list[int] = []
@@ -205,6 +195,19 @@ def __init__(
205195

206196
self._path_or_buf = self.handles.handle
207197

198+
# Same order as const.SASIndex
199+
self._subheader_processors = [
200+
self._process_rowsize_subheader,
201+
self._process_columnsize_subheader,
202+
self._process_subheader_counts,
203+
self._process_columntext_subheader,
204+
self._process_columnname_subheader,
205+
self._process_columnattributes_subheader,
206+
self._process_format_subheader,
207+
self._process_columnlist_subheader,
208+
None, # Data
209+
]
210+
208211
try:
209212
self._get_properties()
210213
self._parse_metadata()
@@ -426,89 +429,47 @@ def _process_page_metadata(self) -> None:
426429
bit_offset = self._page_bit_offset
427430

428431
for i in range(self._current_page_subheaders_count):
429-
pointer = self._process_subheader_pointers(
430-
const.subheader_pointers_offset + bit_offset, i
431-
)
432-
if pointer.length == 0:
433-
continue
434-
if pointer.compression == const.truncated_subheader_id:
435-
continue
436-
subheader_signature = self._read_subheader_signature(pointer.offset)
437-
subheader_index = self._get_subheader_index(
438-
subheader_signature, pointer.compression, pointer.ptype
439-
)
440-
self._process_subheader(subheader_index, pointer)
441-
442-
def _get_subheader_index(self, signature: bytes, compression, ptype) -> int:
443-
# TODO: return here could be made an enum
444-
index = const.subheader_signature_to_index.get(signature)
445-
if index is None:
446-
f1 = (compression == const.compressed_subheader_id) or (compression == 0)
447-
f2 = ptype == const.compressed_subheader_type
448-
if (self.compression != b"") and f1 and f2:
449-
index = const.SASIndex.data_subheader_index
450-
else:
451-
self.close()
452-
raise ValueError("Unknown subheader signature")
453-
return index
454-
455-
def _process_subheader_pointers(
456-
self, offset: int, subheader_pointer_index: int
457-
) -> _SubheaderPointer:
458-
459-
subheader_pointer_length = self._subheader_pointer_length
460-
total_offset = offset + subheader_pointer_length * subheader_pointer_index
432+
offset = const.subheader_pointers_offset + bit_offset
433+
total_offset = offset + self._subheader_pointer_length * i
461434

462-
subheader_offset = self._read_int(total_offset, self._int_length)
463-
total_offset += self._int_length
435+
subheader_offset = self._read_int(total_offset, self._int_length)
436+
total_offset += self._int_length
464437

465-
subheader_length = self._read_int(total_offset, self._int_length)
466-
total_offset += self._int_length
438+
subheader_length = self._read_int(total_offset, self._int_length)
439+
total_offset += self._int_length
467440

468-
subheader_compression = self._read_int(total_offset, 1)
469-
total_offset += 1
470-
471-
subheader_type = self._read_int(total_offset, 1)
472-
473-
x = _SubheaderPointer(
474-
subheader_offset, subheader_length, subheader_compression, subheader_type
475-
)
441+
subheader_compression = self._read_int(total_offset, 1)
442+
total_offset += 1
476443

477-
return x
444+
subheader_type = self._read_int(total_offset, 1)
478445

479-
def _read_subheader_signature(self, offset: int) -> bytes:
480-
subheader_signature = self._read_bytes(offset, self._int_length)
481-
return subheader_signature
482-
483-
def _process_subheader(
484-
self, subheader_index: int, pointer: _SubheaderPointer
485-
) -> None:
486-
offset = pointer.offset
487-
length = pointer.length
488-
489-
if subheader_index == const.SASIndex.row_size_index:
490-
processor = self._process_rowsize_subheader
491-
elif subheader_index == const.SASIndex.column_size_index:
492-
processor = self._process_columnsize_subheader
493-
elif subheader_index == const.SASIndex.column_text_index:
494-
processor = self._process_columntext_subheader
495-
elif subheader_index == const.SASIndex.column_name_index:
496-
processor = self._process_columnname_subheader
497-
elif subheader_index == const.SASIndex.column_attributes_index:
498-
processor = self._process_columnattributes_subheader
499-
elif subheader_index == const.SASIndex.format_and_label_index:
500-
processor = self._process_format_subheader
501-
elif subheader_index == const.SASIndex.column_list_index:
502-
processor = self._process_columnlist_subheader
503-
elif subheader_index == const.SASIndex.subheader_counts_index:
504-
processor = self._process_subheader_counts
505-
elif subheader_index == const.SASIndex.data_subheader_index:
506-
self._current_page_data_subheader_pointers.append(pointer)
507-
return
508-
else:
509-
raise ValueError("unknown subheader index")
446+
if (
447+
subheader_length == 0
448+
or subheader_compression == const.truncated_subheader_id
449+
):
450+
continue
510451

511-
processor(offset, length)
452+
subheader_signature = self._read_bytes(subheader_offset, self._int_length)
453+
subheader_index = get_subheader_index(subheader_signature)
454+
subheader_processor = self._subheader_processors[subheader_index]
455+
456+
if subheader_processor is None:
457+
f1 = (
458+
subheader_compression == const.compressed_subheader_id
459+
or subheader_compression == 0
460+
)
461+
f2 = subheader_type == const.compressed_subheader_type
462+
if self.compression and f1 and f2:
463+
self._current_page_data_subheader_pointers.append(
464+
(subheader_offset, subheader_length)
465+
)
466+
else:
467+
self.close()
468+
raise ValueError(
469+
f"Unknown subheader signature {subheader_signature}"
470+
)
471+
else:
472+
subheader_processor(subheader_offset, subheader_length)
512473

513474
def _process_rowsize_subheader(self, offset: int, length: int) -> None:
514475

@@ -523,10 +484,12 @@ def _process_rowsize_subheader(self, offset: int, length: int) -> None:
523484
lcp_offset += 378
524485

525486
self.row_length = self._read_int(
526-
offset + const.row_length_offset_multiplier * int_len, int_len
487+
offset + const.row_length_offset_multiplier * int_len,
488+
int_len,
527489
)
528490
self.row_count = self._read_int(
529-
offset + const.row_count_offset_multiplier * int_len, int_len
491+
offset + const.row_count_offset_multiplier * int_len,
492+
int_len,
530493
)
531494
self.col_count_p1 = self._read_int(
532495
offset + const.col_count_p1_multiplier * int_len, int_len

0 commit comments

Comments
 (0)