Skip to content

Commit 4e5f856

Browse files
committed
SAS7BDAT parser: Improve subheader lookup performance
1 parent e7414aa commit 4e5f856

File tree

2 files changed

+118
-105
lines changed

2 files changed

+118
-105
lines changed

pandas/io/sas/sas.pyx

+61-11
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
# cython: profile=False
22
# cython: boundscheck=False, initializedcheck=False
33
from cython cimport Py_ssize_t
4+
from libc.stdint cimport (
5+
int64_t,
6+
uint8_t,
7+
uint16_t,
8+
uint32_t,
9+
uint64_t,
10+
)
11+
412
import numpy as np
513

614
import pandas.io.sas.sas_constants as const
715

8-
ctypedef signed long long int64_t
9-
ctypedef unsigned char uint8_t
10-
ctypedef unsigned short uint16_t
1116

1217
# rle_decompress decompresses data using a Run Length Encoding
1318
# algorithm. It is partially documented here:
@@ -194,7 +199,7 @@ cdef enum ColumnTypes:
194199
column_type_string = 2
195200

196201

197-
# type the page_data types
202+
# Const aliases
198203
assert len(const.page_meta_types) == 2
199204
cdef:
200205
int page_meta_types_0 = const.page_meta_types[0]
@@ -203,6 +208,53 @@ cdef:
203208
int page_data_type = const.page_data_type
204209
int subheader_pointers_offset = const.subheader_pointers_offset
205210

211+
# Copy of subheader_signature_to_index that allows for much faster lookups.
212+
# Lookups are done in get_subheader_index. The C structures are initialized
213+
# in _init_subheader_signatures().
214+
uint32_t subheader_signatures_32bit[13]
215+
int subheader_indices_32bit[13]
216+
uint64_t subheader_signatures_64bit[17]
217+
int subheader_indices_64bit[17]
218+
int data_subheader_index = const.SASIndex.data_subheader_index
219+
220+
221+
def _init_subheader_signatures():
    """
    Populate the module-level C lookup tables used by get_subheader_index.

    Every entry of const.subheader_signature_to_index is split by key
    length: 4-byte signatures fill the 32-bit tables and 8-byte signatures
    fill the 64-bit tables. The raw bytes of each signature are read through
    an unsigned integer pointer of the matching width, and the associated
    index is stored at the same position in the companion index array.
    """
    short_entries = []
    long_entries = []
    for sig, idx in const.subheader_signature_to_index.items():
        if len(sig) == 4:
            short_entries.append((sig, idx))
        elif len(sig) == 8:
            long_entries.append((sig, idx))

    # The destination tables are fixed-size C arrays (13 and 17 slots), so
    # the number of known signatures of each width must match exactly.
    assert len(short_entries) == 13
    assert len(long_entries) == 17
    assert len(const.subheader_signature_to_index) == 13 + 17

    for pos, (raw, index) in enumerate(short_entries):
        subheader_signatures_32bit[pos] = (<uint32_t *><char *>raw)[0]
        subheader_indices_32bit[pos] = index

    for pos, (raw, index) in enumerate(long_entries):
        subheader_signatures_64bit[pos] = (<uint64_t *><char *>raw)[0]
        subheader_indices_64bit[pos] = index
233+
234+
235+
_init_subheader_signatures()
236+
237+
238+
def get_subheader_index(bytes signature):
    """
    Fast version of 'subheader_signature_to_index.get(signature)'.

    Parameters
    ----------
    signature : bytes
        Raw subheader signature; must be exactly 4 or 8 bytes long.

    Returns
    -------
    int
        The index stored for the matching known signature, or
        data_subheader_index when no known signature matches.

    Raises
    ------
    ValueError
        If the signature is neither 4 nor 8 bytes long.
    """
    cdef:
        uint32_t sig32
        uint64_t sig64
        size_t i
        Py_ssize_t sig_len = len(signature)

    # Validate explicitly instead of with `assert`: assertions can be
    # disabled, and the pointer casts below would then read past the end
    # of a buffer that is shorter than the integer being loaded.
    if sig_len != 4 and sig_len != 8:
        raise ValueError(
            f"Subheader signature must be 4 or 8 bytes long, got {sig_len}"
        )

    if sig_len == 4:
        # Reinterpret the 4 signature bytes as a single unsigned integer so
        # each table comparison is one integer compare.
        sig32 = (<uint32_t *><char *>signature)[0]
        for i in range(len(subheader_signatures_32bit)):
            if subheader_signatures_32bit[i] == sig32:
                return subheader_indices_32bit[i]
    else:
        sig64 = (<uint64_t *><char *>signature)[0]
        for i in range(len(subheader_signatures_64bit)):
            if subheader_signatures_64bit[i] == sig64:
                return subheader_indices_64bit[i]

    # Unknown signature: report it as a data subheader and let the caller
    # decide whether that is acceptable.
    return data_subheader_index
257+
206258

207259
cdef class Parser:
208260

@@ -314,7 +366,7 @@ cdef class Parser:
314366
cdef bint readline(self) except? True:
315367

316368
cdef:
317-
int offset, bit_offset, align_correction
369+
int offset, length, bit_offset, align_correction
318370
int subheader_pointer_length, mn
319371
bint done, flag
320372

@@ -338,12 +390,10 @@ cdef class Parser:
338390
if done:
339391
return True
340392
continue
341-
current_subheader_pointer = (
342-
self.parser._current_page_data_subheader_pointers[
343-
self.current_row_on_page_index])
344-
self.process_byte_array_with_data(
345-
current_subheader_pointer.offset,
346-
current_subheader_pointer.length)
393+
offset, length = self.parser._current_page_data_subheader_pointers[
394+
self.current_row_on_page_index
395+
]
396+
self.process_byte_array_with_data(offset, length)
347397
return False
348398
elif self.current_page_type == page_mix_type:
349399
align_correction = (

pandas/io/sas/sas7bdat.py

+57-94
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,10 @@
4242
)
4343

4444
from pandas.io.common import get_handle
45-
from pandas.io.sas._sas import Parser
45+
from pandas.io.sas._sas import (
46+
Parser,
47+
get_subheader_index,
48+
)
4649
import pandas.io.sas.sas_constants as const
4750
from pandas.io.sas.sasreader import ReaderBase
4851

@@ -87,19 +90,6 @@ def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
8790
return s_series
8891

8992

90-
class _SubheaderPointer:
91-
offset: int
92-
length: int
93-
compression: int
94-
ptype: int
95-
96-
def __init__(self, offset: int, length: int, compression: int, ptype: int) -> None:
97-
self.offset = offset
98-
self.length = length
99-
self.compression = compression
100-
self.ptype = ptype
101-
102-
10393
class _Column:
10494
col_id: int
10595
name: str | bytes
@@ -187,7 +177,7 @@ def __init__(
187177
self.column_formats: list[str | bytes] = []
188178
self.columns: list[_Column] = []
189179

190-
self._current_page_data_subheader_pointers: list[_SubheaderPointer] = []
180+
self._current_page_data_subheader_pointers: list[tuple[int, int]] = []
191181
self._cached_page = None
192182
self._column_data_lengths: list[int] = []
193183
self._column_data_offsets: list[int] = []
@@ -203,6 +193,19 @@ def __init__(
203193

204194
self._path_or_buf = self.handles.handle
205195

196+
# Same order as const.SASIndex
197+
self._subheader_processors = [
198+
self._process_rowsize_subheader,
199+
self._process_columnsize_subheader,
200+
self._process_subheader_counts,
201+
self._process_columntext_subheader,
202+
self._process_columnname_subheader,
203+
self._process_columnattributes_subheader,
204+
self._process_format_subheader,
205+
self._process_columnlist_subheader,
206+
None, # Data
207+
]
208+
206209
try:
207210
self._get_properties()
208211
self._parse_metadata()
@@ -422,89 +425,47 @@ def _process_page_metadata(self) -> None:
422425
bit_offset = self._page_bit_offset
423426

424427
for i in range(self._current_page_subheaders_count):
425-
pointer = self._process_subheader_pointers(
426-
const.subheader_pointers_offset + bit_offset, i
427-
)
428-
if pointer.length == 0:
429-
continue
430-
if pointer.compression == const.truncated_subheader_id:
431-
continue
432-
subheader_signature = self._read_subheader_signature(pointer.offset)
433-
subheader_index = self._get_subheader_index(
434-
subheader_signature, pointer.compression, pointer.ptype
435-
)
436-
self._process_subheader(subheader_index, pointer)
437-
438-
def _get_subheader_index(self, signature: bytes, compression, ptype) -> int:
439-
# TODO: return here could be made an enum
440-
index = const.subheader_signature_to_index.get(signature)
441-
if index is None:
442-
f1 = (compression == const.compressed_subheader_id) or (compression == 0)
443-
f2 = ptype == const.compressed_subheader_type
444-
if (self.compression != b"") and f1 and f2:
445-
index = const.SASIndex.data_subheader_index
446-
else:
447-
self.close()
448-
raise ValueError("Unknown subheader signature")
449-
return index
450-
451-
def _process_subheader_pointers(
452-
self, offset: int, subheader_pointer_index: int
453-
) -> _SubheaderPointer:
454-
455-
subheader_pointer_length = self._subheader_pointer_length
456-
total_offset = offset + subheader_pointer_length * subheader_pointer_index
428+
offset = const.subheader_pointers_offset + bit_offset
429+
total_offset = offset + self._subheader_pointer_length * i
457430

458-
subheader_offset = self._read_int(total_offset, self._int_length)
459-
total_offset += self._int_length
431+
subheader_offset = self._read_int(total_offset, self._int_length)
432+
total_offset += self._int_length
460433

461-
subheader_length = self._read_int(total_offset, self._int_length)
462-
total_offset += self._int_length
434+
subheader_length = self._read_int(total_offset, self._int_length)
435+
total_offset += self._int_length
463436

464-
subheader_compression = self._read_int(total_offset, 1)
465-
total_offset += 1
466-
467-
subheader_type = self._read_int(total_offset, 1)
468-
469-
x = _SubheaderPointer(
470-
subheader_offset, subheader_length, subheader_compression, subheader_type
471-
)
437+
subheader_compression = self._read_int(total_offset, 1)
438+
total_offset += 1
472439

473-
return x
440+
subheader_type = self._read_int(total_offset, 1)
474441

475-
def _read_subheader_signature(self, offset: int) -> bytes:
476-
subheader_signature = self._read_bytes(offset, self._int_length)
477-
return subheader_signature
478-
479-
def _process_subheader(
480-
self, subheader_index: int, pointer: _SubheaderPointer
481-
) -> None:
482-
offset = pointer.offset
483-
length = pointer.length
484-
485-
if subheader_index == const.SASIndex.row_size_index:
486-
processor = self._process_rowsize_subheader
487-
elif subheader_index == const.SASIndex.column_size_index:
488-
processor = self._process_columnsize_subheader
489-
elif subheader_index == const.SASIndex.column_text_index:
490-
processor = self._process_columntext_subheader
491-
elif subheader_index == const.SASIndex.column_name_index:
492-
processor = self._process_columnname_subheader
493-
elif subheader_index == const.SASIndex.column_attributes_index:
494-
processor = self._process_columnattributes_subheader
495-
elif subheader_index == const.SASIndex.format_and_label_index:
496-
processor = self._process_format_subheader
497-
elif subheader_index == const.SASIndex.column_list_index:
498-
processor = self._process_columnlist_subheader
499-
elif subheader_index == const.SASIndex.subheader_counts_index:
500-
processor = self._process_subheader_counts
501-
elif subheader_index == const.SASIndex.data_subheader_index:
502-
self._current_page_data_subheader_pointers.append(pointer)
503-
return
504-
else:
505-
raise ValueError("unknown subheader index")
442+
if (
443+
subheader_length == 0
444+
or subheader_compression == const.truncated_subheader_id
445+
):
446+
continue
506447

507-
processor(offset, length)
448+
subheader_signature = self._read_bytes(subheader_offset, self._int_length)
449+
subheader_index = get_subheader_index(subheader_signature)
450+
subheader_processor = self._subheader_processors[subheader_index]
451+
452+
if subheader_processor is None:
453+
f1 = (
454+
subheader_compression == const.compressed_subheader_id
455+
or subheader_compression == 0
456+
)
457+
f2 = subheader_type == const.compressed_subheader_type
458+
if self.compression and f1 and f2:
459+
self._current_page_data_subheader_pointers.append(
460+
(subheader_offset, subheader_length)
461+
)
462+
else:
463+
self.close()
464+
raise ValueError(
465+
f"Unknown subheader signature {subheader_signature}"
466+
)
467+
else:
468+
subheader_processor(subheader_offset, subheader_length)
508469

509470
def _process_rowsize_subheader(self, offset: int, length: int) -> None:
510471

@@ -519,10 +480,12 @@ def _process_rowsize_subheader(self, offset: int, length: int) -> None:
519480
lcp_offset += 378
520481

521482
self.row_length = self._read_int(
522-
offset + const.row_length_offset_multiplier * int_len, int_len
483+
offset + const.row_length_offset_multiplier * int_len,
484+
int_len,
523485
)
524486
self.row_count = self._read_int(
525-
offset + const.row_count_offset_multiplier * int_len, int_len
487+
offset + const.row_count_offset_multiplier * int_len,
488+
int_len,
526489
)
527490
self.col_count_p1 = self._read_int(
528491
offset + const.col_count_p1_multiplier * int_len, int_len

0 commit comments

Comments
 (0)