Commit c855be8

SAS7BDAT parser: Fast byteswap (#47403)

* Fast byteswap
* Add types
* Review feedback
* Slightly faster variant (1 less bytes obj construction)
* Make MyPy happy?
* Update sas7bdat.py
* Use intrinsics
* Lint
* Add tests + move byteswap to module
* Add float tests + refactoring
* Undo unrelated changes
* Undo unrelated changes
* Lint
* Update v1.6.0.rst
* read_int -> read_uint
* Lint
* Update sas7bdat.py
1 parent e25aa9d commit c855be8

File tree

6 files changed: +229 -56 lines changed


doc/source/whatsnew/v1.6.0.rst

+1 -1

@@ -149,7 +149,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
 - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
 - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).
-- Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`)
+- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
 - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)

 .. ---------------------------------------------------------------------------

pandas/io/sas/_byteswap.pyi

+5

@@ -0,0 +1,5 @@
+def read_float_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
+def read_double_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
+def read_uint16_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
+def read_uint32_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
+def read_uint64_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
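
Since `byteswap.pyx` compiles to a C extension, this stub is what static type checkers see when checking callers of the helpers. As a rough illustration (the buffer and values below are made up, not part of the change), a typed call looks like this:

```python
# Hypothetical usage, for illustration only; mypy resolves this call against
# pandas/io/sas/_byteswap.pyi rather than the compiled module.
from pandas.io.sas._byteswap import read_uint16_with_byteswap

page: bytes = b"\x01\x02" + b"\x00" * 14  # stand-in for a cached SAS page
value: int = read_uint16_with_byteswap(page, 0, False)  # native byte order, no swap
```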

pandas/io/sas/byteswap.pyx

+92

@@ -0,0 +1,92 @@
+"""
+The following are faster versions of struct.unpack that avoid the overhead of Python function calls.
+
+In the SAS7BDAT parser, they may be called up to (n_rows * n_cols) times.
+"""
+from cython cimport Py_ssize_t
+from libc.stdint cimport (
+    uint16_t,
+    uint32_t,
+    uint64_t,
+)
+
+
+def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
+    assert offset + 4 < len(data)
+    cdef:
+        const char *data_ptr = data
+        float res = (<float*>(data_ptr + offset))[0]
+    if byteswap:
+        res = _byteswap_float(res)
+    return res
+
+
+def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
+    assert offset + 8 < len(data)
+    cdef:
+        const char *data_ptr = data
+        double res = (<double*>(data_ptr + offset))[0]
+    if byteswap:
+        res = _byteswap_double(res)
+    return res
+
+
+def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
+    assert offset + 2 < len(data)
+    cdef:
+        const char *data_ptr = data
+        uint16_t res = (<uint16_t *>(data_ptr + offset))[0]
+    if byteswap:
+        res = _byteswap2(res)
+    return res
+
+
+def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
+    assert offset + 4 < len(data)
+    cdef:
+        const char *data_ptr = data
+        uint32_t res = (<uint32_t *>(data_ptr + offset))[0]
+    if byteswap:
+        res = _byteswap4(res)
+    return res
+
+
+def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
+    assert offset + 8 < len(data)
+    cdef:
+        const char *data_ptr = data
+        uint64_t res = (<uint64_t *>(data_ptr + offset))[0]
+    if byteswap:
+        res = _byteswap8(res)
+    return res
+
+
+# Byteswapping
+
+cdef extern from *:
+    """
+    #ifdef _MSC_VER
+        #define _byteswap2 _byteswap_ushort
+        #define _byteswap4 _byteswap_ulong
+        #define _byteswap8 _byteswap_uint64
+    #else
+        #define _byteswap2 __builtin_bswap16
+        #define _byteswap4 __builtin_bswap32
+        #define _byteswap8 __builtin_bswap64
+    #endif
+    """
+    uint16_t _byteswap2(uint16_t)
+    uint32_t _byteswap4(uint32_t)
+    uint64_t _byteswap8(uint64_t)
+
+
+cdef inline float _byteswap_float(float num):
+    cdef uint32_t *intptr = <uint32_t *>&num
+    intptr[0] = _byteswap4(intptr[0])
+    return num
+
+
+cdef inline double _byteswap_double(double num):
+    cdef uint64_t *intptr = <uint64_t *>&num
+    intptr[0] = _byteswap8(intptr[0])
+    return num
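
For readers not familiar with Cython, here is a rough pure-Python model of what one of these helpers computes: reinterpret the bytes at `offset` in the host's native order, then reverse them if `byteswap` is set. The `_py` suffix and the sample buffer are made up for illustration; the real helpers do the same work via a pointer cast and a single compiler intrinsic, with no Python-level overhead.

```python
import struct


def read_uint32_with_byteswap_py(data: bytes, offset: int, byteswap: bool) -> int:
    # Native-order reinterpretation of 4 bytes, mirroring the pointer cast above.
    (value,) = struct.unpack_from("=I", data, offset)
    if byteswap:
        # Reverse the 4 bytes, like _byteswap4 / __builtin_bswap32.
        value = int.from_bytes(value.to_bytes(4, "little"), "big")
    return value


# Example: the value 0x01020304 stored little-endian.
buf = bytes([0x04, 0x03, 0x02, 0x01]) + b"\x00" * 4
# A little-endian host reads it directly (byteswap=False); a big-endian host
# would pass byteswap=True to get the same 0x01020304.
print(hex(read_uint32_with_byteswap_py(buf, 0, byteswap=False)))
```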

pandas/io/sas/sas7bdat.py

+75 -55

@@ -20,7 +20,7 @@
     datetime,
     timedelta,
 )
-import struct
+import sys
 from typing import cast

 import numpy as np
@@ -42,6 +42,13 @@
 )

 from pandas.io.common import get_handle
+from pandas.io.sas._byteswap import (
+    read_double_with_byteswap,
+    read_float_with_byteswap,
+    read_uint16_with_byteswap,
+    read_uint32_with_byteswap,
+    read_uint64_with_byteswap,
+)
 from pandas.io.sas._sas import (
     Parser,
     get_subheader_index,
@@ -263,8 +270,10 @@ def _get_properties(self) -> None:
         buf = self._read_bytes(const.endianness_offset, const.endianness_length)
         if buf == b"\x01":
             self.byte_order = "<"
+            self.need_byteswap = sys.byteorder == "big"
         else:
             self.byte_order = ">"
+            self.need_byteswap = sys.byteorder == "little"

         # Get encoding information
         buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
@@ -286,7 +295,7 @@ def _get_properties(self) -> None:
         )
         self.date_modified = epoch + pd.to_timedelta(x, unit="s")

-        self.header_length = self._read_int(
+        self.header_length = self._read_uint(
             const.header_size_offset + align1, const.header_size_length
         )

@@ -298,7 +307,7 @@ def _get_properties(self) -> None:
         if len(self._cached_page) != self.header_length:  # type: ignore[arg-type]
             raise ValueError("The SAS7BDAT file appears to be truncated.")

-        self._page_length = self._read_int(
+        self._page_length = self._read_uint(
             const.page_size_offset + align1, const.page_size_length
         )

@@ -311,37 +320,46 @@ def __next__(self) -> DataFrame:

     # Read a single float of the given width (4 or 8).
     def _read_float(self, offset: int, width: int):
-        if width not in (4, 8):
+        assert self._cached_page is not None
+        if width == 4:
+            return read_float_with_byteswap(
+                self._cached_page, offset, self.need_byteswap
+            )
+        elif width == 8:
+            return read_double_with_byteswap(
+                self._cached_page, offset, self.need_byteswap
+            )
+        else:
             self.close()
             raise ValueError("invalid float width")
-        buf = self._read_bytes(offset, width)
-        fd = "f" if width == 4 else "d"
-        return struct.unpack(self.byte_order + fd, buf)[0]

-    # Read a single signed integer of the given width (1, 2, 4 or 8).
-    def _read_int(self, offset: int, width: int) -> int:
-        if width not in (1, 2, 4, 8):
+    # Read a single unsigned integer of the given width (1, 2, 4 or 8).
+    def _read_uint(self, offset: int, width: int) -> int:
+        assert self._cached_page is not None
+        if width == 1:
+            return self._read_bytes(offset, 1)[0]
+        elif width == 2:
+            return read_uint16_with_byteswap(
+                self._cached_page, offset, self.need_byteswap
+            )
+        elif width == 4:
+            return read_uint32_with_byteswap(
+                self._cached_page, offset, self.need_byteswap
+            )
+        elif width == 8:
+            return read_uint64_with_byteswap(
+                self._cached_page, offset, self.need_byteswap
+            )
+        else:
             self.close()
             raise ValueError("invalid int width")
-        buf = self._read_bytes(offset, width)
-        it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
-        iv = struct.unpack(self.byte_order + it, buf)[0]
-        return iv

     def _read_bytes(self, offset: int, length: int):
-        if self._cached_page is None:
-            self._path_or_buf.seek(offset)
-            buf = self._path_or_buf.read(length)
-            if len(buf) < length:
-                self.close()
-                msg = f"Unable to read {length:d} bytes from file position {offset:d}."
-                raise ValueError(msg)
-            return buf
-        else:
-            if offset + length > len(self._cached_page):
-                self.close()
-                raise ValueError("The cached page is too small.")
-            return self._cached_page[offset : offset + length]
+        assert self._cached_page is not None
+        if offset + length > len(self._cached_page):
+            self.close()
+            raise ValueError("The cached page is too small.")
+        return self._cached_page[offset : offset + length]

     def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
         return self._convert_header_text(
@@ -375,12 +393,12 @@ def _read_page_header(self) -> None:
         bit_offset = self._page_bit_offset
         tx = const.page_type_offset + bit_offset
         self._current_page_type = (
-            self._read_int(tx, const.page_type_length) & const.page_type_mask2
+            self._read_uint(tx, const.page_type_length) & const.page_type_mask2
         )
         tx = const.block_count_offset + bit_offset
-        self._current_page_block_count = self._read_int(tx, const.block_count_length)
+        self._current_page_block_count = self._read_uint(tx, const.block_count_length)
         tx = const.subheader_count_offset + bit_offset
-        self._current_page_subheaders_count = self._read_int(
+        self._current_page_subheaders_count = self._read_uint(
             tx, const.subheader_count_length
         )

@@ -391,16 +409,16 @@ def _process_page_metadata(self) -> None:
             offset = const.subheader_pointers_offset + bit_offset
             total_offset = offset + self._subheader_pointer_length * i

-            subheader_offset = self._read_int(total_offset, self._int_length)
+            subheader_offset = self._read_uint(total_offset, self._int_length)
             total_offset += self._int_length

-            subheader_length = self._read_int(total_offset, self._int_length)
+            subheader_length = self._read_uint(total_offset, self._int_length)
             total_offset += self._int_length

-            subheader_compression = self._read_int(total_offset, 1)
+            subheader_compression = self._read_uint(total_offset, 1)
             total_offset += 1

-            subheader_type = self._read_int(total_offset, 1)
+            subheader_type = self._read_uint(total_offset, 1)

             if (
                 subheader_length == 0
@@ -442,29 +460,29 @@ def _process_rowsize_subheader(self, offset: int, length: int) -> None:
             lcs_offset += 354
             lcp_offset += 378

-        self.row_length = self._read_int(
+        self.row_length = self._read_uint(
             offset + const.row_length_offset_multiplier * int_len,
             int_len,
         )
-        self.row_count = self._read_int(
+        self.row_count = self._read_uint(
             offset + const.row_count_offset_multiplier * int_len,
             int_len,
         )
-        self.col_count_p1 = self._read_int(
+        self.col_count_p1 = self._read_uint(
             offset + const.col_count_p1_multiplier * int_len, int_len
         )
-        self.col_count_p2 = self._read_int(
+        self.col_count_p2 = self._read_uint(
             offset + const.col_count_p2_multiplier * int_len, int_len
         )
         mx = const.row_count_on_mix_page_offset_multiplier * int_len
-        self._mix_page_row_count = self._read_int(offset + mx, int_len)
-        self._lcs = self._read_int(lcs_offset, 2)
-        self._lcp = self._read_int(lcp_offset, 2)
+        self._mix_page_row_count = self._read_uint(offset + mx, int_len)
+        self._lcs = self._read_uint(lcs_offset, 2)
+        self._lcp = self._read_uint(lcp_offset, 2)

     def _process_columnsize_subheader(self, offset: int, length: int) -> None:
         int_len = self._int_length
         offset += int_len
-        self.column_count = self._read_int(offset, int_len)
+        self.column_count = self._read_uint(offset, int_len)
         if self.col_count_p1 + self.col_count_p2 != self.column_count:
             print(
                 f"Warning: column count mismatch ({self.col_count_p1} + "
@@ -478,7 +496,7 @@ def _process_subheader_counts(self, offset: int, length: int) -> None:
     def _process_columntext_subheader(self, offset: int, length: int) -> None:

         offset += self._int_length
-        text_block_size = self._read_int(offset, const.text_block_size_length)
+        text_block_size = self._read_uint(offset, const.text_block_size_length)

         buf = self._read_bytes(offset, text_block_size)
         cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
@@ -542,13 +560,13 @@ def _process_columnname_subheader(self, offset: int, length: int) -> None:
                 + const.column_name_length_offset
             )

-            idx = self._read_int(
+            idx = self._read_uint(
                 text_subheader, const.column_name_text_subheader_length
             )
-            col_offset = self._read_int(
+            col_offset = self._read_uint(
                 col_name_offset, const.column_name_offset_length
             )
-            col_len = self._read_int(col_name_length, const.column_name_length_length)
+            col_len = self._read_uint(col_name_length, const.column_name_length_length)

            name_raw = self.column_names_raw[idx]
            cname = name_raw[col_offset : col_offset + col_len]
@@ -571,13 +589,13 @@ def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
                offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
            )

-            x = self._read_int(col_data_offset, int_len)
+            x = self._read_uint(col_data_offset, int_len)
             self._column_data_offsets.append(x)

-            x = self._read_int(col_data_len, const.column_data_length_length)
+            x = self._read_uint(col_data_len, const.column_data_length_length)
             self._column_data_lengths.append(x)

-            x = self._read_int(col_types, const.column_type_length)
+            x = self._read_uint(col_types, const.column_type_length)
             self._column_types.append(b"d" if x == 1 else b"s")

     def _process_columnlist_subheader(self, offset: int, length: int) -> None:
@@ -597,23 +615,25 @@ def _process_format_subheader(self, offset: int, length: int) -> None:
         col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
         col_label_len = offset + const.column_label_length_offset + 3 * int_len

-        x = self._read_int(
+        x = self._read_uint(
             text_subheader_format, const.column_format_text_subheader_index_length
         )
         format_idx = min(x, len(self.column_names_raw) - 1)

-        format_start = self._read_int(
+        format_start = self._read_uint(
             col_format_offset, const.column_format_offset_length
         )
-        format_len = self._read_int(col_format_len, const.column_format_length_length)
+        format_len = self._read_uint(col_format_len, const.column_format_length_length)

-        label_idx = self._read_int(
+        label_idx = self._read_uint(
             text_subheader_label, const.column_label_text_subheader_index_length
         )
         label_idx = min(label_idx, len(self.column_names_raw) - 1)

-        label_start = self._read_int(col_label_offset, const.column_label_offset_length)
-        label_len = self._read_int(col_label_len, const.column_label_length_length)
+        label_start = self._read_uint(
+            col_label_offset, const.column_label_offset_length
+        )
+        label_len = self._read_uint(col_label_len, const.column_label_length_length)

         label_names = self.column_names_raw[label_idx]
         column_label = self._convert_header_text(
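
Taken together, the parser-side pattern is: decide once at open time whether the file's byte order differs from the host's (`need_byteswap`), then dispatch on field width to the matching helper. A condensed sketch of that idea follows, with a made-up class name and without the SAS-specific offset constants; it assumes only the import path added in this commit.

```python
import sys

from pandas.io.sas._byteswap import (
    read_double_with_byteswap,
    read_float_with_byteswap,
    read_uint16_with_byteswap,
    read_uint32_with_byteswap,
    read_uint64_with_byteswap,
)


class _PageReader:
    """Hypothetical condensed reader mirroring the dispatch in sas7bdat.py."""

    def __init__(self, page: bytes, file_is_little_endian: bool) -> None:
        self._cached_page = page
        # Swap only when the file's byte order differs from the host's.
        if file_is_little_endian:
            self.need_byteswap = sys.byteorder == "big"
        else:
            self.need_byteswap = sys.byteorder == "little"

    def read_uint(self, offset: int, width: int) -> int:
        if width == 1:
            return self._cached_page[offset]
        elif width == 2:
            return read_uint16_with_byteswap(self._cached_page, offset, self.need_byteswap)
        elif width == 4:
            return read_uint32_with_byteswap(self._cached_page, offset, self.need_byteswap)
        elif width == 8:
            return read_uint64_with_byteswap(self._cached_page, offset, self.need_byteswap)
        raise ValueError("invalid int width")

    def read_float(self, offset: int, width: int) -> float:
        if width == 4:
            return read_float_with_byteswap(self._cached_page, offset, self.need_byteswap)
        elif width == 8:
            return read_double_with_byteswap(self._cached_page, offset, self.need_byteswap)
        raise ValueError("invalid float width")
```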
