Skip to content

Commit 33683cc

Browse files
kshedden authored and jreback committed
Modest performance, address #12647
closes #12659 closes #12654 closes #12647 closes #12809 Major performance improvements through use of Cython. Bug fixes in read_sas. Author: Kerby Shedden <[email protected]> Closes #12656 from kshedden/sas7bdat_perf and squashes the following commits: b3024ed [Kerby Shedden] Add missing test data files af085f7 [Kerby Shedden] Add one more type fe4731b [Kerby Shedden] Integrate jreback's cython improvements b7de358 [Kerby Shedden] flake8 fixes ea87a7f [Kerby Shedden] Fix encoding handling bug for py2 8b4b96d [Kerby Shedden] pep8 cleanup 1af73b3 [Kerby Shedden] Further encoding work c26d22b [Kerby Shedden] added to whatsnew 873a877 [Kerby Shedden] Added option to not decode header text 11c2f31 [Kerby Shedden] Further cythonization 3ef626e [Kerby Shedden] Working on cython issues 7e156b7 [Kerby Shedden] Working on cython issues dc330c5 [Kerby Shedden] Add two missing alignment constants 23bdf7a [Kerby Shedden] Decouple data decoding and decoding e.g. of column names 3bd1b35 [Kerby Shedden] Move more code to cython bdc9a06 [Kerby Shedden] More cython for performance, refactored constants ea2339f [Kerby Shedden] Use encoding when reading column headers 7d91d51 [Kerby Shedden] Add test data set from raderaj a7df841 [Kerby Shedden] Modest performance, address #12647 fix up memoryview access on windows, installation issue
1 parent 797baf9 commit 33683cc

12 files changed

+2111
-432
lines changed

doc/source/whatsnew/v0.18.1.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ Deprecations
248248
Performance Improvements
249249
~~~~~~~~~~~~~~~~~~~~~~~~
250250

251-
251+
- Improved speed of SAS reader (:issue:`12656`)
252252

253253
- Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`)
254254

@@ -281,6 +281,7 @@ Bug Fixes
281281
- Bug in ``.drop()`` with a non-unique ``MultiIndex``. (:issue:`12701`)
282282
- Bug in ``.concat`` of datetime tz-aware and naive DataFrames (:issue:`12467`)
283283
- Bug in correctly raising a ``ValueError`` in ``.resample(..).fillna(..)`` when passing a non-string (:issue:`12952`)
284+
- Bug fixes in various encoding and header processing issues in ``pd.read_sas()`` (:issue:`12659`, :issue:`12654`, :issue:`12647`, :issue:`12809`)
284285

285286
- Bug in ``Timestamp.__repr__`` that caused ``pprint`` to fail in nested structures (:issue:`12622`)
286287
- Bug in ``Timedelta.min`` and ``Timedelta.max``, the properties now report the true minimum/maximum ``timedeltas`` as recognized by Pandas. See :ref:`documentation <timedeltas.limitations>`. (:issue:`12727`)

pandas/io/sas/sas7bdat.py

+184-363
Large diffs are not rendered by default.

pandas/io/sas/sas_constants.py

+147
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# 32-byte constant; presumably the signature expected at the start of a
# sas7bdat file header -- TODO confirm against its use in sas7bdat.py.
# (Reconstructed from a rendered diff: interleaved diff line markers were
# removed so the expression is a pure bytes concatenation again.)
magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" +
         b"\x00\x00\x00\x00\xc2\xea\x81\x60" +
         b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" +
         b"\x09\xc7\x31\x8c\x18\x1f\x10\x11")
5+
6+
# Alignment / 64-bit checker values and the positions they are read from.
# NOTE(review): offsets and lengths are presumably byte positions within the
# file header -- confirm against the reader in sas7bdat.py.
align_1_checker_value = b'3'
align_1_offset = 32
align_1_length = 1
align_1_value = 4
u64_byte_checker_value = b'3'
align_2_offset = 35
align_2_length = 1
align_2_value = 4

# Offset/length pairs for the individual header fields.
endianness_offset = 37
endianness_length = 1
platform_offset = 39
platform_length = 1
encoding_offset = 70
encoding_length = 1
dataset_offset = 92
dataset_length = 64
file_type_offset = 156
file_type_length = 8
date_created_offset = 164
date_created_length = 8
date_modified_offset = 172
date_modified_length = 8
header_size_offset = 196
header_size_length = 4
page_size_offset = 200
page_size_length = 4
page_count_offset = 204
page_count_length = 4
sas_release_offset = 216
sas_release_length = 8
sas_server_type_offset = 224
sas_server_type_length = 16
os_version_number_offset = 240
os_version_number_length = 16
os_maker_offset = 256
os_maker_length = 16
os_name_offset = 272
os_name_length = 16
44+
# Page-level layout constants. The _x86 / _x64 suffixes give two variants of
# the same quantity; which one applies is decided by the reader, not here.
page_bit_offset_x86 = 16
page_bit_offset_x64 = 32
subheader_pointer_length_x86 = 12
subheader_pointer_length_x64 = 24

# Offset/length pairs for fields read from a page.
page_type_offset = 0
page_type_length = 2
block_count_offset = 2
block_count_length = 2
subheader_count_offset = 4
subheader_count_length = 2

# Page type codes. page_mix_types lists every code treated as a "mix" page.
page_meta_type = 0
page_data_type = 256
page_amd_type = 1024
page_metc_type = 16384
page_comp_type = -28672
page_mix_types = [512, 640]
60+
# Subheader pointer constants.
subheader_pointers_offset = 8
truncated_subheader_id = 1
compressed_subheader_id = 4
compressed_subheader_type = 1
text_block_size_length = 2

# Multipliers used to locate row/column size fields.
# NOTE(review): presumably multiplied by the platform word length in the
# reader -- confirm in sas7bdat.py.
row_length_offset_multiplier = 5
row_count_offset_multiplier = 6
col_count_p1_multiplier = 9
col_count_p2_multiplier = 10
row_count_on_mix_page_offset_multiplier = 15

# Column name fields.
column_name_pointer_length = 8
column_name_text_subheader_offset = 0
column_name_text_subheader_length = 2
column_name_offset_offset = 2
column_name_offset_length = 2
column_name_length_offset = 4
column_name_length_length = 2

# Column attribute fields. column_data_offset_offset and
# column_data_length_offset are both 8 in the original commit.
column_data_offset_offset = 8
column_data_length_offset = 8
column_data_length_length = 4
column_type_offset = 14
column_type_length = 1

# Column format and label fields.
column_format_text_subheader_index_offset = 22
column_format_text_subheader_index_length = 2
column_format_offset_offset = 24
column_format_offset_length = 2
column_format_length_offset = 26
column_format_length_length = 2
column_label_text_subheader_index_offset = 28
column_label_text_subheader_index_length = 2
column_label_offset_offset = 30
column_label_offset_length = 2
column_label_length_offset = 32
column_label_length_length = 2
94+
# Byte-string markers naming the two compression schemes (RLE and RDC, going
# by the variable names).
rle_compression = b'SASYZCRL'
rdc_compression = b'SASYZCR2'

# Both markers collected in one list, in the order RLE, RDC.
compression_literals = [rle_compression, rdc_compression]
98+
99+
# Incomplete list of encodings, using SAS nomenclature:
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
# Maps an integer encoding code to an encoding name. Codes missing from this
# table are not handled here; callers must decide what to do with them.
encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2",
                  61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"}
103+
104+
105+
class index:
    """Namespace of integer codes identifying each kind of subheader.

    The attributes are plain class-level integers (0 through 8); the class
    is used as a simple enumeration and is not meant to be instantiated.
    """
    rowSizeIndex = 0
    columnSizeIndex = 1
    subheaderCountsIndex = 2
    columnTextIndex = 3
    columnNameIndex = 4
    columnAttributesIndex = 5
    formatAndLabelIndex = 6
    columnListIndex = 7
    dataSubheaderIndex = 8
115+
116+
117+
# Maps a subheader signature (a 4- or 8-byte string) to the matching
# ``index`` code. Each subheader kind is listed under several byte patterns.
# NOTE(review): the variants presumably cover both byte orders and both
# 32/64-bit layouts -- confirm against the reader.
# ``index.dataSubheaderIndex`` has no signature entry in this table.
subheader_signature_to_index = {
    b"\xF7\xF7\xF7\xF7": index.rowSizeIndex,
    b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": index.rowSizeIndex,
    b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": index.rowSizeIndex,
    b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": index.rowSizeIndex,
    b"\xF6\xF6\xF6\xF6": index.columnSizeIndex,
    b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": index.columnSizeIndex,
    b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": index.columnSizeIndex,
    b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": index.columnSizeIndex,
    b"\x00\xFC\xFF\xFF": index.subheaderCountsIndex,
    b"\xFF\xFF\xFC\x00": index.subheaderCountsIndex,
    b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": index.subheaderCountsIndex,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": index.subheaderCountsIndex,
    b"\xFD\xFF\xFF\xFF": index.columnTextIndex,
    b"\xFF\xFF\xFF\xFD": index.columnTextIndex,
    b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": index.columnTextIndex,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": index.columnTextIndex,
    b"\xFF\xFF\xFF\xFF": index.columnNameIndex,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": index.columnNameIndex,
    b"\xFC\xFF\xFF\xFF": index.columnAttributesIndex,
    b"\xFF\xFF\xFF\xFC": index.columnAttributesIndex,
    b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": index.columnAttributesIndex,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": index.columnAttributesIndex,
    b"\xFE\xFB\xFF\xFF": index.formatAndLabelIndex,
    b"\xFF\xFF\xFB\xFE": index.formatAndLabelIndex,
    b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": index.formatAndLabelIndex,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": index.formatAndLabelIndex,
    b"\xFE\xFF\xFF\xFF": index.columnListIndex,
    b"\xFF\xFF\xFF\xFE": index.columnListIndex,
    b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": index.columnListIndex,
    b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": index.columnListIndex}

0 commit comments

Comments
 (0)