From 0e02b8d7697d90d12eebe59cfd2cf92d70889e35 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 16 Jun 2022 01:03:31 +0200 Subject: [PATCH 01/13] Speed up RLE/RDC decompression --- asv_bench/benchmarks/io/sas.py | 37 +++---- pandas/io/sas/sas.pyx | 184 ++++++++++++++++++++------------- 2 files changed, 128 insertions(+), 93 deletions(-) diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py index 369b79641dbc4..411e5b6099f76 100644 --- a/asv_bench/benchmarks/io/sas.py +++ b/asv_bench/benchmarks/io/sas.py @@ -1,30 +1,23 @@ -import os +from pathlib import Path from pandas import read_sas +ROOT = Path(__file__).parents[3] / "pandas" / "tests" / "io" / "sas" / "data" + class SAS: + def time_read_sas7bdat(self): + read_sas(ROOT / "test1.sas7bdat") - params = ["sas7bdat", "xport"] - param_names = ["format"] + def time_read_xpt(self): + read_sas(ROOT / "paxraw_d_short.xpt") - def setup(self, format): - # Read files that are located in 'pandas/tests/io/sas/data' - files = {"sas7bdat": "test1.sas7bdat", "xport": "paxraw_d_short.xpt"} - file = files[format] - paths = [ - os.path.dirname(__file__), - "..", - "..", - "..", - "pandas", - "tests", - "io", - "sas", - "data", - file, - ] - self.f = os.path.join(*paths) + def time_read_sas7bdat_2(self): + next(read_sas(ROOT / "0x00controlbyte.sas7bdat.bz2", chunksize=11000)) - def time_read_sas(self, format): - read_sas(self.f, format=format) + def time_read_sas7bdat_2_chunked(self): + for i, _ in enumerate( + read_sas(ROOT / "0x00controlbyte.sas7bdat.bz2", chunksize=1000) + ): + if i == 10: + break diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 2df3e1f7243da..885fee17018af 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -1,195 +1,225 @@ -# cython: profile=False -# cython: boundscheck=False, initializedcheck=False +# cython: language_level=3, initializedcheck=False +# cython: warn.undeclared=True, warn.maybe_uninitialized=True, warn.unused=True from cython cimport Py_ssize_t +from libc.stddef cimport size_t +from libc.stdint cimport ( + int64_t, + uint8_t, + uint16_t, +) +from libc.stdlib cimport ( + calloc, + free, +) + import numpy as np import pandas.io.sas.sas_constants as const -ctypedef signed long long int64_t -ctypedef unsigned char uint8_t -ctypedef unsigned short uint16_t + +cdef struct Buffer: + uint8_t *data + size_t length + + +cdef inline uint8_t buf_get(Buffer buf, size_t offset) except? 0: + assert offset < buf.length + return buf.data[offset] + + +cdef inline void buf_set(Buffer buf, size_t offset, uint8_t value) except *: + assert offset < buf.length + buf.data[offset] = value + + +cdef inline bytes buf_as_bytes(Buffer buf, size_t offset, size_t length): + assert offset + length <= buf.length + return buf.data[offset:offset+length] + + +cdef inline Buffer buf_new(size_t length) except *: + cdef uint8_t *data = calloc(length, sizeof(uint8_t)) + if data == NULL: + raise MemoryError(f"Failed to allocate {length} bytes") + return Buffer(data, length) + + +cdef inline buf_free(Buffer buf): + if buf.data != NULL: + free(buf.data) + # rle_decompress decompresses data using a Run Length Encoding # algorithm. It is partially documented here: # # https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf -cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff) except *: +cdef int rle_decompress(Buffer inbuff, Buffer outbuff) except? 
0: cdef: uint8_t control_byte, x - uint8_t[:] result = np.zeros(result_length, np.uint8) int rpos = 0 - int i, nbytes, end_of_first_byte - Py_ssize_t ipos = 0, length = len(inbuff) + int i, nbytes, end_of_first_byte, value + Py_ssize_t ipos = 0 - while ipos < length: - control_byte = inbuff[ipos] & 0xF0 - end_of_first_byte = (inbuff[ipos] & 0x0F) + while ipos < inbuff.length: + control_byte = buf_get(inbuff, ipos) & 0xF0 + end_of_first_byte = (buf_get(inbuff, ipos) & 0x0F) ipos += 1 if control_byte == 0x00: if end_of_first_byte != 0: raise ValueError("Unexpected non-zero end_of_first_byte") - nbytes = (inbuff[ipos]) + 64 + nbytes = (buf_get(inbuff, ipos)) + 64 ipos += 1 for _ in range(nbytes): - result[rpos] = inbuff[ipos] + buf_set(outbuff, rpos, buf_get(inbuff, ipos)) rpos += 1 ipos += 1 elif control_byte == 0x40: # not documented nbytes = end_of_first_byte * 16 - nbytes += (inbuff[ipos]) + nbytes += (buf_get(inbuff, ipos)) ipos += 1 for _ in range(nbytes): - result[rpos] = inbuff[ipos] + buf_set(outbuff, rpos, buf_get(inbuff, ipos)) rpos += 1 ipos += 1 elif control_byte == 0x60: - nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17 + nbytes = end_of_first_byte * 256 + (buf_get(inbuff, ipos)) + 17 ipos += 1 for _ in range(nbytes): - result[rpos] = 0x20 + buf_set(outbuff, rpos, 0x20) rpos += 1 elif control_byte == 0x70: - nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17 + nbytes = end_of_first_byte * 256 + (buf_get(inbuff, ipos)) + 17 ipos += 1 for _ in range(nbytes): - result[rpos] = 0x00 + buf_set(outbuff, rpos, 0x00) rpos += 1 elif control_byte == 0x80: nbytes = end_of_first_byte + 1 for i in range(nbytes): - result[rpos] = inbuff[ipos + i] + buf_set(outbuff, rpos, buf_get(inbuff, ipos + i)) rpos += 1 ipos += nbytes elif control_byte == 0x90: nbytes = end_of_first_byte + 17 for i in range(nbytes): - result[rpos] = inbuff[ipos + i] + buf_set(outbuff, rpos, buf_get(inbuff, ipos + i)) rpos += 1 ipos += nbytes elif control_byte == 0xA0: nbytes = end_of_first_byte + 33 for i in range(nbytes): - result[rpos] = inbuff[ipos + i] + buf_set(outbuff, rpos, buf_get(inbuff, ipos + i)) rpos += 1 ipos += nbytes elif control_byte == 0xB0: nbytes = end_of_first_byte + 49 for i in range(nbytes): - result[rpos] = inbuff[ipos + i] + buf_set(outbuff, rpos, buf_get(inbuff, ipos + i)) rpos += 1 ipos += nbytes elif control_byte == 0xC0: nbytes = end_of_first_byte + 3 - x = inbuff[ipos] + x = buf_get(inbuff, ipos) ipos += 1 for _ in range(nbytes): - result[rpos] = x + buf_set(outbuff, rpos, x) rpos += 1 elif control_byte == 0xD0: nbytes = end_of_first_byte + 2 for _ in range(nbytes): - result[rpos] = 0x40 + buf_set(outbuff, rpos, 0x40) rpos += 1 elif control_byte == 0xE0: nbytes = end_of_first_byte + 2 for _ in range(nbytes): - result[rpos] = 0x20 + buf_set(outbuff, rpos, 0x20) rpos += 1 elif control_byte == 0xF0: nbytes = end_of_first_byte + 2 for _ in range(nbytes): - result[rpos] = 0x00 + buf_set(outbuff, rpos, 0x00) rpos += 1 else: raise ValueError(f"unknown control byte: {control_byte}") - # In py37 cython/clang sees `len(outbuff)` as size_t and not Py_ssize_t - if len(result) != result_length: - raise ValueError(f"RLE: {len(result)} != {result_length}") - - return np.asarray(result) + return rpos # rdc_decompress decompresses data using the Ross Data Compression algorithm: # # http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm -cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff) except *: +cdef int 
rdc_decompress(Buffer inbuff, Buffer outbuff) except? 0: cdef: uint8_t cmd uint16_t ctrl_bits = 0, ctrl_mask = 0, ofs, cnt - int rpos = 0, k - uint8_t[:] outbuff = np.zeros(result_length, dtype=np.uint8) - Py_ssize_t ipos = 0, length = len(inbuff) + int rpos = 0, k, ii + Py_ssize_t ipos = 0 ii = -1 - while ipos < length: + while ipos < inbuff.length: ii += 1 ctrl_mask = ctrl_mask >> 1 if ctrl_mask == 0: - ctrl_bits = ((inbuff[ipos] << 8) + - inbuff[ipos + 1]) + ctrl_bits = ((buf_get(inbuff, ipos) << 8) + + buf_get(inbuff, ipos + 1)) ipos += 2 ctrl_mask = 0x8000 if ctrl_bits & ctrl_mask == 0: - outbuff[rpos] = inbuff[ipos] + buf_set(outbuff, rpos, buf_get(inbuff, ipos)) ipos += 1 rpos += 1 continue - cmd = (inbuff[ipos] >> 4) & 0x0F - cnt = (inbuff[ipos] & 0x0F) + cmd = (buf_get(inbuff, ipos) >> 4) & 0x0F + cnt = (buf_get(inbuff, ipos) & 0x0F) ipos += 1 # short RLE if cmd == 0: cnt += 3 for k in range(cnt): - outbuff[rpos + k] = inbuff[ipos] + buf_set(outbuff, rpos + k, buf_get(inbuff, ipos)) rpos += cnt ipos += 1 # long RLE elif cmd == 1: - cnt += inbuff[ipos] << 4 + cnt += buf_get(inbuff, ipos) << 4 cnt += 19 ipos += 1 for k in range(cnt): - outbuff[rpos + k] = inbuff[ipos] + buf_set(outbuff, rpos + k, buf_get(inbuff, ipos)) rpos += cnt ipos += 1 # long pattern elif cmd == 2: ofs = cnt + 3 - ofs += inbuff[ipos] << 4 + ofs += buf_get(inbuff, ipos) << 4 ipos += 1 - cnt = inbuff[ipos] + cnt = buf_get(inbuff, ipos) ipos += 1 cnt += 16 for k in range(cnt): - outbuff[rpos + k] = outbuff[rpos - ofs + k] + buf_set(outbuff, rpos + k, buf_get(outbuff, rpos - ofs + k)) rpos += cnt # short pattern else: ofs = cnt + 3 - ofs += inbuff[ipos] << 4 + ofs += buf_get(inbuff, ipos) << 4 ipos += 1 for k in range(cmd): - outbuff[rpos + k] = outbuff[rpos - ofs + k] + buf_set(outbuff, rpos + k, buf_get(outbuff, rpos - ofs + k)) rpos += cmd - # In py37 cython/clang sees `len(outbuff)` as size_t and not Py_ssize_t - if len(outbuff) != result_length: - raise ValueError(f"RDC: {len(outbuff)} != {result_length}\n") - - return np.asarray(outbuff) + return rpos cdef enum ColumnTypes: @@ -216,7 +246,8 @@ cdef class Parser: int64_t[:] column_types uint8_t[:, :] byte_chunk object[:, :] string_chunk - char *cached_page + uint8_t *cached_page + int cached_page_len int current_row_on_page_index int current_page_block_count int current_page_data_subheader_pointers_len @@ -229,7 +260,7 @@ cdef class Parser: int subheader_pointer_length int current_page_type bint is_little_endian - const uint8_t[:] (*decompress)(int result_length, const uint8_t[:] inbuff) except * + int (*decompress)(Buffer, Buffer) except * object parser def __init__(self, object parser): @@ -305,7 +336,8 @@ cdef class Parser: cdef update_next_page(self): # update data for the current page - self.cached_page = self.parser._cached_page + self.cached_page = self.parser._cached_page + self.cached_page_len = len(self.parser._cached_page) self.current_row_on_page_index = 0 self.current_page_type = self.parser._current_page_type self.current_page_block_count = self.parser._current_page_block_count @@ -386,20 +418,28 @@ cdef class Parser: cdef: Py_ssize_t j - int s, k, m, jb, js, current_row + int s, k, m, jb, js, current_row, rpos int64_t lngt, start, ct - const uint8_t[:] source + Buffer source, decompressed_source int64_t[:] column_types int64_t[:] lengths int64_t[:] offsets uint8_t[:, :] byte_chunk object[:, :] string_chunk - - source = np.frombuffer( - self.cached_page[offset:offset + length], dtype=np.uint8) - - if self.decompress != NULL and (length < 
self.row_length): - source = self.decompress(self.row_length, source) + bint compressed + + assert offset + length <= self.cached_page_len + source = Buffer(&self.cached_page[offset], length) + + compressed = self.decompress != NULL and length < self.row_length + if compressed: + decompressed_source = buf_new(self.row_length) + rpos = self.decompress(source, decompressed_source) + if rpos != self.row_length: + raise ValueError( + f"Expected decompressed line of length {self.row_length} bytes but decompressed {rpos} bytes" + ) + source = decompressed_source current_row = self.current_row_in_chunk_index column_types = self.column_types @@ -423,14 +463,16 @@ cdef class Parser: else: m = s for k in range(lngt): - byte_chunk[jb, m + k] = source[start + k] + byte_chunk[jb, m + k] = buf_get(source, start + k) jb += 1 elif column_types[j] == column_type_string: # string - string_chunk[js, current_row] = np.array(source[start:( - start + lngt)]).tobytes().rstrip(b"\x00 ") + string_chunk[js, current_row] = buf_as_bytes(source, start, lngt).rstrip(b"\x00 ") js += 1 self.current_row_on_page_index += 1 self.current_row_in_chunk_index += 1 self.current_row_in_file_index += 1 + + if compressed: + buf_free(decompressed_source) From eca0db4399a70cad19e8c70f6de180ad4ac130ae Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Fri, 17 Jun 2022 16:02:49 +0200 Subject: [PATCH 02/13] Update tests --- pandas/io/sas/sas.pyx | 8 ++--- pandas/tests/io/sas/test_sas7bdat.py | 46 ++++++++++------------------ 2 files changed, 20 insertions(+), 34 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 885fee17018af..ee34074c14648 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -23,17 +23,17 @@ cdef struct Buffer: cdef inline uint8_t buf_get(Buffer buf, size_t offset) except? 
0: - assert offset < buf.length + assert offset < buf.length, f"Out of bounds read" return buf.data[offset] cdef inline void buf_set(Buffer buf, size_t offset, uint8_t value) except *: - assert offset < buf.length + assert offset < buf.length, "Out of bounds write" buf.data[offset] = value cdef inline bytes buf_as_bytes(Buffer buf, size_t offset, size_t length): - assert offset + length <= buf.length + assert offset + length <= buf.length, "Out of bounds read" return buf.data[offset:offset+length] @@ -428,7 +428,7 @@ cdef class Parser: object[:, :] string_chunk bint compressed - assert offset + length <= self.cached_page_len + assert offset + length <= self.cached_page_len, "Out of bounds read" source = Buffer(&self.cached_page[offset], length) compressed = self.decompress != NULL and length < self.row_length diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 3f150c1a061ee..e11a23b08a9e4 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -350,34 +350,20 @@ def test_meta2_page(datapath): assert len(df) == 1000 -@pytest.mark.parametrize("test_file", ["test2.sas7bdat", "test3.sas7bdat"]) -def test_exception_propagation_rdc_rle_decompress(datapath, monkeypatch, test_file): - """Errors in RLE/RDC decompression should propagate the same error.""" - orig_np_zeros = np.zeros - - def _patched_zeros(size, dtype): - if isinstance(size, int): - # np.zeros() call in {rdc,rle}_decompress - raise Exception("Test exception") - else: - # Other calls to np.zeros - return orig_np_zeros(size, dtype) - - monkeypatch.setattr(np, "zeros", _patched_zeros) - - with pytest.raises(Exception, match="^Test exception$"): - pd.read_sas(datapath("io", "sas", "data", test_file)) - - -def test_exception_propagation_rle_decompress(tmp_path, datapath): - """Illegal control byte in RLE decompressor should raise the correct ValueError.""" - with open(datapath("io", "sas", "data", "test2.sas7bdat"), "rb") as f: +@pytest.mark.parametrize( + "test_file, override_offset, override_value, expected_msg", + [ + ("test2.sas7bdat", 0x10000 + 55229, 0x80 | 0x0F, "Out of bounds"), + ("test2.sas7bdat", 0x10000 + 55229, 0x10, "unknown control byte"), + ("test3.sas7bdat", 118170, 184, "Out of bounds"), + ], +) +def test_rle_rdc_exceptions( + datapath, test_file, override_offset, override_value, expected_msg +): + """Errors in RLE/RDC decompression should propagate.""" + with open(datapath("io", "sas", "data", test_file), "rb") as f: data = bytearray(f.read()) - invalid_control_byte = 0x10 - page_offset = 0x10000 - control_byte_pos = 55229 - data[page_offset + control_byte_pos] = invalid_control_byte - tmp_file = tmp_path / "test2.sas7bdat" - tmp_file.write_bytes(data) - with pytest.raises(ValueError, match="unknown control byte"): - pd.read_sas(tmp_file) + data[override_offset] = override_value + with pytest.raises(Exception, match=expected_msg): + pd.read_sas(io.BytesIO(data), format="sas7bdat") From 041a04b1a6c0d3c204abfdb950fdaf972a99148c Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Mon, 20 Jun 2022 10:25:11 +0200 Subject: [PATCH 03/13] ssize_t -> size_t --- pandas/io/sas/sas.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index ee34074c14648..cf8f5f110eaeb 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -59,7 +59,7 @@ cdef int rle_decompress(Buffer inbuff, Buffer outbuff) except? 
0: uint8_t control_byte, x int rpos = 0 int i, nbytes, end_of_first_byte, value - Py_ssize_t ipos = 0 + size_t ipos = 0 while ipos < inbuff.length: control_byte = buf_get(inbuff, ipos) & 0xF0 @@ -157,7 +157,7 @@ cdef int rdc_decompress(Buffer inbuff, Buffer outbuff) except? 0: uint8_t cmd uint16_t ctrl_bits = 0, ctrl_mask = 0, ofs, cnt int rpos = 0, k, ii - Py_ssize_t ipos = 0 + size_t ipos = 0 ii = -1 From f2c8b0eb4c96f082538f8f1f477fb2702564d679 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sun, 26 Jun 2022 18:37:55 +0200 Subject: [PATCH 04/13] Update sas.pyx --- pandas/io/sas/sas.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 8b02051edbdc3..631f703ea86c3 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -77,7 +77,7 @@ cdef int rle_decompress(Buffer inbuff, Buffer outbuff) except? 0: ipos += 1 elif control_byte == 0x40: # not documented - nbytes = (inbuff[ipos] & 0xFF) + 18 + end_of_first_byte * 256 + nbytes = (buf_get(inbuff, ipos)) + 18 + end_of_first_byte * 256 ipos += 1 for _ in range(nbytes): buf_set(outbuff, rpos, buf_get(inbuff, ipos)) From 213b08f89ff3ad61b1dd61e8200ec91658a67efe Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 7 Jul 2022 22:57:24 +0200 Subject: [PATCH 05/13] Don't use null byte as except value --- pandas/io/sas/sas.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 2fb166d84bed7..d381d2fa2b19a 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -22,8 +22,8 @@ cdef struct Buffer: size_t length -cdef inline uint8_t buf_get(Buffer buf, size_t offset) except? 0: - assert offset < buf.length, f"Out of bounds read" +cdef inline uint8_t buf_get(Buffer buf, size_t offset) except? 
255: + assert offset < buf.length, "Out of bounds read" return buf.data[offset] From 4b24773bff491f39bb50be1a41595815b887e213 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 7 Jul 2022 22:59:02 +0200 Subject: [PATCH 06/13] Nit --- pandas/tests/io/sas/test_sas7bdat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 964de323a444c..6a34ed70e9cbf 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -362,8 +362,8 @@ def test_rle_rdc_exceptions( datapath, test_file, override_offset, override_value, expected_msg ): """Errors in RLE/RDC decompression should propagate.""" - with open(datapath("io", "sas", "data", test_file), "rb") as f: - data = bytearray(f.read()) + with open(datapath("io", "sas", "data", test_file), "rb") as fd: + data = bytearray(fd.read()) data[override_offset] = override_value with pytest.raises(Exception, match=expected_msg): pd.read_sas(io.BytesIO(data), format="sas7bdat") From 263aea64e5df62a8f45b36e826b0e7d41a6cf146 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sat, 9 Jul 2022 09:57:57 +0200 Subject: [PATCH 07/13] Simplify condition --- pandas/io/sas/sas.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index d381d2fa2b19a..8d50a1c130af3 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -415,7 +415,7 @@ cdef class Parser: cdef: Py_ssize_t j - int s, k, m, jb, js, current_row, rpos + int k, m, jb, js, current_row, rpos int64_t lngt, start, ct Buffer source, decompressed_source int64_t[:] column_types @@ -444,7 +444,6 @@ cdef class Parser: offsets = self.offsets byte_chunk = self.byte_chunk string_chunk = self.string_chunk - s = 8 * self.current_row_in_chunk_index js = 0 jb = 0 for j in range(self.column_count): @@ -455,10 +454,10 @@ cdef class Parser: ct = column_types[j] if ct == column_type_decimal: # decimal - if self.is_little_endian: - m = s + 8 - lngt - else: - m = s + assert lngt in (4, 8) + m = 8 * self.current_row_in_chunk_index + if lngt == 4 and self.is_little_endian: + m += 4 for k in range(lngt): byte_chunk[jb, m + k] = buf_get(source, start + k) jb += 1 From 785f752fc8bce759a190056aee94222b6330b124 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sat, 9 Jul 2022 10:02:26 +0200 Subject: [PATCH 08/13] Review feedback --- pandas/io/sas/sas.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 8d50a1c130af3..0a4be88868840 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -18,6 +18,12 @@ import pandas.io.sas.sas_constants as const cdef struct Buffer: + """Convenience wrapper for uint8_t data to allow fast and safe reads and writes. + + We use this as a replacement for np.array(..., dtype=np.uint8) because it's + much slower to create NumPy arrays and we create Buffer instances many times + when reading a SAS7BDAT file (roughly once per row that is being read). 
+ """ uint8_t *data size_t length From 1f36f99233063abc5a787a1924d49f85a5044cf0 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sat, 9 Jul 2022 19:31:28 +0200 Subject: [PATCH 09/13] Docstring -> comment --- pandas/io/sas/sas.pyx | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 0a4be88868840..a7b222441e2c6 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -18,12 +18,10 @@ import pandas.io.sas.sas_constants as const cdef struct Buffer: - """Convenience wrapper for uint8_t data to allow fast and safe reads and writes. - - We use this as a replacement for np.array(..., dtype=np.uint8) because it's - much slower to create NumPy arrays and we create Buffer instances many times - when reading a SAS7BDAT file (roughly once per row that is being read). - """ + # Convenience wrapper for uint8_t data to allow fast and safe reads and writes. + # We use this as a replacement for np.array(..., dtype=np.uint8) because it's + # much slower to create NumPy arrays and we create Buffer instances many times + # when reading a SAS7BDAT file (roughly once per row that is being read). uint8_t *data size_t length From 26aea28c8d48bc90e6faf435a5388d1d48049d3d Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Sat, 9 Jul 2022 20:00:46 +0200 Subject: [PATCH 10/13] Revert "Simplify condition" This reverts commit 263aea64e5df62a8f45b36e826b0e7d41a6cf146. --- pandas/io/sas/sas.pyx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index a7b222441e2c6..a1d1fcf2f652c 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -419,7 +419,7 @@ cdef class Parser: cdef: Py_ssize_t j - int k, m, jb, js, current_row, rpos + int s, k, m, jb, js, current_row, rpos int64_t lngt, start, ct Buffer source, decompressed_source int64_t[:] column_types @@ -448,6 +448,7 @@ cdef class Parser: offsets = self.offsets byte_chunk = self.byte_chunk string_chunk = self.string_chunk + s = 8 * self.current_row_in_chunk_index js = 0 jb = 0 for j in range(self.column_count): @@ -458,10 +459,10 @@ cdef class Parser: ct = column_types[j] if ct == column_type_decimal: # decimal - assert lngt in (4, 8) - m = 8 * self.current_row_in_chunk_index - if lngt == 4 and self.is_little_endian: - m += 4 + if self.is_little_endian: + m = s + 8 - lngt + else: + m = s for k in range(lngt): byte_chunk[jb, m + k] = buf_get(source, start + k) jb += 1 From 21ba0b2aabd5a258b4cbbf7fdaaf4fdc66ae3dc1 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 15 Sep 2022 11:34:02 +0200 Subject: [PATCH 11/13] Lint --- pandas/io/sas/sas.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index c0d0205e2aa6c..74d7612ce3233 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -64,7 +64,7 @@ cdef int rle_decompress(Buffer inbuff, Buffer outbuff) except? 
0: cdef: uint8_t control_byte, x int rpos = 0 - int i, nbytes, end_of_first_byte, value + int i, nbytes, end_of_first_byte size_t ipos = 0 while ipos < inbuff.length: From 55cceb74090c5e5fc8651fb52bd6d0644a4c184e Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 22 Sep 2022 15:01:29 +0200 Subject: [PATCH 12/13] Speed up some Cython `except` --- pandas/io/sas/sas.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 74d7612ce3233..febcef19d9906 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -31,9 +31,10 @@ cdef inline uint8_t buf_get(Buffer buf, size_t offset) except? 255: return buf.data[offset] -cdef inline void buf_set(Buffer buf, size_t offset, uint8_t value) except *: +cdef inline bint buf_set(Buffer buf, size_t offset, uint8_t value) except 0: assert offset < buf.length, "Out of bounds write" buf.data[offset] = value + return True cdef inline bytes buf_as_bytes(Buffer buf, size_t offset, size_t length): @@ -264,7 +265,7 @@ cdef class Parser: int subheader_pointer_length int current_page_type bint is_little_endian - int (*decompress)(Buffer, Buffer) except * + int (*decompress)(Buffer, Buffer) except 0 object parser def __init__(self, object parser): From ba9b019ffd85a30233cf8bc7dd8146b64b994e24 Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Thu, 22 Sep 2022 15:31:47 +0200 Subject: [PATCH 13/13] Typo --- pandas/io/sas/sas.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index febcef19d9906..8065859844b30 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -265,7 +265,7 @@ cdef class Parser: int subheader_pointer_length int current_page_type bint is_little_endian - int (*decompress)(Buffer, Buffer) except 0 + int (*decompress)(Buffer, Buffer) except? 0 object parser def __init__(self, object parser):
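
Editor's note on the algorithm: the twelve-way control-byte dispatch in
rle_decompress can be hard to follow inside the Cython diff hunks above, so
below is a minimal pure-Python sketch of the same run-length scheme. It is an
illustration under stated assumptions, not the shipped implementation: it
covers only three of the twelve control bytes (literal copy, single-byte run,
zero run), the function name rle_decompress_sketch and its result_length check
are invented for the sketch, and bounds checking is left to ordinary Python
indexing rather than the patch's buf_get/buf_set asserts.

    # Pure-Python sketch of the SAS RLE control-byte dispatch (subset only).
    # The high nibble of each control byte selects a command; the low nibble
    # feeds into the run/copy length, mirroring pandas/io/sas/sas.pyx.
    def rle_decompress_sketch(inbuff: bytes, result_length: int) -> bytes:
        out = bytearray()
        ipos = 0
        while ipos < len(inbuff):
            control_byte = inbuff[ipos] & 0xF0
            end_of_first_byte = inbuff[ipos] & 0x0F
            ipos += 1
            if control_byte == 0x00:
                # copy the next (inbuff[ipos] + 64) input bytes verbatim
                nbytes = inbuff[ipos] + 64
                ipos += 1
                out += inbuff[ipos:ipos + nbytes]
                ipos += nbytes
            elif control_byte == 0xC0:
                # repeat the next input byte (end_of_first_byte + 3) times
                out += bytes([inbuff[ipos]]) * (end_of_first_byte + 3)
                ipos += 1
            elif control_byte == 0xF0:
                # emit (end_of_first_byte + 2) zero bytes
                out += b"\x00" * (end_of_first_byte + 2)
            else:
                raise ValueError(f"unknown control byte: {control_byte}")
        if len(out) != result_length:
            # the Cython version instead returns rpos and lets read_rows
            # compare it against self.row_length (see PATCH 01)
            raise ValueError(f"RLE: {len(out)} != {result_length}")
        return bytes(out)

    # Example: bytes([0xC5, 0x41, 0xF3]) decodes to b"A" * 8 + b"\x00" * 5.

The sketch also shows why the series moves the length check out of the
decompressor: with the output Buffer preallocated to row_length, returning
rpos lets the caller distinguish a short decompression from a real error,
and the except? 0 signatures in PATCH 12/13 avoid the unconditional
exception check that except * would add on every call through the
decompress function pointer.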