diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 6fc1ec9c6ff90..a462c343d0bd3 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -940,6 +940,7 @@ I/O - Bug in :meth:`DataFrame.to_excel` when writing an empty dataframe with :class:`MultiIndex` (:issue:`19543`) - Bug in :func:`read_sas` with RLE-compressed SAS7BDAT files that contain 0x40 control bytes (:issue:`31243`) - Bug in :func:`read_sas` that scrambled column names (:issue:`31243`) +- Bug in :func:`read_sas` with RLE-compressed SAS7BDAT files that contain 0x00 control bytes (:issue:`47099`) - Period diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 9fcef64e07133..9ea1c31c3d5cf 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -28,9 +28,7 @@ cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff) ipos += 1 if control_byte == 0x00: - if end_of_first_byte != 0: - raise ValueError("Unexpected non-zero end_of_first_byte") - nbytes = (inbuff[ipos]) + 64 + nbytes = (inbuff[ipos]) + 64 + end_of_first_byte * 256 ipos += 1 for _ in range(nbytes): result[rpos] = inbuff[ipos] diff --git a/pandas/tests/io/sas/data/0x00controlbyte.sas7bdat.bz2 b/pandas/tests/io/sas/data/0x00controlbyte.sas7bdat.bz2 new file mode 100644 index 0000000000000..ef980fb907694 Binary files /dev/null and b/pandas/tests/io/sas/data/0x00controlbyte.sas7bdat.bz2 differ diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 9724fcac815b5..41b2e78d093ea 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -390,3 +390,10 @@ def test_0x40_control_byte(datapath): fname = datapath("io", "sas", "data", "0x40controlbyte.csv") df0 = pd.read_csv(fname, dtype="object") tm.assert_frame_equal(df, df0) + + +def test_0x00_control_byte(datapath): + # GH 47099 + fname = datapath("io", "sas", "data", "0x00controlbyte.sas7bdat.bz2") + df = next(pd.read_sas(fname, chunksize=11_000)) + assert df.shape == (11_000, 20)