Skip to content

BUG: Don't include deleted rows from sas7bdat files (#15963) #22650

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,8 @@ I/O
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)
- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`)
- :func:`read_sas()` will not include rows in sas7bdat files that have been marked as deleted by SAS but are still present in the file. (:issue:`15963`)


Plotting
^^^^^^^^
Expand Down
108 changes: 93 additions & 15 deletions pandas/io/sas/sas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,9 @@ cdef enum ColumnTypes:

# type the page_data types
cdef int page_meta_type = const.page_meta_type
cdef int page_mix_types_0 = const.page_mix_types[0]
cdef int page_mix_types_1 = const.page_mix_types[1]
cdef int page_data_type = const.page_data_type
cdef int page_mix_type = const.page_mix_type
cdef int page_type_mask = const.page_type_mask
cdef int subheader_pointers_offset = const.subheader_pointers_offset


Expand All @@ -219,7 +219,7 @@ cdef class Parser(object):
int64_t[:] column_types
uint8_t[:, :] byte_chunk
object[:, :] string_chunk
char *cached_page
uint8_t *cached_page
int current_row_on_page_index
int current_page_block_count
int current_page_data_subheader_pointers_len
Expand All @@ -231,6 +231,7 @@ cdef class Parser(object):
int bit_offset
int subheader_pointer_length
int current_page_type
int current_page_deleted_rows_bitmap_offset
bint is_little_endian
const uint8_t[:] (*decompress)(int result_length,
const uint8_t[:] inbuff)
Expand All @@ -253,6 +254,7 @@ cdef class Parser(object):
self.subheader_pointer_length = self.parser._subheader_pointer_length
self.is_little_endian = parser.byte_order == "<"
self.column_types = np.empty(self.column_count, dtype='int64')
self.current_page_deleted_rows_bitmap_offset = -1

# page indicators
self.update_next_page()
Expand Down Expand Up @@ -309,10 +311,55 @@ cdef class Parser(object):
self.update_next_page()
return done

cdef int calculate_deleted_rows_bitmap_offset(self):
"""Calculate where the deleted rows bitmap is located
in the page. It is _current_page_deleted_rows_bitmap_offset's
bytes away from the end of the row values"""

cdef:
int deleted_rows_bitmap_offset, page_type
int subheader_pointers_length, align_correction
int row_count

if self.parser._current_page_deleted_rows_bitmap_offset is None:
return -1

deleted_rows_bitmap_offset = \
self.parser._current_page_deleted_rows_bitmap_offset

page_type = self.current_page_type
subheader_pointers_length = \
self.subheader_pointer_length * self.current_page_subheaders_count

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add some comments in here

if page_type & page_type_mask == page_data_type:
return (
self.bit_offset +
subheader_pointers_offset +
self.row_length * self.current_page_block_count +
deleted_rows_bitmap_offset)
elif page_type & page_type_mask == page_mix_type:
align_correction = (
self.bit_offset +
subheader_pointers_offset +
subheader_pointers_length
) % 8
row_count = min(self.parser._mix_page_row_count,
self.parser.row_count)
return (
self.bit_offset +
subheader_pointers_offset +
subheader_pointers_length +
align_correction +
self.row_length * row_count +
deleted_rows_bitmap_offset)
else:
# I have never seen this case.
return -1

cdef update_next_page(self):
# update data for the current page

self.cached_page = <char *>self.parser._cached_page
self.cached_page = <uint8_t * >self.parser._cached_page
self.current_row_on_page_index = 0
self.current_page_type = self.parser._current_page_type
self.current_page_block_count = self.parser._current_page_block_count
Expand All @@ -321,11 +368,29 @@ cdef class Parser(object):
self.current_page_subheaders_count =\
self.parser._current_page_subheaders_count

self.current_page_deleted_rows_bitmap_offset =\
self.calculate_deleted_rows_bitmap_offset()

cdef bint is_row_deleted(self, int row_number):
cdef:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doc-string & comments

int row_disk
unsigned char byte, row_bit
if self.current_page_deleted_rows_bitmap_offset == -1:
return 0
row_idx = (row_number + 1) // 8
row_bit = 1 << (7 - (row_number % 8))

byte = self.cached_page[
self.current_page_deleted_rows_bitmap_offset + row_idx]

return byte & row_bit

cdef readline(self):

cdef:
int offset, bit_offset, align_correction
int subheader_pointer_length, mn
int block_count
bint done, flag

bit_offset = self.bit_offset
Expand All @@ -340,7 +405,7 @@ cdef class Parser(object):

# Loop until a data row is read
while True:
if self.current_page_type == page_meta_type:
if self.current_page_type & page_type_mask == page_meta_type:
flag = self.current_row_on_page_index >=\
self.current_page_data_subheader_pointers_len
if flag:
Expand All @@ -355,8 +420,7 @@ cdef class Parser(object):
current_subheader_pointer.offset,
current_subheader_pointer.length)
return False
elif (self.current_page_type == page_mix_types_0 or
self.current_page_type == page_mix_types_1):
elif self.current_page_type & page_type_mask == page_mix_type:
align_correction = (bit_offset + subheader_pointers_offset +
self.current_page_subheaders_count *
subheader_pointer_length)
Expand All @@ -365,21 +429,35 @@ cdef class Parser(object):
offset += subheader_pointers_offset
offset += (self.current_page_subheaders_count *
subheader_pointer_length)
offset += self.current_row_on_page_index * self.row_length
self.process_byte_array_with_data(offset,
self.row_length)

# Skip past rows marked as deleted
mn = min(self.parser.row_count,
self.parser._mix_page_row_count)
while (self.is_row_deleted(self.current_row_on_page_index) and
self.current_row_on_page_index < mn):
self.current_row_on_page_index += 1

if self.current_row_on_page_index < mn:
offset += self.current_row_on_page_index * self.row_length
self.process_byte_array_with_data(offset, self.row_length)
if self.current_row_on_page_index == mn:
done = self.read_next_page()
if done:
return True
return False
elif self.current_page_type & page_data_type == page_data_type:
self.process_byte_array_with_data(
bit_offset + subheader_pointers_offset +
self.current_row_on_page_index * self.row_length,
self.row_length)
elif self.current_page_type & page_type_mask == page_data_type:
block_count = self.current_page_block_count

# Skip past rows marked as deleted
while (self.is_row_deleted(self.current_row_on_page_index) and
self.current_row_on_page_index != block_count):
self.current_row_on_page_index += 1

if self.current_row_on_page_index < block_count:
self.process_byte_array_with_data(
bit_offset + subheader_pointers_offset +
self.current_row_on_page_index * self.row_length,
self.row_length)
flag = (self.current_row_on_page_index ==
self.current_page_block_count)
if flag:
Expand Down
33 changes: 21 additions & 12 deletions pandas/io/sas/sas7bdat.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,12 +298,12 @@ def _parse_metadata(self):

def _process_page_meta(self):
self._read_page_header()
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
if self._current_page_type in pt:
pt = [const.page_meta_type, const.page_amd_type, const.page_mix_type]
page_type = self._current_page_type
if page_type & const.page_type_mask in pt:
self._process_page_metadata()
is_data_page = self._current_page_type & const.page_data_type
is_mix_page = self._current_page_type in const.page_mix_types
return (is_data_page or is_mix_page
pt = [const.page_mix_type, const.page_data_type]
return (page_type & const.page_type_mask in pt
or self._current_page_data_subheader_pointers != [])

def _read_page_header(self):
Expand All @@ -313,6 +313,12 @@ def _read_page_header(self):
tx = const.block_count_offset + bit_offset
self._current_page_block_count = self._read_int(
tx, const.block_count_length)
if self._current_page_type & const.page_has_deleted_rows_bitmap:
tx = const.page_deleted_rows_bitmap_offset * self._int_length
self._current_page_deleted_rows_bitmap_offset = self._read_int(
tx, self._int_length)
else:
self._current_page_deleted_rows_bitmap_offset = None
tx = const.subheader_count_offset + bit_offset
self._current_page_subheaders_count = (
self._read_int(tx, const.subheader_count_length))
Expand Down Expand Up @@ -420,6 +426,9 @@ def _process_rowsize_subheader(self, offset, length):
offset + const.row_length_offset_multiplier * int_len, int_len)
self.row_count = self._read_int(
offset + const.row_count_offset_multiplier * int_len, int_len)
self.rows_deleted_count = self._read_int(
offset + const.rows_deleted_count_offset_multiplier * int_len,
int_len)
self.col_count_p1 = self._read_int(
offset + const.col_count_p1_multiplier * int_len, int_len)
self.col_count_p2 = self._read_int(
Expand Down Expand Up @@ -601,19 +610,20 @@ def _process_format_subheader(self, offset, length):

def read(self, nrows=None):

row_count = self.row_count - self.rows_deleted_count
if (nrows is None) and (self.chunksize is not None):
nrows = self.chunksize
elif nrows is None:
nrows = self.row_count
nrows = row_count

if len(self._column_types) == 0:
self.close()
raise EmptyDataError("No columns to parse from file")

if self._current_row_in_file_index >= self.row_count:
if self._current_row_in_file_index >= row_count:
return None

m = self.row_count - self._current_row_in_file_index
m = row_count - self._current_row_in_file_index
if nrows > m:
nrows = m

Expand Down Expand Up @@ -647,12 +657,11 @@ def _read_next_page(self):

self._read_page_header()
page_type = self._current_page_type
if page_type == const.page_meta_type:
if page_type & const.page_type_mask == const.page_meta_type:
self._process_page_metadata()

is_data_page = page_type & const.page_data_type
pt = [const.page_meta_type] + const.page_mix_types
if not is_data_page and self._current_page_type not in pt:
pt = [const.page_meta_type, const.page_mix_type, const.page_data_type]
if page_type & const.page_type_mask not in pt:
return self._read_next_page()

return False
Expand Down
8 changes: 7 additions & 1 deletion pandas/io/sas/sas_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
os_name_length = 16
page_bit_offset_x86 = 16
page_bit_offset_x64 = 32
page_deleted_rows_bitmap_offset = 3
subheader_pointer_length_x86 = 12
subheader_pointer_length_x64 = 24
page_type_offset = 0
Expand All @@ -52,18 +53,23 @@
subheader_count_offset = 4
subheader_count_length = 2
page_meta_type = 0
# If page_type has bit 7 set there may be deleted rows.
# These are marked in a bitmap following the row data.
page_has_deleted_rows_bitmap = 128
page_data_type = 256
page_amd_type = 1024
page_metc_type = 16384
page_comp_type = -28672
page_mix_types = [512, 640]
page_mix_type = 512
page_type_mask = (page_data_type | page_mix_type | page_amd_type)
subheader_pointers_offset = 8
truncated_subheader_id = 1
compressed_subheader_id = 4
compressed_subheader_type = 1
text_block_size_length = 2
row_length_offset_multiplier = 5
row_count_offset_multiplier = 6
rows_deleted_count_offset_multiplier = 7
col_count_p1_multiplier = 9
col_count_p2_multiplier = 10
row_count_on_mix_page_offset_multiplier = 15
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/io/sas/data/datetime_deleted_rows.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Date1,Date2,DateTime,DateTimeHi,Taiw
1960-01-06,1960-01-04,1677-09-21 00:12:44,1677-09-21 00:12:43.145225525,1912-01-01
1960-01-03,1960-01-05,2262-04-11 23:47:16,1960-01-01 00:00:00.000000000,1960-01-02
1960-01-06,1960-01-04,1677-09-21 00:12:44,2262-04-11 23:47:16.854774475,1912-01-01
Binary file not shown.
Loading