
Commit 9465a59

troelsaeltanawy authored and committed
BUG: Some sas7bdat files with many columns are not parseable by read_sas (pandas-dev#22628)
Parent: 3ec461f

File tree

7 files changed: +62, -31 lines


doc/source/whatsnew/v0.24.0.txt (+2)

@@ -743,6 +743,8 @@ I/O
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
 - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
 - :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
+- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
+- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)

 Plotting
 ^^^^^^^^
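The second new entry concerns data pages whose type word carries extra flag bits. Below is a minimal sketch of the page-type test this commit switches to, using the value from issue 16615 (384) and the data-page value 256 used by the sas7bdat parser; the snippet is illustrative and not part of the commit:

    # A data page whose type also has bit 7 set: 128 + 256 = 384 (GH 16615).
    page_data_type = 256   # data-page value used by the sas7bdat parser
    page_type = 384        # 0b110000000: data bit (256) plus bit 7 (128)

    old_match = page_type == page_data_type                      # False: page was skipped
    new_match = (page_type & page_data_type) == page_data_type   # True: page is processed

    print(old_match, new_match)  # False True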

pandas/io/sas/sas.pyx (+5, -5)

@@ -244,8 +244,8 @@ cdef class Parser(object):
         self.parser = parser
         self.header_length = self.parser.header_length
         self.column_count = parser.column_count
-        self.lengths = parser._column_data_lengths
-        self.offsets = parser._column_data_offsets
+        self.lengths = parser.column_data_lengths()
+        self.offsets = parser.column_data_offsets()
         self.byte_chunk = parser._byte_chunk
         self.string_chunk = parser._string_chunk
         self.row_length = parser.row_length
@@ -257,7 +257,7 @@ cdef class Parser(object):
         # page indicators
         self.update_next_page()

-        column_types = parser.column_types
+        column_types = parser.column_types()

         # map column types
         for j in range(self.column_count):
@@ -375,7 +375,7 @@ cdef class Parser(object):
                     if done:
                         return True
                 return False
-            elif self.current_page_type == page_data_type:
+            elif self.current_page_type & page_data_type == page_data_type:
                 self.process_byte_array_with_data(
                     bit_offset + subheader_pointers_offset +
                     self.current_row_on_page_index * self.row_length,
@@ -437,7 +437,7 @@ cdef class Parser(object):
             elif column_types[j] == column_type_string:
                 # string
                 string_chunk[js, current_row] = np.array(source[start:(
-                    start + lngt)]).tostring().rstrip()
+                    start + lngt)]).tostring().rstrip(b"\x00 ")
                 js += 1

         self.current_row_on_page_index += 1
pandas/io/sas/sas7bdat.py (+35, -26)

@@ -82,14 +82,15 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
         self.compression = ""
         self.column_names_strings = []
         self.column_names = []
-        self.column_types = []
         self.column_formats = []
         self.columns = []

         self._current_page_data_subheader_pointers = []
         self._cached_page = None
         self._column_data_lengths = []
         self._column_data_offsets = []
+        self._column_types = []
+
         self._current_row_in_file_index = 0
         self._current_row_on_page_index = 0
         self._current_row_in_file_index = 0
@@ -102,6 +103,19 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
         self._get_properties()
         self._parse_metadata()

+    def column_data_lengths(self):
+        """Return a numpy int64 array of the column data lengths"""
+        return np.asarray(self._column_data_lengths, dtype=np.int64)
+
+    def column_data_offsets(self):
+        """Return a numpy int64 array of the column offsets"""
+        return np.asarray(self._column_data_offsets, dtype=np.int64)
+
+    def column_types(self):
+        """Returns a numpy character array of the column types:
+           s (string) or d (double)"""
+        return np.asarray(self._column_types, dtype=np.dtype('S1'))
+
     def close(self):
         try:
             self.handle.close()
@@ -287,8 +301,10 @@ def _process_page_meta(self):
         pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
         if self._current_page_type in pt:
             self._process_page_metadata()
-        return ((self._current_page_type in [256] + const.page_mix_types) or
-                (self._current_page_data_subheader_pointers is not None))
+        is_data_page = self._current_page_type & const.page_data_type
+        is_mix_page = self._current_page_type in const.page_mix_types
+        return (is_data_page or is_mix_page
+                or self._current_page_data_subheader_pointers != [])

     def _read_page_header(self):
         bit_offset = self._page_bit_offset
@@ -503,12 +519,6 @@ def _process_columnattributes_subheader(self, offset, length):
         int_len = self._int_length
         column_attributes_vectors_count = (
             length - 2 * int_len - 12) // (int_len + 8)
-        self.column_types = np.empty(
-            column_attributes_vectors_count, dtype=np.dtype('S1'))
-        self._column_data_lengths = np.empty(
-            column_attributes_vectors_count, dtype=np.int64)
-        self._column_data_offsets = np.empty(
-            column_attributes_vectors_count, dtype=np.int64)
         for i in range(column_attributes_vectors_count):
             col_data_offset = (offset + int_len +
                                const.column_data_offset_offset +
@@ -520,16 +530,13 @@ def _process_columnattributes_subheader(self, offset, length):
                          const.column_type_offset + i * (int_len + 8))

             x = self._read_int(col_data_offset, int_len)
-            self._column_data_offsets[i] = x
+            self._column_data_offsets.append(x)

             x = self._read_int(col_data_len, const.column_data_length_length)
-            self._column_data_lengths[i] = x
+            self._column_data_lengths.append(x)

             x = self._read_int(col_types, const.column_type_length)
-            if x == 1:
-                self.column_types[i] = b'd'
-            else:
-                self.column_types[i] = b's'
+            self._column_types.append(b'd' if x == 1 else b's')

     def _process_columnlist_subheader(self, offset, length):
         # unknown purpose
@@ -586,7 +593,7 @@ def _process_format_subheader(self, offset, length):
         col.name = self.column_names[current_column_number]
         col.label = column_label
         col.format = column_format
-        col.ctype = self.column_types[current_column_number]
+        col.ctype = self._column_types[current_column_number]
         col.length = self._column_data_lengths[current_column_number]

         self.column_formats.append(column_format)
@@ -599,7 +606,7 @@ def read(self, nrows=None):
         elif nrows is None:
             nrows = self.row_count

-        if len(self.column_types) == 0:
+        if len(self._column_types) == 0:
             self.close()
             raise EmptyDataError("No columns to parse from file")

@@ -610,8 +617,8 @@ def read(self, nrows=None):
         if nrows > m:
             nrows = m

-        nd = (self.column_types == b'd').sum()
-        ns = (self.column_types == b's').sum()
+        nd = self._column_types.count(b'd')
+        ns = self._column_types.count(b's')

         self._string_chunk = np.empty((ns, nrows), dtype=np.object)
         self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
@@ -639,11 +646,13 @@ def _read_next_page(self):
                                      self._page_length))

         self._read_page_header()
-        if self._current_page_type == const.page_meta_type:
+        page_type = self._current_page_type
+        if page_type == const.page_meta_type:
             self._process_page_metadata()
-        pt = [const.page_meta_type, const.page_data_type]
-        pt += [const.page_mix_types]
-        if self._current_page_type not in pt:
+
+        is_data_page = page_type & const.page_data_type
+        pt = [const.page_meta_type] + const.page_mix_types
+        if not is_data_page and self._current_page_type not in pt:
             return self._read_next_page()

         return False
@@ -660,7 +669,7 @@ def _chunk_to_dataframe(self):

             name = self.column_names[j]

-            if self.column_types[j] == b'd':
+            if self._column_types[j] == b'd':
                 rslt[name] = self._byte_chunk[jb, :].view(
                     dtype=self.byte_order + 'd')
                 rslt[name] = np.asarray(rslt[name], dtype=np.float64)
@@ -674,7 +683,7 @@ def _chunk_to_dataframe(self):
                 rslt[name] = pd.to_datetime(rslt[name], unit=unit,
                                             origin="1960-01-01")
                 jb += 1
-            elif self.column_types[j] == b's':
+            elif self._column_types[j] == b's':
                 rslt[name] = self._string_chunk[js, :]
                 if self.convert_text and (self.encoding is not None):
                     rslt[name] = rslt[name].str.decode(
@@ -686,6 +695,6 @@ def _chunk_to_dataframe(self):
             else:
                 self.close()
                 raise ValueError("unknown column type %s" %
-                                 self.column_types[j])
+                                 self._column_types[j])

         return rslt
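The three new accessor methods give the Cython parser an array-typed view of the column metadata that is now accumulated in plain Python lists. A usage sketch, assuming a local file named example.sas7bdat (the path is illustrative, not from the commit):

    import pandas as pd

    # iterator=True returns the SAS7BDATReader itself rather than a DataFrame,
    # so the accessors added in this commit can be called directly.
    reader = pd.read_sas("example.sas7bdat", format="sas7bdat", iterator=True)

    print(reader.column_types())         # array of b'd' (double) / b's' (string)
    print(reader.column_data_lengths())  # int64 array of column widths in bytes
    print(reader.column_data_offsets())  # int64 array of offsets within a row

    chunk = reader.read(100)             # first 100 rows as a DataFrame
    reader.close()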
pandas/tests/io/sas/data/load_log.sas7bdat (576 KB)

Binary file not shown.
pandas/tests/io/sas/data/many_columns.csv (+4)

@@ -0,0 +1,4 @@
+DATASRC,PDDOCID,age,agegt89,ASSESSA,ASSESS1,ASSESS3,ASSESS4,ASSESS5,ASSESS6,ASSESS7,week,BECK,conf1,conf2,conf3,demo3,demo4,demo5,demo6,demo7,demo11a,demo11b,demo11c,demo11d,derm1b,derm2,derm3,derm4,derm5a,derm5b,derm7,derm7a,derm7b,derm8,derm9,ECG3,ecgrtxt,ecgrhr,ecgrpr,ecgrqrs,ecgrqrsaxis,ecgrqt,ecgrqtc,ecgrrep,ecgrtime,mmse1,mmse2,mmse3,mmse4,mmse5,mmse6,mmse7,mmse8,mmse9,mmse10,mmse11,mmse12,mmse13,mmse14,mmse15,mmse16,mmse17,mmse18,mmse19,mmse20,mmse,mmsescor,mrf1,mrf2,mrf3,mrf4,mrf5,mrf6,mrf7,mrf8,mrf9,mrf10,mrf11,mrf12,mrf13,nvitl1s,nvitl1d,nvitl1r,nvitl2s,nvitl2d,nvitl2r,nvitl3s,nvitl3d,nvitl3r,nvitl4s,nvitl4d,nvitl4r,nvitl5,nvitl1,nvitl2,nvitl3,nvitl4,phys1,phys1a,phys14,phys15a,phys15b,phys15c,phys15d,phys16a,phys16b,phys16c,phys16d,phys17a,phys17b,phys17c,phys17d,phys18a,phys18b,phys18c,phys18d,phys19a,phys19b,phys20,phys22,phys24,phys26,phys28,PREG1,PREG2,updrsa,updrs1,updrs2,updrs3,updrs4,updrs5a,updrs6a,updrs7a,updrs8a,updrs9a,updrs10a,updrs11a,updrs12a,updrs13a,updrs14a,updrs15a,updrs16a,updrs17a,updrs18a,updrs19a,updrs20a1,updrs20b1,updrs20c1,updrs20d1,updrs20e1,updrs21a1,updrs21b1,updrs22a1,updrs22b1,updrs22c1,updrs22d1,updrs22e1,updrs23a1,updrs23b1,updrs24a1,updrs24b1,updrs25a1,updrs25b1,updrs26a1,updrs26b1,updrs26c1,updrs26d1,updrs27a,updrs28a,updrs29a,updrs30a,updrs31a,updrs32a,updrs33a,updrs34a,updrs35,updrs36,updrs37,updrs38,updrs39,updrs5b,updrs6b,updrs7b,updrs8b,updrs9b,updrs10b,updrs11b,updrs12b,updrs13b,updrs14b,updrs15b,updrs16b,updrs17b,updrs18b,updrs19b,updrs20a2,updrs20b2,updrs20c2,updrs20d2,updrs20e2,updrs21a2,updrs21b2,updrs22a2,updrs22b2,updrs22c2,updrs22d2,updrs22e2,updrs23a2,updrs23b2,updrs24a2,updrs24b2,updrs25a2,updrs25b2,updrs26a2,updrs26b2,updrs26c2,updrs26d2,updrs27b,updrs28b,updrs29b,updrs30b,updrs31b,updrs32b,updrs33b,updrs34b,updrs5c,updrs6c,updrs7c,updrs8c,updrs9c,updrs10c,updrs11c,updrs12c,updrs13c,updrs14c,updrs15c,updrs16c,updrs17c,updrs32c,updrs33c,updrs34c,updrsmental,updrsadl,updrsadlon,updrsadloff,updrsadlmin,updrstremor,updrstremortreat,updrstremormin,updrsrigid,updrsrigidtreat,updrsrigidmin,updrsmotor,updrsmotortreat,updrsmotormin,updrs,updrstrt,updrsmin,updrs4a,updrs41,updrs42,updrs43,updrs44,updrs45,updrs46,updrs47,updrs48,updrs49,updrs410,updrs411,vitl1s,vitl1d,vitl2,vitl3s,vitl3d,vitl4,vitl5,vitl6,assess,fbeck,conf,demo1,derm,ecg,ecgr,mrf,nvitl,fphys1,fpreg,fupdrs,fupdrs4,vitl,site,race,rImaged,rPD,rPDlt5,rAgeGt30,rHY,rMed,rMelanoma,rPreclude,rNeed,rEligible,gender,incsae,incsusp,incterm,increlated,inctermat,increason,incafter24,incendp,incres,disp2,disp3,disp4,disp6,inex1,inex2,inex3,inex4,inex5,inex6,inex7,inex8,inex9,inex10,inex11,inex12,inex13,inex14,inex15,inex16,inex17,inex18,inex19,inex20,inex21,inex22,inex23,inex24,inex25,inex26,inex27,inex28,treatment,treat,disp,inex,classify,enrollyr,demoyear,dob_yr,inexdays,demodays,onsetdays,diagdays,medstartdays,physdays,phys21dys,phys23dys,phys25dys,phys27dys,phys29dys,confdays,pregdays,nvitldays,nvitlscandays,vitldays,labdays,ecgdays,ecgtestdays,mrfdays,dermdays,dermexamdays,dermbiopdays,mmsedays,beckdays,updrdays,updr4days,assessdays,daystotherapy,dispdays,endpdys,termdys,SAEdys,resdys,lmeddys,wddays,VISIT_NO
+a030,ab304,43.0,0.0,0.0,0.0,,,,,,-2.0,0.0,1.0,1.0,,2.0,1.0,19.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,,,,,,0.0,2.0,ABNORMAL,75.0,150.0,100.0,-3.0,410.0,460.0,2.0,1000.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,3.0,5.0,2.0,1.0,1.0,1.0,0.0,3.0,1.0,1.0,1.0,26.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,94.0,73.0,155.0,96.0,71.0,148.0,91.0,69.0,146.0,67.0,72.0,1.0,42840.0,46080.0,46980.0,30600.0,100.0,175.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,2.0,1.0,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.5,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,1.5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.5,95.0,95.0,7.0,,2.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5.0,,,5.0,1.5,,1.5,7.5,,7.5,20.0,,20.0,25.0,,25.0,,,,,,,,,,,,,138.0,86.0,72.0,130.0,80.0,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,abc,1.0,1.0,1.0,0.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,2002.0,1914.0,-28.0,-28.0,-404.0,-28.0,0.0,-28.0,,,,,-6.0,-28.0,-13.0,-13.0,-12.0,-28.0,-28.0,-28.0,-28.0,-28.0,-14.0,-14.0,,-28.0,-28.0,-28.0,,-28.0,,659.0,426.0,659.0,,,658.0,100.0,ab
+a030,ab304,43.0,0.0,0.0,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,2.0,95.0,95.0,7.0,,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,3.0,,,3.0,0.0,,0.0,3.0,,3.0,13.0,,13.0,16.0,,16.0,,,,,,,,,,,,,140.0,86.0,76.0,132.0,80.0,84.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,abc,0.0,0.0,1.0,0.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,,1914.0,-28.0,,,,0.0,,,,,,,,,,,0.0,0.0,,,,,,,,,0.0,,0.0,,659.0,426.0,659.0,,,658.0,100.0,ab
+a030,ab304,43.0,0.0,0.0,0.0,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,0.5,1.0,2.0,90.0,95.0,7.0,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5.0,,,5.0,0.5,,0.5,2.0,,2.0,16.0,,16.0,21.0,,21.0,0.0,,,,,,,,,,,,149.0,88.0,80.0,136.0,90.0,82.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,abc,0.0,0.0,1.0,1.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,,1914.0,-28.0,,,,0.0,,,,,,,,,,,29.0,29.0,,,,,,,,,29.0,29.0,29.0,,659.0,426.0,659.0,,,658.0,100.0,ab
pandas/tests/io/sas/data/many_columns.sas7bdat

Binary file not shown.

pandas/tests/io/sas/test_sas7bdat.py (+16)

@@ -199,6 +199,22 @@ def test_compact_numerical_values(datapath):
     tm.assert_series_equal(result, expected, check_exact=True)


+def test_many_columns(datapath):
+    # Test for looking for column information in more places (PR #22628)
+    fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
+    df = pd.read_sas(fname, encoding='latin-1')
+    fname = datapath("io", "sas", "data", "many_columns.csv")
+    df0 = pd.read_csv(fname, encoding='latin-1')
+    tm.assert_frame_equal(df, df0)
+
+
+def test_inconsistent_number_of_rows(datapath):
+    # Regression test for issue #16615. (PR #22628)
+    fname = datapath("io", "sas", "data", "load_log.sas7bdat")
+    df = pd.read_sas(fname, encoding='latin-1')
+    assert len(df) == 2097
+
+
 def test_zero_variables(datapath):
     # Check if the SAS file has zero variables (PR #18184)
     fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
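For reference, the many_columns fixture exercised by the new test can also be read in chunks rather than all at once. A small sketch; the path is written out only for illustration:

    import pandas as pd

    # chunksize makes read_sas return a reader that yields DataFrames
    # instead of materializing the whole (wide) file at once.
    path = "pandas/tests/io/sas/data/many_columns.sas7bdat"  # illustrative path

    reader = pd.read_sas(path, encoding="latin-1", chunksize=1)
    total_rows = sum(len(chunk) for chunk in reader)
    reader.close()

    print(total_rows)  # the many_columns fixture added here has 3 data rows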
