
Commit 9465a59

troelsaeltanawy authored and committed
BUG: Some sas7bdat files with many columns are not parseable by read_sas (pandas-dev#22628)
Parent: 3ec461f

File tree

7 files changed: +62, -31 lines


doc/source/whatsnew/v0.24.0.txt (+2)

@@ -743,6 +743,8 @@ I/O
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
 - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
 - :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
+- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
+- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`)

 Plotting
 ^^^^^^^^
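The second new entry concerns data pages whose type word carries extra flag bits. Below is a minimal sketch of the page-type test this commit switches to, using the value from issue 16615 (384) and the data-page value 256 used by the sas7bdat parser; the snippet is illustrative and not part of the commit:

    # A data page whose type also has bit 7 set: 128 + 256 = 384 (GH 16615).
    page_data_type = 256   # data-page value used by the sas7bdat parser
    page_type = 384        # 0b110000000: data bit (256) plus bit 7 (128)

    old_match = page_type == page_data_type                      # False: page was skipped
    new_match = (page_type & page_data_type) == page_data_type   # True: page is processed

    print(old_match, new_match)  # False True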

pandas/io/sas/sas.pyx (+5, -5)

@@ -244,8 +244,8 @@ cdef class Parser(object):
         self.parser = parser
         self.header_length = self.parser.header_length
         self.column_count = parser.column_count
-        self.lengths = parser._column_data_lengths
-        self.offsets = parser._column_data_offsets
+        self.lengths = parser.column_data_lengths()
+        self.offsets = parser.column_data_offsets()
         self.byte_chunk = parser._byte_chunk
         self.string_chunk = parser._string_chunk
         self.row_length = parser.row_length
@@ -257,7 +257,7 @@ cdef class Parser(object):
         # page indicators
         self.update_next_page()

-        column_types = parser.column_types
+        column_types = parser.column_types()

         # map column types
         for j in range(self.column_count):
@@ -375,7 +375,7 @@ cdef class Parser(object):
                     if done:
                         return True
                 return False
-            elif self.current_page_type == page_data_type:
+            elif self.current_page_type & page_data_type == page_data_type:
                 self.process_byte_array_with_data(
                     bit_offset + subheader_pointers_offset +
                     self.current_row_on_page_index * self.row_length,
@@ -437,7 +437,7 @@ cdef class Parser(object):
             elif column_types[j] == column_type_string:
                 # string
                 string_chunk[js, current_row] = np.array(source[start:(
-                    start + lngt)]).tostring().rstrip()
+                    start + lngt)]).tostring().rstrip(b"\x00 ")
                 js += 1

         self.current_row_on_page_index += 1
pandas/io/sas/sas7bdat.py (+35, -26)

@@ -82,14 +82,15 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
         self.compression = ""
         self.column_names_strings = []
         self.column_names = []
-        self.column_types = []
         self.column_formats = []
         self.columns = []

         self._current_page_data_subheader_pointers = []
         self._cached_page = None
         self._column_data_lengths = []
         self._column_data_offsets = []
+        self._column_types = []
+
         self._current_row_in_file_index = 0
         self._current_row_on_page_index = 0
         self._current_row_in_file_index = 0
@@ -102,6 +103,19 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
         self._get_properties()
         self._parse_metadata()

+    def column_data_lengths(self):
+        """Return a numpy int64 array of the column data lengths"""
+        return np.asarray(self._column_data_lengths, dtype=np.int64)
+
+    def column_data_offsets(self):
+        """Return a numpy int64 array of the column offsets"""
+        return np.asarray(self._column_data_offsets, dtype=np.int64)
+
+    def column_types(self):
+        """Returns a numpy character array of the column types:
+           s (string) or d (double)"""
+        return np.asarray(self._column_types, dtype=np.dtype('S1'))
+
     def close(self):
         try:
             self.handle.close()
@@ -287,8 +301,10 @@ def _process_page_meta(self):
         pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
         if self._current_page_type in pt:
             self._process_page_metadata()
-        return ((self._current_page_type in [256] + const.page_mix_types) or
-                (self._current_page_data_subheader_pointers is not None))
+        is_data_page = self._current_page_type & const.page_data_type
+        is_mix_page = self._current_page_type in const.page_mix_types
+        return (is_data_page or is_mix_page
+                or self._current_page_data_subheader_pointers != [])

     def _read_page_header(self):
         bit_offset = self._page_bit_offset
@@ -503,12 +519,6 @@ def _process_columnattributes_subheader(self, offset, length):
         int_len = self._int_length
         column_attributes_vectors_count = (
             length - 2 * int_len - 12) // (int_len + 8)
-        self.column_types = np.empty(
-            column_attributes_vectors_count, dtype=np.dtype('S1'))
-        self._column_data_lengths = np.empty(
-            column_attributes_vectors_count, dtype=np.int64)
-        self._column_data_offsets = np.empty(
-            column_attributes_vectors_count, dtype=np.int64)
         for i in range(column_attributes_vectors_count):
             col_data_offset = (offset + int_len +
                                const.column_data_offset_offset +
@@ -520,16 +530,13 @@ def _process_columnattributes_subheader(self, offset, length):
                          const.column_type_offset + i * (int_len + 8))

             x = self._read_int(col_data_offset, int_len)
-            self._column_data_offsets[i] = x
+            self._column_data_offsets.append(x)

             x = self._read_int(col_data_len, const.column_data_length_length)
-            self._column_data_lengths[i] = x
+            self._column_data_lengths.append(x)

             x = self._read_int(col_types, const.column_type_length)
-            if x == 1:
-                self.column_types[i] = b'd'
-            else:
-                self.column_types[i] = b's'
+            self._column_types.append(b'd' if x == 1 else b's')

     def _process_columnlist_subheader(self, offset, length):
         # unknown purpose
@@ -586,7 +593,7 @@ def _process_format_subheader(self, offset, length):
         col.name = self.column_names[current_column_number]
         col.label = column_label
         col.format = column_format
-        col.ctype = self.column_types[current_column_number]
+        col.ctype = self._column_types[current_column_number]
         col.length = self._column_data_lengths[current_column_number]

         self.column_formats.append(column_format)
@@ -599,7 +606,7 @@ def read(self, nrows=None):
         elif nrows is None:
             nrows = self.row_count

-        if len(self.column_types) == 0:
+        if len(self._column_types) == 0:
             self.close()
             raise EmptyDataError("No columns to parse from file")

@@ -610,8 +617,8 @@ def read(self, nrows=None):
         if nrows > m:
             nrows = m

-        nd = (self.column_types == b'd').sum()
-        ns = (self.column_types == b's').sum()
+        nd = self._column_types.count(b'd')
+        ns = self._column_types.count(b's')

         self._string_chunk = np.empty((ns, nrows), dtype=np.object)
         self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
@@ -639,11 +646,13 @@ def _read_next_page(self):
                                      self._page_length))

         self._read_page_header()
-        if self._current_page_type == const.page_meta_type:
+        page_type = self._current_page_type
+        if page_type == const.page_meta_type:
             self._process_page_metadata()
-        pt = [const.page_meta_type, const.page_data_type]
-        pt += [const.page_mix_types]
-        if self._current_page_type not in pt:
+
+        is_data_page = page_type & const.page_data_type
+        pt = [const.page_meta_type] + const.page_mix_types
+        if not is_data_page and self._current_page_type not in pt:
             return self._read_next_page()

         return False
@@ -660,7 +669,7 @@ def _chunk_to_dataframe(self):

             name = self.column_names[j]

-            if self.column_types[j] == b'd':
+            if self._column_types[j] == b'd':
                 rslt[name] = self._byte_chunk[jb, :].view(
                     dtype=self.byte_order + 'd')
                 rslt[name] = np.asarray(rslt[name], dtype=np.float64)
@@ -674,7 +683,7 @@ def _chunk_to_dataframe(self):
                 rslt[name] = pd.to_datetime(rslt[name], unit=unit,
                                             origin="1960-01-01")
                 jb += 1
-            elif self.column_types[j] == b's':
+            elif self._column_types[j] == b's':
                 rslt[name] = self._string_chunk[js, :]
                 if self.convert_text and (self.encoding is not None):
                     rslt[name] = rslt[name].str.decode(
@@ -686,6 +695,6 @@ def _chunk_to_dataframe(self):
             else:
                 self.close()
                 raise ValueError("unknown column type %s" %
-                                 self.column_types[j])
+                                 self._column_types[j])

         return rslt
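The three new accessor methods give the Cython parser an array-typed view of the column metadata that is now accumulated in plain Python lists. A usage sketch, assuming a local file named example.sas7bdat (the path is illustrative, not from the commit):

    import pandas as pd

    # iterator=True returns the SAS7BDATReader itself rather than a DataFrame,
    # so the accessors added in this commit can be called directly.
    reader = pd.read_sas("example.sas7bdat", format="sas7bdat", iterator=True)

    print(reader.column_types())         # array of b'd' (double) / b's' (string)
    print(reader.column_data_lengths())  # int64 array of column widths in bytes
    print(reader.column_data_offsets())  # int64 array of offsets within a row

    chunk = reader.read(100)             # first 100 rows as a DataFrame
    reader.close()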
pandas/tests/io/sas/data/load_log.sas7bdat (576 KB)

Binary file not shown.
pandas/tests/io/sas/data/many_columns.csv (+4)

@@ -0,0 +1,4 @@
+DATASRC,PDDOCID,age,agegt89,ASSESSA,ASSESS1,ASSESS3,ASSESS4,ASSESS5,ASSESS6,ASSESS7,week,BECK,conf1,conf2,conf3,demo3,demo4,demo5,demo6,demo7,demo11a,demo11b,demo11c,demo11d,derm1b,derm2,derm3,derm4,derm5a,derm5b,derm7,derm7a,derm7b,derm8,derm9,ECG3,ecgrtxt,ecgrhr,ecgrpr,ecgrqrs,ecgrqrsaxis,ecgrqt,ecgrqtc,ecgrrep,ecgrtime,mmse1,mmse2,mmse3,mmse4,mmse5,mmse6,mmse7,mmse8,mmse9,mmse10,mmse11,mmse12,mmse13,mmse14,mmse15,mmse16,mmse17,mmse18,mmse19,mmse20,mmse,mmsescor,mrf1,mrf2,mrf3,mrf4,mrf5,mrf6,mrf7,mrf8,mrf9,mrf10,mrf11,mrf12,mrf13,nvitl1s,nvitl1d,nvitl1r,nvitl2s,nvitl2d,nvitl2r,nvitl3s,nvitl3d,nvitl3r,nvitl4s,nvitl4d,nvitl4r,nvitl5,nvitl1,nvitl2,nvitl3,nvitl4,phys1,phys1a,phys14,phys15a,phys15b,phys15c,phys15d,phys16a,phys16b,phys16c,phys16d,phys17a,phys17b,phys17c,phys17d,phys18a,phys18b,phys18c,phys18d,phys19a,phys19b,phys20,phys22,phys24,phys26,phys28,PREG1,PREG2,updrsa,updrs1,updrs2,updrs3,updrs4,updrs5a,updrs6a,updrs7a,updrs8a,updrs9a,updrs10a,updrs11a,updrs12a,updrs13a,updrs14a,updrs15a,updrs16a,updrs17a,updrs18a,updrs19a,updrs20a1,updrs20b1,updrs20c1,updrs20d1,updrs20e1,updrs21a1,updrs21b1,updrs22a1,updrs22b1,updrs22c1,updrs22d1,updrs22e1,updrs23a1,updrs23b1,updrs24a1,updrs24b1,updrs25a1,updrs25b1,updrs26a1,updrs26b1,updrs26c1,updrs26d1,updrs27a,updrs28a,updrs29a,updrs30a,updrs31a,updrs32a,updrs33a,updrs34a,updrs35,updrs36,updrs37,updrs38,updrs39,updrs5b,updrs6b,updrs7b,updrs8b,updrs9b,updrs10b,updrs11b,updrs12b,updrs13b,updrs14b,updrs15b,updrs16b,updrs17b,updrs18b,updrs19b,updrs20a2,updrs20b2,updrs20c2,updrs20d2,updrs20e2,updrs21a2,updrs21b2,updrs22a2,updrs22b2,updrs22c2,updrs22d2,updrs22e2,updrs23a2,updrs23b2,updrs24a2,updrs24b2,updrs25a2,updrs25b2,updrs26a2,updrs26b2,updrs26c2,updrs26d2,updrs27b,updrs28b,updrs29b,updrs30b,updrs31b,updrs32b,updrs33b,updrs34b,updrs5c,updrs6c,updrs7c,updrs8c,updrs9c,updrs10c,updrs11c,updrs12c,updrs13c,updrs14c,updrs15c,updrs16c,updrs17c,updrs32c,updrs33c,updrs34c,updrsmental,updrsadl,updrsadlon,updrsadloff,updrsadlmin,updrstremor,updrstremortreat,updrstremormin,updrsrigid,updrsrigidtreat,updrsrigidmin,updrsmotor,updrsmotortreat,updrsmotormin,updrs,updrstrt,updrsmin,updrs4a,updrs41,updrs42,updrs43,updrs44,updrs45,updrs46,updrs47,updrs48,updrs49,updrs410,updrs411,vitl1s,vitl1d,vitl2,vitl3s,vitl3d,vitl4,vitl5,vitl6,assess,fbeck,conf,demo1,derm,ecg,ecgr,mrf,nvitl,fphys1,fpreg,fupdrs,fupdrs4,vitl,site,race,rImaged,rPD,rPDlt5,rAgeGt30,rHY,rMed,rMelanoma,rPreclude,rNeed,rEligible,gender,incsae,incsusp,incterm,increlated,inctermat,increason,incafter24,incendp,incres,disp2,disp3,disp4,disp6,inex1,inex2,inex3,inex4,inex5,inex6,inex7,inex8,inex9,inex10,inex11,inex12,inex13,inex14,inex15,inex16,inex17,inex18,inex19,inex20,inex21,inex22,inex23,inex24,inex25,inex26,inex27,inex28,treatment,treat,disp,inex,classify,enrollyr,demoyear,dob_yr,inexdays,demodays,onsetdays,diagdays,medstartdays,physdays,phys21dys,phys23dys,phys25dys,phys27dys,phys29dys,confdays,pregdays,nvitldays,nvitlscandays,vitldays,labdays,ecgdays,ecgtestdays,mrfdays,dermdays,dermexamdays,dermbiopdays,mmsedays,beckdays,updrdays,updr4days,assessdays,daystotherapy,dispdays,endpdys,termdys,SAEdys,resdys,lmeddys,wddays,VISIT_NO
+a030,ab304,43.0,0.0,0.0,0.0,,,,,,-2.0,0.0,1.0,1.0,,2.0,1.0,19.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,,,,,,0.0,2.0,ABNORMAL,75.0,150.0,100.0,-3.0,410.0,460.0,2.0,1000.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,3.0,5.0,2.0,1.0,1.0,1.0,0.0,3.0,1.0,1.0,1.0,26.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,94.0,73.0,155.0,96.0,71.0,148.0,91.0,69.0,146.0,67.0,72.0,1.0,42840.0,46080.0,46980.0,30600.0,100.0,175.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,2.0,1.0,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.5,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,1.5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.5,95.0,95.0,7.0,,2.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5.0,,,5.0,1.5,,1.5,7.5,,7.5,20.0,,20.0,25.0,,25.0,,,,,,,,,,,,,138.0,86.0,72.0,130.0,80.0,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,abc,1.0,1.0,1.0,0.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,2002.0,1914.0,-28.0,-28.0,-404.0,-28.0,0.0,-28.0,,,,,-6.0,-28.0,-13.0,-13.0,-12.0,-28.0,-28.0,-28.0,-28.0,-28.0,-14.0,-14.0,,-28.0,-28.0,-28.0,,-28.0,,659.0,426.0,659.0,,,658.0,100.0,ab
+a030,ab304,43.0,0.0,0.0,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,2.0,95.0,95.0,7.0,,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,3.0,,,3.0,0.0,,0.0,3.0,,3.0,13.0,,13.0,16.0,,16.0,,,,,,,,,,,,,140.0,86.0,76.0,132.0,80.0,84.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,abc,0.0,0.0,1.0,0.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,,1914.0,-28.0,,,,0.0,,,,,,,,,,,0.0,0.0,,,,,,,,,0.0,,0.0,,659.0,426.0,659.0,,,658.0,100.0,ab
+a030,ab304,43.0,0.0,0.0,0.0,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,0.5,1.0,2.0,90.0,95.0,7.0,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5.0,,,5.0,0.5,,0.5,2.0,,2.0,16.0,,16.0,21.0,,21.0,0.0,,,,,,,,,,,,149.0,88.0,80.0,136.0,90.0,82.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,abc,0.0,0.0,1.0,1.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,,1914.0,-28.0,,,,0.0,,,,,,,,,,,29.0,29.0,,,,,,,,,29.0,29.0,29.0,,659.0,426.0,659.0,,,658.0,100.0,ab
pandas/tests/io/sas/data/many_columns.sas7bdat

Binary file not shown.

pandas/tests/io/sas/test_sas7bdat.py (+16)

@@ -199,6 +199,22 @@ def test_compact_numerical_values(datapath):
     tm.assert_series_equal(result, expected, check_exact=True)


+def test_many_columns(datapath):
+    # Test for looking for column information in more places (PR #22628)
+    fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
+    df = pd.read_sas(fname, encoding='latin-1')
+    fname = datapath("io", "sas", "data", "many_columns.csv")
+    df0 = pd.read_csv(fname, encoding='latin-1')
+    tm.assert_frame_equal(df, df0)
+
+
+def test_inconsistent_number_of_rows(datapath):
+    # Regression test for issue #16615. (PR #22628)
+    fname = datapath("io", "sas", "data", "load_log.sas7bdat")
+    df = pd.read_sas(fname, encoding='latin-1')
+    assert len(df) == 2097
+
+
 def test_zero_variables(datapath):
     # Check if the SAS file has zero variables (PR #18184)
     fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
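For reference, the many_columns fixture exercised by the new test can also be read in chunks rather than all at once. A small sketch; the path is written out only for illustration:

    import pandas as pd

    # chunksize makes read_sas return a reader that yields DataFrames
    # instead of materializing the whole (wide) file at once.
    path = "pandas/tests/io/sas/data/many_columns.sas7bdat"  # illustrative path

    reader = pd.read_sas(path, encoding="latin-1", chunksize=1)
    total_rows = sum(len(chunk) for chunk in reader)
    reader.close()

    print(total_rows)  # the many_columns fixture added here has 3 data rows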
