ENH: add truncated float support to read_sas, #11713

kshedden · jreback · commit 547750aa5ba5 · 2015-12-01T10:13:50.000-05:00
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -31,6 +31,8 @@ New features
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
+- Handle truncated floats in SAS xport files (:issue:`11713`)
+
 .. _whatsnew_0180.enhancements.rounding:
 
 Datetimelike rounding
diff --git a/pandas/io/sas.py b/pandas/io/sas.py
@@ -1,5 +1,5 @@
 """
-Tools for reading SAS XPort files into Pandas objects.
+Read a SAS XPort format file into a Pandas DataFrame.
 
 Based on code from Jack Cushman (github.com/jcushman/xport).
 
@@ -25,10 +25,6 @@
               'nifl', 'nifd', 'npos', '_']
 
 
-# TODO: Support for 4 byte floats, see https://github.com/jcushman/xport/pull/3
-# Need a test file
-
-
 _base_params_doc = """\
 Parameters
 ----------
@@ -161,15 +157,33 @@ def _split_line(s, parts):
     return out
 
 
+def _handle_truncated_float_vec(vec, nbytes):
+    # Widen a vector of `nbytes`-wide raw float fields to 8 bytes each.
+    # Some SAS XPORT files store 2-7 byte "truncated" floats (a poorly
+    # documented feature); padding them with zero bytes on the right
+    # yields ordinary 8 byte floats.  8 byte input is returned as-is.
+    # References:
+    # https://github.com/jcushman/xport/pull/3
+    # The R "foreign" library
+
+    if nbytes != 8:
+        vec1 = np.zeros(len(vec), np.dtype('S8'))  # all-zero 8 byte slots
+        dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes))  # data, padding
+        vec2 = vec1.view(dtype=dtype)
+        vec2['f0'] = vec  # copy the stored bytes; the 'f1' tail stays zero
+        return vec2
+
+    return vec
+
+
 def _parse_float_vec(vec):
     """
-    Parse a vector of 8-byte values representing IBM 8 byte floats
-    into native 8 byte floats.
+    Parse a vector of float values representing IBM 8 byte floats into
+    native 8 byte floats.
     """
 
     dtype = np.dtype('>u4,>u4')
     vec1 = vec.view(dtype=dtype)
-
     xport1 = vec1['f0']
     xport2 = vec1['f1']
 
@@ -266,7 +280,8 @@ def _read_header(self):
             raise ValueError("Header record is not an XPORT file.")
 
         line2 = self._get_row()
-        file_info = _split_line(line2, [ ['prefix',24], ['version',8], ['OS',8], ['_',24], ['created',16]])
+        file_info = _split_line(line2, [['prefix', 24], ['version', 8], ['OS', 8],
+                                        ['_', 24], ['created', 16]])
         if file_info['prefix'] != "SAS     SAS     SASLIB":
             raise ValueError("Header record has invalid prefix.")
         file_info['created'] = _parse_date(file_info['created'])
@@ -283,11 +298,11 @@ def _read_header(self):
         fieldnamelength = int(header1[-5:-2]) # usually 140, could be 135
 
         # member info
-        member_info = _split_line(self._get_row(), [['prefix',8], ['set_name',8],
-                                                    ['sasdata',8],['version',8],
-                                                    ['OS',8],['_',24],['created',16]])
-        member_info.update( _split_line(self._get_row(), [['modified',16], ['_',16],
-                                                          ['label',40],['type',8]]))
+        member_info = _split_line(self._get_row(), [['prefix', 8], ['set_name', 8],
+                                                    ['sasdata', 8],['version', 8],
+                                                    ['OS', 8],['_', 24],['created', 16]])
+        member_info.update( _split_line(self._get_row(), [['modified', 16], ['_', 16],
+                                                          ['label', 40],['type', 8]]))
         member_info['modified'] = _parse_date(member_info['modified'])
         member_info['created'] = _parse_date(member_info['created'])
         self.member_info = member_info
@@ -313,8 +328,9 @@ def _read_header(self):
             field = dict(zip(_fieldkeys, fieldstruct))
             del field['_']
             field['ntype'] = types[field['ntype']]
-            if field['ntype'] == 'numeric' and field['field_length'] != 8:
-                raise TypeError("Only 8-byte floats are currently implemented. Can't read field %s." % field)
+            fl = field['field_length']
+            if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
+                raise TypeError("Floating point field width %d is not between 2 and 8." % fl)
 
             for k, v in field.items():
                 try:
@@ -339,11 +355,7 @@ def _read_header(self):
         # Setup the dtype.
         dtypel = []
         for i,field in enumerate(self.fields):
-            ntype = field['ntype']
-            if ntype == "numeric":
-                dtypel.append(('s' + str(i), ">u8"))
-            elif ntype == "char":
-                dtypel.append(('s' + str(i), "S" + str(field['field_length'])))
+            dtypel.append(('s' + str(i), "S" + str(field['field_length'])))
         dtype = np.dtype(dtypel)
         self._dtype = dtype
 
@@ -416,8 +428,8 @@ def get_chunk(self, size=None):
     def _missing_double(self, vec):
         v = vec.view(dtype='u1,u1,u2,u4')
         miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0)
-        miss1 = ((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |\
-                (v['f0'] == 0x5f) | (v['f0'] == 0x2e)
+        miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |
+                 (v['f0'] == 0x5f) | (v['f0'] == 0x2e))
         miss &= miss1
         return miss
 
@@ -440,6 +452,7 @@ def read(self, nrows=None):
             vec = data['s%d' % j]
             ntype = self.fields[j]['ntype']
             if ntype == "numeric":
+                vec = _handle_truncated_float_vec(vec, self.fields[j]['field_length'])
                 miss = self._missing_double(vec)
                 v = _parse_float_vec(vec)
                 v[miss] = np.nan
diff --git a/pandas/io/tests/data/paxraw_d_short.csv b/pandas/io/tests/data/paxraw_d_short.csv
@@ -0,0 +1,101 @@
+SEQN,PAXSTAT,PAXCAL,PAXDAY,PAXN,PAXHOUR,PAXMINUT,PAXINTEN,PAXSTEP
+31128,1,1,1,1,0,0,166,4
+31128,1,1,1,2,0,1,27,0
+31128,1,1,1,3,0,2,0,0
+31128,1,1,1,4,0,3,276,4
+31128,1,1,1,5,0,4,0,0
+31128,1,1,1,6,0,5,0,0
+31128,1,1,1,7,0,6,0,0
+31128,1,1,1,8,0,7,0,0
+31128,1,1,1,9,0,8,0,0
+31128,1,1,1,10,0,9,0,0
+31128,1,1,1,11,0,10,0,0
+31128,1,1,1,12,0,11,0,0
+31128,1,1,1,13,0,12,0,0
+31128,1,1,1,14,0,13,0,0
+31128,1,1,1,15,0,14,0,0
+31128,1,1,1,16,0,15,0,0
+31128,1,1,1,17,0,16,0,0
+31128,1,1,1,18,0,17,0,0
+31128,1,1,1,19,0,18,0,0
+31128,1,1,1,20,0,19,0,0
+31128,1,1,1,21,0,20,260,3
+31128,1,1,1,22,0,21,0,0
+31128,1,1,1,23,0,22,0,0
+31128,1,1,1,24,0,23,19,0
+31128,1,1,1,25,0,24,34,1
+31128,1,1,1,26,0,25,47,4
+31128,1,1,1,27,0,26,4,0
+31128,1,1,1,28,0,27,11,0
+31128,1,1,1,29,0,28,48,1
+31128,1,1,1,30,0,29,58,3
+31128,1,1,1,31,0,30,32,2
+31128,1,1,1,32,0,31,15,1
+31128,1,1,1,33,0,32,117,3
+31128,1,1,1,34,0,33,24,0
+31128,1,1,1,35,0,34,61,7
+31128,1,1,1,36,0,35,115,12
+31128,1,1,1,37,0,36,183,11
+31128,1,1,1,38,0,37,68,5
+31128,1,1,1,39,0,38,73,3
+31128,1,1,1,40,0,39,93,7
+31128,1,1,1,41,0,40,201,14
+31128,1,1,1,42,0,41,126,6
+31128,1,1,1,43,0,42,61,4
+31128,1,1,1,44,0,43,97,7
+31128,1,1,1,45,0,44,62,3
+31128,1,1,1,46,0,45,77,10
+31128,1,1,1,47,0,46,105,8
+31128,1,1,1,48,0,47,209,12
+31128,1,1,1,49,0,48,72,4
+31128,1,1,1,50,0,49,50,1
+31128,1,1,1,51,0,50,324,7
+31128,1,1,1,52,0,51,582,16
+31128,1,1,1,53,0,52,387,31
+31128,1,1,1,54,0,53,780,54
+31128,1,1,1,55,0,54,618,10
+31128,1,1,1,56,0,55,0,0
+31128,1,1,1,57,0,56,0,0
+31128,1,1,1,58,0,57,0,0
+31128,1,1,1,59,0,58,123,1
+31128,1,1,1,60,0,59,0,0
+31128,1,1,1,61,1,0,0,0
+31128,1,1,1,62,1,1,0,0
+31128,1,1,1,63,1,2,0,0
+31128,1,1,1,64,1,3,0,0
+31128,1,1,1,65,1,4,0,0
+31128,1,1,1,66,1,5,0,0
+31128,1,1,1,67,1,6,0,0
+31128,1,1,1,68,1,7,0,0
+31128,1,1,1,69,1,8,0,0
+31128,1,1,1,70,1,9,0,0
+31128,1,1,1,71,1,10,0,0
+31128,1,1,1,72,1,11,0,0
+31128,1,1,1,73,1,12,0,0
+31128,1,1,1,74,1,13,0,0
+31128,1,1,1,75,1,14,0,0
+31128,1,1,1,76,1,15,0,0
+31128,1,1,1,77,1,16,0,0
+31128,1,1,1,78,1,17,0,0
+31128,1,1,1,79,1,18,0,0
+31128,1,1,1,80,1,19,0,0
+31128,1,1,1,81,1,20,0,0
+31128,1,1,1,82,1,21,0,0
+31128,1,1,1,83,1,22,0,0
+31128,1,1,1,84,1,23,0,0
+31128,1,1,1,85,1,24,0,0
+31128,1,1,1,86,1,25,0,0
+31128,1,1,1,87,1,26,0,0
+31128,1,1,1,88,1,27,0,0
+31128,1,1,1,89,1,28,0,0
+31128,1,1,1,90,1,29,0,0
+31128,1,1,1,91,1,30,0,0
+31128,1,1,1,92,1,31,0,0
+31128,1,1,1,93,1,32,0,0
+31128,1,1,1,94,1,33,0,0
+31128,1,1,1,95,1,34,2,0
+31128,1,1,1,96,1,35,0,0
+31128,1,1,1,97,1,36,0,0
+31128,1,1,1,98,1,37,0,0
+31128,1,1,1,99,1,38,0,0
+31128,1,1,1,100,1,39,0,0
diff --git a/pandas/io/tests/data/paxraw_d_short.xpt b/pandas/io/tests/data/paxraw_d_short.xpt
diff --git a/pandas/io/tests/test_sas.py b/pandas/io/tests/test_sas.py
@@ -22,9 +22,10 @@ def setUp(self):
         self.file01 = os.path.join(self.dirpath, "DEMO_G.XPT")
         self.file02 = os.path.join(self.dirpath, "SSHSV1_A.XPT")
         self.file03 = os.path.join(self.dirpath, "DRXFCD_G.XPT")
+        self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt")
 
 
-    def test1(self):
+    def test1_basic(self):
         # Tests with DEMO_G.XPT (all numeric file)
 
         # Compare to this
@@ -99,7 +100,7 @@ def test2(self):
         tm.assert_frame_equal(data, data_csv)
 
 
-    def test3(self):
+    def test_multiple_types(self):
         # Test with DRXFCD_G.XPT (contains text and numeric variables)
 
         # Compare to this
@@ -110,3 +111,19 @@ def test3(self):
 
         data = read_sas(self.file03)
         tm.assert_frame_equal(data, data_csv)
+
+
+    def test_truncated_float_support(self):
+        # Test with paxraw_d_short.xpt, a shortened version of:
+        # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP
+        # This file has truncated floats (5 bytes in this case).
+        # The CSV holds only integers, so the parsed frame is cast to
+        # int64 before comparing.  Both entry points are checked.  GH 11713
+
+        data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv"))
+
+        data = XportReader(self.file04).read()
+        tm.assert_frame_equal(data.astype('int64'), data_csv)
+
+        data = read_sas(self.file04)
+        tm.assert_frame_equal(data.astype('int64'), data_csv)