Skip to content

Commit 547750a

Browse files
kshedden authored and jreback committed
ENH: add truncated float support to read_sas, #11713
1 parent 3c4a34e commit 547750a

File tree

5 files changed

+158
-25
lines changed

5 files changed

+158
-25
lines changed

doc/source/whatsnew/v0.18.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ New features
3131
Other enhancements
3232
^^^^^^^^^^^^^^^^^^
3333

34+
- Handle truncated floats in SAS xport files (:issue:`11713`)
35+
3436
.. _whatsnew_0180.enhancements.rounding:
3537

3638
Datetimelike rounding

pandas/io/sas.py

+36-23
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Tools for reading SAS XPort files into Pandas objects.
2+
Read a SAS XPort format file into a Pandas DataFrame.
33
44
Based on code from Jack Cushman (github.com/jcushman/xport).
55
@@ -25,10 +25,6 @@
2525
'nifl', 'nifd', 'npos', '_']
2626

2727

28-
# TODO: Support for 4 byte floats, see https://github.com/jcushman/xport/pull/3
29-
# Need a test file
30-
31-
3228
_base_params_doc = """\
3329
Parameters
3430
----------
@@ -161,15 +157,33 @@ def _split_line(s, parts):
161157
return out
162158

163159

160+
def _handle_truncated_float_vec(vec, nbytes):
161+
# This feature is not well documented, but some SAS XPORT files
162+
# have 2-7 byte "truncated" floats. To read these truncated
163+
# floats, pad them with zeros on the right to make 8 byte floats.
164+
#
165+
# References:
166+
# https://github.com/jcushman/xport/pull/3
167+
# The R "foreign" library
168+
169+
if nbytes != 8:
170+
vec1 = np.zeros(len(vec), np.dtype('S8'))
171+
dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes))
172+
vec2 = vec1.view(dtype=dtype)
173+
vec2['f0'] = vec
174+
return vec2
175+
176+
return vec
177+
178+
164179
def _parse_float_vec(vec):
165180
"""
166-
Parse a vector of 8-byte values representing IBM 8 byte floats
167-
into native 8 byte floats.
181+
Parse a vector of float values representing IBM 8 byte floats into
182+
native 8 byte floats.
168183
"""
169184

170185
dtype = np.dtype('>u4,>u4')
171186
vec1 = vec.view(dtype=dtype)
172-
173187
xport1 = vec1['f0']
174188
xport2 = vec1['f1']
175189

@@ -266,7 +280,8 @@ def _read_header(self):
266280
raise ValueError("Header record is not an XPORT file.")
267281

268282
line2 = self._get_row()
269-
file_info = _split_line(line2, [ ['prefix',24], ['version',8], ['OS',8], ['_',24], ['created',16]])
283+
file_info = _split_line(line2, [['prefix', 24], ['version', 8], ['OS', 8],
284+
['_', 24], ['created', 16]])
270285
if file_info['prefix'] != "SAS SAS SASLIB":
271286
raise ValueError("Header record has invalid prefix.")
272287
file_info['created'] = _parse_date(file_info['created'])
@@ -283,11 +298,11 @@ def _read_header(self):
283298
fieldnamelength = int(header1[-5:-2]) # usually 140, could be 135
284299

285300
# member info
286-
member_info = _split_line(self._get_row(), [['prefix',8], ['set_name',8],
287-
['sasdata',8],['version',8],
288-
['OS',8],['_',24],['created',16]])
289-
member_info.update( _split_line(self._get_row(), [['modified',16], ['_',16],
290-
['label',40],['type',8]]))
301+
member_info = _split_line(self._get_row(), [['prefix', 8], ['set_name', 8],
302+
['sasdata', 8],['version', 8],
303+
['OS', 8],['_', 24],['created', 16]])
304+
member_info.update( _split_line(self._get_row(), [['modified', 16], ['_', 16],
305+
['label', 40],['type', 8]]))
291306
member_info['modified'] = _parse_date(member_info['modified'])
292307
member_info['created'] = _parse_date(member_info['created'])
293308
self.member_info = member_info
@@ -313,8 +328,9 @@ def _read_header(self):
313328
field = dict(zip(_fieldkeys, fieldstruct))
314329
del field['_']
315330
field['ntype'] = types[field['ntype']]
316-
if field['ntype'] == 'numeric' and field['field_length'] != 8:
317-
raise TypeError("Only 8-byte floats are currently implemented. Can't read field %s." % field)
331+
fl = field['field_length']
# Numeric fields may be stored as 2-8 byte (possibly truncated) floats;
# any other width cannot be decoded, so reject it with the actual width.
if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
    raise TypeError("Floating point field width %d is not between "
                    "2 and 8." % fl)
318334

319335
for k, v in field.items():
320336
try:
@@ -339,11 +355,7 @@ def _read_header(self):
339355
# Setup the dtype.
340356
dtypel = []
341357
for i,field in enumerate(self.fields):
342-
ntype = field['ntype']
343-
if ntype == "numeric":
344-
dtypel.append(('s' + str(i), ">u8"))
345-
elif ntype == "char":
346-
dtypel.append(('s' + str(i), "S" + str(field['field_length'])))
358+
dtypel.append(('s' + str(i), "S" + str(field['field_length'])))
347359
dtype = np.dtype(dtypel)
348360
self._dtype = dtype
349361

@@ -416,8 +428,8 @@ def get_chunk(self, size=None):
416428
def _missing_double(self, vec):
417429
v = vec.view(dtype='u1,u1,u2,u4')
418430
miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0)
419-
miss1 = ((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |\
420-
(v['f0'] == 0x5f) | (v['f0'] == 0x2e)
431+
miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |
432+
(v['f0'] == 0x5f) | (v['f0'] == 0x2e))
421433
miss &= miss1
422434
return miss
423435

@@ -440,6 +452,7 @@ def read(self, nrows=None):
440452
vec = data['s%d' % j]
441453
ntype = self.fields[j]['ntype']
442454
if ntype == "numeric":
455+
vec = _handle_truncated_float_vec(vec, self.fields[j]['field_length'])
443456
miss = self._missing_double(vec)
444457
v = _parse_float_vec(vec)
445458
v[miss] = np.nan
+101
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
SEQN,PAXSTAT,PAXCAL,PAXDAY,PAXN,PAXHOUR,PAXMINUT,PAXINTEN,PAXSTEP
2+
31128,1,1,1,1,0,0,166,4
3+
31128,1,1,1,2,0,1,27,0
4+
31128,1,1,1,3,0,2,0,0
5+
31128,1,1,1,4,0,3,276,4
6+
31128,1,1,1,5,0,4,0,0
7+
31128,1,1,1,6,0,5,0,0
8+
31128,1,1,1,7,0,6,0,0
9+
31128,1,1,1,8,0,7,0,0
10+
31128,1,1,1,9,0,8,0,0
11+
31128,1,1,1,10,0,9,0,0
12+
31128,1,1,1,11,0,10,0,0
13+
31128,1,1,1,12,0,11,0,0
14+
31128,1,1,1,13,0,12,0,0
15+
31128,1,1,1,14,0,13,0,0
16+
31128,1,1,1,15,0,14,0,0
17+
31128,1,1,1,16,0,15,0,0
18+
31128,1,1,1,17,0,16,0,0
19+
31128,1,1,1,18,0,17,0,0
20+
31128,1,1,1,19,0,18,0,0
21+
31128,1,1,1,20,0,19,0,0
22+
31128,1,1,1,21,0,20,260,3
23+
31128,1,1,1,22,0,21,0,0
24+
31128,1,1,1,23,0,22,0,0
25+
31128,1,1,1,24,0,23,19,0
26+
31128,1,1,1,25,0,24,34,1
27+
31128,1,1,1,26,0,25,47,4
28+
31128,1,1,1,27,0,26,4,0
29+
31128,1,1,1,28,0,27,11,0
30+
31128,1,1,1,29,0,28,48,1
31+
31128,1,1,1,30,0,29,58,3
32+
31128,1,1,1,31,0,30,32,2
33+
31128,1,1,1,32,0,31,15,1
34+
31128,1,1,1,33,0,32,117,3
35+
31128,1,1,1,34,0,33,24,0
36+
31128,1,1,1,35,0,34,61,7
37+
31128,1,1,1,36,0,35,115,12
38+
31128,1,1,1,37,0,36,183,11
39+
31128,1,1,1,38,0,37,68,5
40+
31128,1,1,1,39,0,38,73,3
41+
31128,1,1,1,40,0,39,93,7
42+
31128,1,1,1,41,0,40,201,14
43+
31128,1,1,1,42,0,41,126,6
44+
31128,1,1,1,43,0,42,61,4
45+
31128,1,1,1,44,0,43,97,7
46+
31128,1,1,1,45,0,44,62,3
47+
31128,1,1,1,46,0,45,77,10
48+
31128,1,1,1,47,0,46,105,8
49+
31128,1,1,1,48,0,47,209,12
50+
31128,1,1,1,49,0,48,72,4
51+
31128,1,1,1,50,0,49,50,1
52+
31128,1,1,1,51,0,50,324,7
53+
31128,1,1,1,52,0,51,582,16
54+
31128,1,1,1,53,0,52,387,31
55+
31128,1,1,1,54,0,53,780,54
56+
31128,1,1,1,55,0,54,618,10
57+
31128,1,1,1,56,0,55,0,0
58+
31128,1,1,1,57,0,56,0,0
59+
31128,1,1,1,58,0,57,0,0
60+
31128,1,1,1,59,0,58,123,1
61+
31128,1,1,1,60,0,59,0,0
62+
31128,1,1,1,61,1,0,0,0
63+
31128,1,1,1,62,1,1,0,0
64+
31128,1,1,1,63,1,2,0,0
65+
31128,1,1,1,64,1,3,0,0
66+
31128,1,1,1,65,1,4,0,0
67+
31128,1,1,1,66,1,5,0,0
68+
31128,1,1,1,67,1,6,0,0
69+
31128,1,1,1,68,1,7,0,0
70+
31128,1,1,1,69,1,8,0,0
71+
31128,1,1,1,70,1,9,0,0
72+
31128,1,1,1,71,1,10,0,0
73+
31128,1,1,1,72,1,11,0,0
74+
31128,1,1,1,73,1,12,0,0
75+
31128,1,1,1,74,1,13,0,0
76+
31128,1,1,1,75,1,14,0,0
77+
31128,1,1,1,76,1,15,0,0
78+
31128,1,1,1,77,1,16,0,0
79+
31128,1,1,1,78,1,17,0,0
80+
31128,1,1,1,79,1,18,0,0
81+
31128,1,1,1,80,1,19,0,0
82+
31128,1,1,1,81,1,20,0,0
83+
31128,1,1,1,82,1,21,0,0
84+
31128,1,1,1,83,1,22,0,0
85+
31128,1,1,1,84,1,23,0,0
86+
31128,1,1,1,85,1,24,0,0
87+
31128,1,1,1,86,1,25,0,0
88+
31128,1,1,1,87,1,26,0,0
89+
31128,1,1,1,88,1,27,0,0
90+
31128,1,1,1,89,1,28,0,0
91+
31128,1,1,1,90,1,29,0,0
92+
31128,1,1,1,91,1,30,0,0
93+
31128,1,1,1,92,1,31,0,0
94+
31128,1,1,1,93,1,32,0,0
95+
31128,1,1,1,94,1,33,0,0
96+
31128,1,1,1,95,1,34,2,0
97+
31128,1,1,1,96,1,35,0,0
98+
31128,1,1,1,97,1,36,0,0
99+
31128,1,1,1,98,1,37,0,0
100+
31128,1,1,1,99,1,38,0,0
101+
31128,1,1,1,100,1,39,0,0
6.8 KB
Binary file not shown.

pandas/io/tests/test_sas.py

+19-2
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@ def setUp(self):
2222
self.file01 = os.path.join(self.dirpath, "DEMO_G.XPT")
2323
self.file02 = os.path.join(self.dirpath, "SSHSV1_A.XPT")
2424
self.file03 = os.path.join(self.dirpath, "DRXFCD_G.XPT")
25+
self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt")
2526

2627

27-
def test1(self):
28+
def test1_basic(self):
2829
# Tests with DEMO_G.XPT (all numeric file)
2930

3031
# Compare to this
@@ -99,7 +100,7 @@ def test2(self):
99100
tm.assert_frame_equal(data, data_csv)
100101

101102

102-
def test3(self):
103+
def test_multiple_types(self):
103104
# Test with DRXFCD_G.XPT (contains text and numeric variables)
104105

105106
# Compare to this
@@ -110,3 +111,19 @@ def test3(self):
110111

111112
data = read_sas(self.file03)
112113
tm.assert_frame_equal(data, data_csv)
114+
115+
116+
def test_truncated_float_support(self):
    """
    Check that an XPORT file with truncated floats is read correctly.

    Uses paxraw_d_short.xpt, a shortened version of
    http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP
    whose floats are truncated (5 bytes in this case).  GH 11713.
    """
    xpt_path = self.file04
    expected = pd.read_csv(xpt_path.replace(".xpt", ".csv"))

    # Exercise the reader class directly ...
    from_reader = XportReader(xpt_path).read()
    tm.assert_frame_equal(from_reader.astype('int64'), expected)

    # ... and then the top-level convenience function.
    from_read_sas = read_sas(xpt_path)
    tm.assert_frame_equal(from_read_sas.astype('int64'), expected)

0 commit comments

Comments
 (0)