Skip to content

Commit 703834c

Browse files
committed
[ENH] Added support for reading Stata formats 104, 105 and 108
1 parent 9a71737 commit 703834c

File tree

1 file changed

+50
-18
lines changed

1 file changed

+50
-18
lines changed

pandas/io/parsers.py

+50-18
Original file line numberDiff line numberDiff line change
@@ -2491,6 +2491,13 @@ def __init__(self):
24912491
'd': (-1.798e+308, +8.988e+307)
24922492
}
24932493

2494+
self.OLD_TYPE_MAPPING = \
2495+
{
2496+
'i': 252,
2497+
'f': 254,
2498+
'b': 251
2499+
}
2500+
24942501

24952502
class StataReader(StataParser):
24962503
"""
@@ -2547,42 +2554,64 @@ def __init__(self, path_or_buf, encoding=None):
25472554

25482555
def _read_header(self):
25492556
# header
2550-
format_version = struct.unpack('b', self.path_or_buf.read(1))[0]
2551-
if format_version not in [113, 114, 115]:
2552-
raise ValueError("Version of given Stata file is not 113 (Stata 8/9), 114 (Stata 10/11) or 115 (Stata 12)")
2557+
self.format_version = struct.unpack('b', self.path_or_buf.read(1))[0]
2558+
if self.format_version not in [104, 105, 108, 113, 114, 115]:
2559+
raise ValueError("Version of given Stata file is not 104, 105, 108, 113 (Stata 8/9), 114 (Stata 10/11) or 115 (Stata 12)")
25532560
self.byteorder = self.path_or_buf.read(1) == 0x1 and '>' or '<'
25542561
self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
25552562
self.path_or_buf.read(1) # unused
25562563

25572564
self.nvar = struct.unpack(self.byteorder + 'H', self.path_or_buf.read(2))[0]
25582565
self.nobs = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0]
2559-
self.data_label = self.path_or_buf.read(81)
2560-
self.time_stamp = self.path_or_buf.read(18)
2566+
if self.format_version > 105:
2567+
self.data_label = self.path_or_buf.read(81)
2568+
else:
2569+
self.data_label = self.path_or_buf.read(32)
2570+
if self.format_version > 104:
2571+
self.time_stamp = self.path_or_buf.read(18)
25612572

25622573
# descriptors
2563-
typlist = [ord(self.path_or_buf.read(1)) for i in range(self.nvar)]
2574+
if self.format_version > 108:
2575+
typlist = [ord(self.path_or_buf.read(1)) for i in range(self.nvar)]
2576+
else:
2577+
typlist = [self.OLD_TYPE_MAPPING[self.path_or_buf.read(1).decode(self.encoding)] for i in range(self.nvar)]
25642578
self.typlist = [self.TYPE_MAP[typ] for typ in typlist]
25652579
self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist]
2566-
self.varlist = [self._null_terminate(self.path_or_buf.read(33)) for i in range(self.nvar)]
2580+
if self.format_version > 108:
2581+
self.varlist = [self._null_terminate(self.path_or_buf.read(33)) for i in range(self.nvar)]
2582+
else:
2583+
self.varlist = [self._null_terminate(self.path_or_buf.read(9)) for i in range(self.nvar)]
25672584
self.srtlist = struct.unpack(self.byteorder + ('h' * (self.nvar + 1)), self.path_or_buf.read(2 * (self.nvar + 1)))[:-1]
2568-
if format_version <= 113:
2585+
if self.format_version > 113:
2586+
self.fmtlist = [self._null_terminate(self.path_or_buf.read(49)) for i in range(self.nvar)]
2587+
elif self.format_version > 104:
25692588
self.fmtlist = [self._null_terminate(self.path_or_buf.read(12)) for i in range(self.nvar)]
25702589
else:
2571-
self.fmtlist = [self._null_terminate(self.path_or_buf.read(49)) for i in range(self.nvar)]
2572-
self.lbllist = [self._null_terminate(self.path_or_buf.read(33)) for i in range(self.nvar)]
2573-
self.vlblist = [self._null_terminate(self.path_or_buf.read(81)) for i in range(self.nvar)]
2590+
self.fmtlist = [self._null_terminate(self.path_or_buf.read(7)) for i in range(self.nvar)]
2591+
if self.format_version > 108:
2592+
self.lbllist = [self._null_terminate(self.path_or_buf.read(33)) for i in range(self.nvar)]
2593+
else:
2594+
self.lbllist = [self._null_terminate(self.path_or_buf.read(9)) for i in range(self.nvar)]
2595+
if self.format_version > 105:
2596+
self.vlblist = [self._null_terminate(self.path_or_buf.read(81)) for i in range(self.nvar)]
2597+
else:
2598+
self.vlblist = [self._null_terminate(self.path_or_buf.read(32)) for i in range(self.nvar)]
25742599

2575-
# ignore expansion fields
2600+
# ignore expansion fields (Format 105 and later)
25762601
# When reading, read a one-byte type and then a length (four bytes for formats after 108, two bytes for 105 and 108); the length tells you the
25772602
# size of the next read, which you discard. You then continue like
25782603
# this until you read a type byte of zero.
25792604

2580-
while True:
2581-
self.data_type = struct.unpack(self.byteorder + 'b', self.path_or_buf.read(1))[0]
2582-
self.data_len = struct.unpack(self.byteorder + 'i', self.path_or_buf.read(4))[0]
2583-
if self.data_type == 0:
2584-
break
2585-
self.path_or_buf.read(self.data_len)
2605+
if self.format_version > 104:
2606+
while True:
2607+
data_type = struct.unpack(self.byteorder + 'b', self.path_or_buf.read(1))[0]
2608+
if self.format_version > 108:
2609+
data_len = struct.unpack(self.byteorder + 'i', self.path_or_buf.read(4))[0]
2610+
else:
2611+
data_len = struct.unpack(self.byteorder + 'h', self.path_or_buf.read(2))[0]
2612+
if data_type == 0:
2613+
break
2614+
self.path_or_buf.read(data_len)
25862615

25872616
# necessary data to continue parsing
25882617
self.data_location = self.path_or_buf.tell()
@@ -2678,6 +2707,9 @@ def _read_value_labels(self):
26782707

26792708
self.value_label_dict = dict()
26802709

2710+
if self.format_version <= 108:
2711+
return # Value labels are not supported in version 108 and earlier.
2712+
26812713
while True:
26822714
slength = self.path_or_buf.read(4)
26832715
if not slength:

0 commit comments

Comments
 (0)