Skip to content

Commit ca4f738

Browse files
kshedden authored and jreback committed
Read very old stata DTA files
Closes #12232, although the issue may resurface for files containing double values (I can't determine the old type code for doubles). Author: Kerby Shedden <[email protected]> Closes #12233 from kshedden/old_stata and squashes the following commits: aba666c [Kerby Shedden] Read old stat files (bugfix)
1 parent 70f79ce commit ca4f738

File tree

4 files changed

+53
-42
lines changed

4 files changed

+53
-42
lines changed

doc/source/whatsnew/v0.18.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,7 @@ Bug Fixes
733733
- Bug in getitem when the values of a ``Series`` were tz-aware (:issue:`12089`)
734734
- Bug in ``Series.str.get_dummies`` when one of the variables was 'name' (:issue:`12180`)
735735
- Bug in ``pd.concat`` while concatenating tz-aware NaT series. (:issue:`11693`, :issue:`11755`)
736-
736+
- Bug in ``pd.read_stata`` with version <= 108 files (:issue:`12232`)
737737

738738

739739
- Bug in ``Timedelta.round`` with negative values (:issue:`11690`)

pandas/io/stata.py

+35-38
Original file line numberDiff line numberDiff line change
@@ -851,23 +851,24 @@ def __init__(self, encoding):
851851
float32_max = b'\xff\xff\xff\x7e'
852852
float64_min = b'\xff\xff\xff\xff\xff\xff\xef\xff'
853853
float64_max = b'\xff\xff\xff\xff\xff\xff\xdf\x7f'
854-
self.VALID_RANGE = \
855-
{
856-
'b': (-127, 100),
857-
'h': (-32767, 32740),
858-
'l': (-2147483647, 2147483620),
859-
'f': (np.float32(struct.unpack('<f', float32_min)[0]),
860-
np.float32(struct.unpack('<f', float32_max)[0])),
861-
'd': (np.float64(struct.unpack('<d', float64_min)[0]),
862-
np.float64(struct.unpack('<d', float64_max)[0]))
863-
}
864-
865-
self.OLD_TYPE_MAPPING = \
866-
{
867-
'i': 252,
868-
'f': 254,
869-
'b': 251
870-
}
854+
self.VALID_RANGE = {
855+
'b': (-127, 100),
856+
'h': (-32767, 32740),
857+
'l': (-2147483647, 2147483620),
858+
'f': (np.float32(struct.unpack('<f', float32_min)[0]),
859+
np.float32(struct.unpack('<f', float32_max)[0])),
860+
'd': (np.float64(struct.unpack('<d', float64_min)[0]),
861+
np.float64(struct.unpack('<d', float64_max)[0]))
862+
}
863+
864+
self.OLD_TYPE_MAPPING = {
865+
98: 251, # byte
866+
105: 252, # int
867+
108: 253, # long
868+
102: 254 # float
869+
# don't know old code for double
870+
}
871+
871872
# These missing values are the generic '.' in Stata, and are used
872873
# to replace nans
873874
self.MISSING_VALUES = {
@@ -878,15 +879,14 @@ def __init__(self, encoding):
878879
'd': np.float64(
879880
struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
880881
}
881-
self.NUMPY_TYPE_MAP = \
882-
{
883-
'b': 'i1',
884-
'h': 'i2',
885-
'l': 'i4',
886-
'f': 'f4',
887-
'd': 'f8',
888-
'Q': 'u8'
889-
}
882+
self.NUMPY_TYPE_MAP = {
883+
'b': 'i1',
884+
'h': 'i2',
885+
'l': 'i4',
886+
'f': 'f4',
887+
'd': 'f8',
888+
'Q': 'u8'
889+
}
890890

891891
# Reserved words cannot be used as variable names
892892
self.RESERVED_WORDS = ('aggregate', 'array', 'boolean', 'break',
@@ -900,12 +900,6 @@ def __init__(self, encoding):
900900
'protected', 'quad', 'rowvector', 'short',
901901
'typedef', 'typename', 'virtual')
902902

903-
def _decode_bytes(self, str, errors=None):
904-
if compat.PY3 or self._encoding is not None:
905-
return str.decode(self._encoding, errors)
906-
else:
907-
return str
908-
909903

910904
class StataReader(StataParser, BaseIterator):
911905
__doc__ = _stata_reader_doc
@@ -1201,11 +1195,14 @@ def _read_old_header(self, first_char):
12011195
typlist = [ord(self.path_or_buf.read(1))
12021196
for i in range(self.nvar)]
12031197
else:
1204-
typlist = [
1205-
self.OLD_TYPE_MAPPING[
1206-
self._decode_bytes(self.path_or_buf.read(1))
1207-
] for i in range(self.nvar)
1208-
]
1198+
buf = self.path_or_buf.read(self.nvar)
1199+
typlistb = np.frombuffer(buf, dtype=np.uint8)
1200+
typlist = []
1201+
for tp in typlistb:
1202+
if tp in self.OLD_TYPE_MAPPING:
1203+
typlist.append(self.OLD_TYPE_MAPPING[tp])
1204+
else:
1205+
typlist.append(tp - 127) # string
12091206

12101207
try:
12111208
self.typlist = [self.TYPE_MAP[typ] for typ in typlist]
@@ -1526,7 +1523,7 @@ def read(self, nrows=None, convert_dates=None,
15261523
data[col],
15271524
self.fmtlist[i])
15281525

1529-
if convert_categoricals and self.value_label_dict:
1526+
if convert_categoricals and self.format_version > 108:
15301527
data = self._do_convert_categoricals(data,
15311528
self.value_label_dict,
15321529
self.lbllist,

pandas/io/tests/data/S4_EDUC1.DTA

2.93 KB
Binary file not shown.

pandas/io/tests/test_stata.py

+17-3
Original file line numberDiff line numberDiff line change
@@ -409,9 +409,9 @@ def test_read_write_dta12(self):
409409
written_and_read_again.set_index('index'), formatted)
410410

411411
def test_read_write_dta13(self):
412-
s1 = Series(2**9, dtype=np.int16)
413-
s2 = Series(2**17, dtype=np.int32)
414-
s3 = Series(2**33, dtype=np.int64)
412+
s1 = Series(2 ** 9, dtype=np.int16)
413+
s2 = Series(2 ** 17, dtype=np.int32)
414+
s3 = Series(2 ** 33, dtype=np.int64)
415415
original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3})
416416
original.index.name = 'index'
417417

@@ -568,6 +568,20 @@ def test_dates_invalid_column(self):
568568
tm.assert_frame_equal(written_and_read_again.set_index('index'),
569569
modified)
570570

571+
def test_105(self):
572+
# Data obtained from:
573+
# http://go.worldbank.org/ZXY29PVJ21
574+
dpath = os.path.join(self.dirpath, 'S4_EDUC1.DTA')
575+
df = pd.read_stata(dpath)
576+
df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]]
577+
df0 = pd.DataFrame(df0)
578+
df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"]
579+
df0['clustnum'] = df0["clustnum"].astype(np.int16)
580+
df0['pri_schl'] = df0["pri_schl"].astype(np.int8)
581+
df0['psch_num'] = df0["psch_num"].astype(np.int8)
582+
df0['psch_dis'] = df0["psch_dis"].astype(np.float32)
583+
tm.assert_frame_equal(df.head(3), df0)
584+
571585
def test_date_export_formats(self):
572586
columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty']
573587
conversions = dict(((c, c) for c in columns))

0 commit comments

Comments
 (0)