Skip to content

Commit ca4f738

Browse files
kshedden authored and jreback committed
Read very old stata DTA files
Closes #12232, although the issue may resurface for files containing double values (I can't determine the old type code for doubles). Author: Kerby Shedden <[email protected]> Closes #12233 from kshedden/old_stata and squashes the following commits: aba666c [Kerby Shedden] Read old stat files (bugfix)
1 parent 70f79ce commit ca4f738

File tree

4 files changed

+53
-42
lines changed

4 files changed

+53
-42
lines changed

doc/source/whatsnew/v0.18.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,7 @@ Bug Fixes
733733
- Bug in getitem when the values of a ``Series`` were tz-aware (:issue:`12089`)
734734
- Bug in ``Series.str.get_dummies`` when one of the variables was 'name' (:issue:`12180`)
735735
- Bug in ``pd.concat`` while concatenating tz-aware NaT series. (:issue:`11693`, :issue:`11755`)
736-
736+
- Bug in ``pd.read_stata`` with version <= 108 files (:issue:`12232`)
737737

738738

739739
- Bug in ``Timedelta.round`` with negative values (:issue:`11690`)

pandas/io/stata.py

+35-38
Original file line numberDiff line numberDiff line change
@@ -851,23 +851,24 @@ def __init__(self, encoding):
851851
float32_max = b'\xff\xff\xff\x7e'
852852
float64_min = b'\xff\xff\xff\xff\xff\xff\xef\xff'
853853
float64_max = b'\xff\xff\xff\xff\xff\xff\xdf\x7f'
854-
self.VALID_RANGE = \
855-
{
856-
'b': (-127, 100),
857-
'h': (-32767, 32740),
858-
'l': (-2147483647, 2147483620),
859-
'f': (np.float32(struct.unpack('<f', float32_min)[0]),
860-
np.float32(struct.unpack('<f', float32_max)[0])),
861-
'd': (np.float64(struct.unpack('<d', float64_min)[0]),
862-
np.float64(struct.unpack('<d', float64_max)[0]))
863-
}
864-
865-
self.OLD_TYPE_MAPPING = \
866-
{
867-
'i': 252,
868-
'f': 254,
869-
'b': 251
870-
}
854+
self.VALID_RANGE = {
855+
'b': (-127, 100),
856+
'h': (-32767, 32740),
857+
'l': (-2147483647, 2147483620),
858+
'f': (np.float32(struct.unpack('<f', float32_min)[0]),
859+
np.float32(struct.unpack('<f', float32_max)[0])),
860+
'd': (np.float64(struct.unpack('<d', float64_min)[0]),
861+
np.float64(struct.unpack('<d', float64_max)[0]))
862+
}
863+
864+
self.OLD_TYPE_MAPPING = {
865+
98: 251, # byte
866+
105: 252, # int
867+
108: 253, # long
868+
102: 254 # float
869+
# don't know old code for double
870+
}
871+
871872
# These missing values are the generic '.' in Stata, and are used
872873
# to replace nans
873874
self.MISSING_VALUES = {
@@ -878,15 +879,14 @@ def __init__(self, encoding):
878879
'd': np.float64(
879880
struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
880881
}
881-
self.NUMPY_TYPE_MAP = \
882-
{
883-
'b': 'i1',
884-
'h': 'i2',
885-
'l': 'i4',
886-
'f': 'f4',
887-
'd': 'f8',
888-
'Q': 'u8'
889-
}
882+
self.NUMPY_TYPE_MAP = {
883+
'b': 'i1',
884+
'h': 'i2',
885+
'l': 'i4',
886+
'f': 'f4',
887+
'd': 'f8',
888+
'Q': 'u8'
889+
}
890890

891891
# Reserved words cannot be used as variable names
892892
self.RESERVED_WORDS = ('aggregate', 'array', 'boolean', 'break',
@@ -900,12 +900,6 @@ def __init__(self, encoding):
900900
'protected', 'quad', 'rowvector', 'short',
901901
'typedef', 'typename', 'virtual')
902902

903-
def _decode_bytes(self, str, errors=None):
904-
if compat.PY3 or self._encoding is not None:
905-
return str.decode(self._encoding, errors)
906-
else:
907-
return str
908-
909903

910904
class StataReader(StataParser, BaseIterator):
911905
__doc__ = _stata_reader_doc
@@ -1201,11 +1195,14 @@ def _read_old_header(self, first_char):
12011195
typlist = [ord(self.path_or_buf.read(1))
12021196
for i in range(self.nvar)]
12031197
else:
1204-
typlist = [
1205-
self.OLD_TYPE_MAPPING[
1206-
self._decode_bytes(self.path_or_buf.read(1))
1207-
] for i in range(self.nvar)
1208-
]
1198+
buf = self.path_or_buf.read(self.nvar)
1199+
typlistb = np.frombuffer(buf, dtype=np.uint8)
1200+
typlist = []
1201+
for tp in typlistb:
1202+
if tp in self.OLD_TYPE_MAPPING:
1203+
typlist.append(self.OLD_TYPE_MAPPING[tp])
1204+
else:
1205+
typlist.append(tp - 127) # string
12091206

12101207
try:
12111208
self.typlist = [self.TYPE_MAP[typ] for typ in typlist]
@@ -1526,7 +1523,7 @@ def read(self, nrows=None, convert_dates=None,
15261523
data[col],
15271524
self.fmtlist[i])
15281525

1529-
if convert_categoricals and self.value_label_dict:
1526+
if convert_categoricals and self.format_version > 108:
15301527
data = self._do_convert_categoricals(data,
15311528
self.value_label_dict,
15321529
self.lbllist,

pandas/io/tests/data/S4_EDUC1.DTA

2.93 KB
Binary file not shown.

pandas/io/tests/test_stata.py

+17-3
Original file line numberDiff line numberDiff line change
@@ -409,9 +409,9 @@ def test_read_write_dta12(self):
409409
written_and_read_again.set_index('index'), formatted)
410410

411411
def test_read_write_dta13(self):
412-
s1 = Series(2**9, dtype=np.int16)
413-
s2 = Series(2**17, dtype=np.int32)
414-
s3 = Series(2**33, dtype=np.int64)
412+
s1 = Series(2 ** 9, dtype=np.int16)
413+
s2 = Series(2 ** 17, dtype=np.int32)
414+
s3 = Series(2 ** 33, dtype=np.int64)
415415
original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3})
416416
original.index.name = 'index'
417417

@@ -568,6 +568,20 @@ def test_dates_invalid_column(self):
568568
tm.assert_frame_equal(written_and_read_again.set_index('index'),
569569
modified)
570570

571+
def test_105(self):
572+
# Data obtained from:
573+
# http://go.worldbank.org/ZXY29PVJ21
574+
dpath = os.path.join(self.dirpath, 'S4_EDUC1.DTA')
575+
df = pd.read_stata(dpath)
576+
df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]]
577+
df0 = pd.DataFrame(df0)
578+
df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"]
579+
df0['clustnum'] = df0["clustnum"].astype(np.int16)
580+
df0['pri_schl'] = df0["pri_schl"].astype(np.int8)
581+
df0['psch_num'] = df0["psch_num"].astype(np.int8)
582+
df0['psch_dis'] = df0["psch_dis"].astype(np.float32)
583+
tm.assert_frame_equal(df.head(3), df0)
584+
571585
def test_date_export_formats(self):
572586
columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty']
573587
conversions = dict(((c, c) for c in columns))

0 commit comments

Comments
 (0)