Skip to content

Commit 54ab5be

Browse files
kshedden authored and jreback committed
BUG: compat with Stata ver 111
closes pandas-dev#11526 closes pandas-dev#14159
1 parent 6c73e76 commit 54ab5be

File tree

5 files changed

+20
-6
lines changed

5 files changed

+20
-6
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -1327,6 +1327,7 @@ Other API Changes
13271327
- More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. (:issue:`13652`)
13281328
- ``pd.read_csv()`` in the C engine will now issue a ``ParserWarning`` or raise a ``ValueError`` when ``sep`` encoded is more than one character long (:issue:`14065`)
13291329
- ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`)
1330+
- ``pd.read_stata()`` can now handle some format 111 files, which are produced by SAS when generating Stata dta files (:issue:`11526`)
13301331

13311332
.. _whatsnew_0190.deprecations:
13321333

doc/source/whatsnew/v0.20.0.txt

-3
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,6 @@ Other enhancements
3333

3434

3535

36-
37-
3836
.. _whatsnew_0200.api_breaking:
3937

4038
Backwards incompatible API changes
@@ -81,4 +79,3 @@ Performance Improvements
8179

8280
Bug Fixes
8381
~~~~~~~~~
84-

pandas/io/stata.py

+3 -3
Original file line number | Diff line number | Diff line change
@@ -34,8 +34,8 @@
3434
from pandas.tslib import NaT, Timestamp
3535

3636
_version_error = ("Version of given Stata file is not 104, 105, 108, "
37-
"113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), "
38-
"117 (Stata 13), or 118 (Stata 14)")
37+
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
38+
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")
3939

4040
_statafile_processing_params1 = """\
4141
convert_dates : boolean, defaults to True
@@ -1183,7 +1183,7 @@ def _get_seek_variable_labels(self):
11831183

11841184
def _read_old_header(self, first_char):
11851185
self.format_version = struct.unpack('b', first_char)[0]
1186-
if self.format_version not in [104, 105, 108, 113, 114, 115]:
1186+
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
11871187
raise ValueError(_version_error)
11881188
self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
11891189
0] == 0x1 and '>' or '<'

pandas/io/tests/data/stata7_111.dta

1 KB
Binary file not shown.

pandas/io/tests/test_stata.py

+16
Original file line number | Diff line number | Diff line change
@@ -82,6 +82,8 @@ def setUp(self):
8282
self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta')
8383
self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
8484

85+
self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
86+
8587
def read_dta(self, file):
8688
# Legacy default reader configuration
8789
return read_stata(file, convert_dates=True)
@@ -1219,6 +1221,20 @@ def test_repeated_column_labels(self):
12191221
read_stata(self.dta23, convert_categoricals=True)
12201222
tm.assertTrue('wolof' in cm.exception)
12211223

1224+
def test_stata_111(self):
1225+
# 111 is an old version but still used by current versions of
1226+
# SAS when exporting to Stata format. We do not know of any
1227+
# on-line documentation for this version.
1228+
df = read_stata(self.dta24_111)
1229+
original = pd.DataFrame({'y': [1, 1, 1, 1, 1, 0, 0, np.NaN, 0, 0],
1230+
'x': [1, 2, 1, 3, np.NaN, 4, 3, 5, 1, 6],
1231+
'w': [2, np.NaN, 5, 2, 4, 4, 3, 1, 2, 3],
1232+
'z': ['a', 'b', 'c', 'd', 'e', '', 'g', 'h',
1233+
'i', 'j']})
1234+
original = original[['y', 'x', 'w', 'z']]
1235+
tm.assert_frame_equal(original, df)
1236+
1237+
12221238
if __name__ == '__main__':
12231239
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
12241240
exit=False)

0 commit comments

Comments
 (0)