Skip to content

Commit 6265450

Browse files
committed
BUG: Fixed failure in StataReader when reading variable labels in 117 files
Stata's implementation does not match the online dta file format description. The solution used here is to directly compute the offset rather than reading it from the dta file. If Stata fixes their implementation, the original code can be restored. closes #7816
1 parent d077f93 commit 6265450

File tree

5 files changed

+26
-4
lines changed

5 files changed

+26
-4
lines changed

doc/source/v0.15.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ Bug Fixes
212212

213213

214214
- Bug in ``DataFrame.plot`` with ``subplots=True`` may draw unnecessary minor xticks and yticks (:issue:`7801`)
215-
215+
- Bug in ``StataReader`` which did not read variable labels in 117 files due to a difference between the Stata documentation and its implementation (:issue:`7816`)
216216

217217

218218

pandas/io/stata.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -520,8 +520,15 @@ def _read_header(self):
520520
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9
521521
seek_value_label_names = struct.unpack(
522522
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19
523-
seek_variable_labels = struct.unpack(
524-
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17
523+
# Stata 117 data files do not follow the described format. This is
524+
# a workaround that uses the previous offset, 33 bytes for each
525+
# variable, 20 for the closing tag and 17 for the opening tag
526+
self.path_or_buf.read(8) # <variable_labels>, throw away
527+
seek_variable_labels = seek_value_label_names + (33*self.nvar) + 20 + 17
528+
# Below is the original, correct code (per Stata dta format doc,
529+
# although this is not followed in actual 117 dtas)
530+
#seek_variable_labels = struct.unpack(
531+
# self.byteorder + 'q', self.path_or_buf.read(8))[0] + 17
525532
self.path_or_buf.read(8) # <characteristics>
526533
self.data_location = struct.unpack(
527534
self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6

pandas/io/tests/data/stata7_115.dta

722 Bytes
Binary file not shown.

pandas/io/tests/data/stata7_117.dta

1.13 KB
Binary file not shown.

pandas/io/tests/test_stata.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ def setUp(self):
6868
self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
6969
self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta')
7070

71+
self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta')
72+
self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta')
73+
7174
def read_dta(self, file):
7275
return read_stata(file, convert_dates=True)
7376

@@ -199,7 +202,7 @@ def test_read_dta4(self):
199202
'labeled_with_missings', 'float_labelled'])
200203

201204
# these are all categoricals
202-
expected = pd.concat([ Series(pd.Categorical(value)) for col, value in expected.iteritems() ],axis=1)
205+
expected = pd.concat([ Series(pd.Categorical(value)) for col, value in compat.iteritems(expected)],axis=1)
203206

204207
tm.assert_frame_equal(parsed_113, expected)
205208
tm.assert_frame_equal(parsed_114, expected)
@@ -551,6 +554,18 @@ def test_bool_uint(self):
551554
written_and_read_again = written_and_read_again.set_index('index')
552555
tm.assert_frame_equal(written_and_read_again, expected)
553556

557+
def test_variable_labels(self):
558+
sr_115 = StataReader(self.dta16_115).variable_labels()
559+
sr_117 = StataReader(self.dta16_117).variable_labels()
560+
keys = ('var1', 'var2', 'var3')
561+
labels = ('label1', 'label2', 'label3')
562+
for k,v in compat.iteritems(sr_115):
563+
self.assertTrue(k in sr_117)
564+
self.assertTrue(v == sr_117[k])
565+
self.assertTrue(k in keys)
566+
self.assertTrue(v in labels)
567+
568+
554569
if __name__ == '__main__':
555570
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
556571
exit=False)

0 commit comments

Comments
 (0)