Skip to content

Commit d62d77b

Browse files
authored
ENH: Add support for reading 110-format Stata dta files (#58044)
* ENH: Add support for reading 110-format Stata dta files
* Add whatsnew note to v3.0.0.rst
* Add a test data file containing value labels
* Compare version number inclusively when determining whether to use old or new typlist version
* Add a big-endian version of the test data set
1 parent 8d543ba commit d62d77b

File tree

6 files changed

+13
-13
lines changed

6 files changed

+13
-13
lines changed

doc/source/whatsnew/v3.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ Other enhancements
4444
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
4545
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
4646
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
47+
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
48+
-
4749

4850
.. ---------------------------------------------------------------------------
4951
.. _whatsnew_300.notable_bug_fixes:

pandas/io/stata.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191

9292
_version_error = (
9393
"Version of given Stata file is {version}. pandas supports importing "
94-
"versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
94+
"versions 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), "
9595
"114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
9696
"and 119 (Stata 15/16, over 32,767 variables)."
9797
)
@@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int:
13931393

13941394
def _read_old_header(self, first_char: bytes) -> None:
13951395
self._format_version = int(first_char[0])
1396-
if self._format_version not in [104, 105, 108, 111, 113, 114, 115]:
1396+
if self._format_version not in [104, 105, 108, 110, 111, 113, 114, 115]:
13971397
raise ValueError(_version_error.format(version=self._format_version))
13981398
self._set_encoding()
13991399
self._byteorder = ">" if self._read_int8() == 0x1 else "<"
@@ -1408,7 +1408,7 @@ def _read_old_header(self, first_char: bytes) -> None:
14081408
self._time_stamp = self._get_time_stamp()
14091409

14101410
# descriptors
1411-
if self._format_version > 108:
1411+
if self._format_version >= 111:
14121412
typlist = [int(c) for c in self._path_or_buf.read(self._nvar)]
14131413
else:
14141414
buf = self._path_or_buf.read(self._nvar)
1.48 KB
Binary file not shown.
Binary file not shown.
1.49 KB
Binary file not shown.

pandas/tests/io/test_stata.py

+8-10
Original file line numberDiff line numberDiff line change
@@ -225,11 +225,9 @@ def test_read_dta3(self, file, datapath):
225225

226226
tm.assert_frame_equal(parsed, expected)
227227

228-
@pytest.mark.parametrize(
229-
"file", ["stata4_111", "stata4_113", "stata4_114", "stata4_115", "stata4_117"]
230-
)
231-
def test_read_dta4(self, file, datapath):
232-
file = datapath("io", "data", "stata", f"{file}.dta")
228+
@pytest.mark.parametrize("version", [110, 111, 113, 114, 115, 117])
229+
def test_read_dta4(self, version, datapath):
230+
file = datapath("io", "data", "stata", f"stata4_{version}.dta")
233231
parsed = self.read_dta(file)
234232

235233
expected = DataFrame.from_records(
@@ -271,11 +269,11 @@ def test_read_dta4(self, file, datapath):
271269
# stata doesn't save .category metadata
272270
tm.assert_frame_equal(parsed, expected)
273271

274-
@pytest.mark.parametrize("file", ["stata4_105", "stata4_108"])
275-
def test_readold_dta4(self, file, datapath):
272+
@pytest.mark.parametrize("version", [105, 108])
273+
def test_readold_dta4(self, version, datapath):
276274
# This test is the same as test_read_dta4 above except that the columns
277275
# had to be renamed to match the restrictions in older file format
278-
file = datapath("io", "data", "stata", f"{file}.dta")
276+
file = datapath("io", "data", "stata", f"stata4_{version}.dta")
279277
parsed = self.read_dta(file)
280278

281279
expected = DataFrame.from_records(
@@ -2002,7 +2000,7 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path):
20022000
tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
20032001

20042002

2005-
@pytest.mark.parametrize("version", [105, 108, 111, 113, 114])
2003+
@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114])
20062004
def test_backward_compat(version, datapath):
20072005
data_base = datapath("io", "data", "stata")
20082006
ref = os.path.join(data_base, "stata-compat-118.dta")
@@ -2012,7 +2010,7 @@ def test_backward_compat(version, datapath):
20122010
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
20132011

20142012

2015-
@pytest.mark.parametrize("version", [105, 108, 111, 113, 114, 118])
2013+
@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118])
20162014
def test_bigendian(version, datapath):
20172015
ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
20182016
big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta")

0 commit comments

Comments
 (0)