Skip to content

Commit 67a58cd

Browse files
ENH: Add support for reading 102-format Stata dta files (#58978)
* ENH: Add support for reading 102-format Stata dta files * Add reference to pull request in whatsnew file * Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <[email protected]> * Remove extra space * Use datapath() for specifying test file locations --------- Co-authored-by: Matthew Roeschke <[email protected]>
1 parent b8bc510 commit 67a58cd

File tree

5 files changed

+38
-6
lines changed

5 files changed

+38
-6
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ Other enhancements
5050
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
5151
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
5252
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
53+
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
5354
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
5455

5556
.. ---------------------------------------------------------------------------

pandas/io/stata.py

+22-5
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,9 @@
9191

9292
_version_error = (
9393
"Version of given Stata file is {version}. pandas supports importing "
94-
"versions 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), "
95-
"114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
96-
"and 119 (Stata 15/16, over 32,767 variables)."
94+
"versions 102, 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), "
95+
"113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), "
96+
"118 (Stata 14/15/16), and 119 (Stata 15/16, over 32,767 variables)."
9797
)
9898

9999
_statafile_processing_params1 = """\
@@ -1352,8 +1352,10 @@ def _get_variable_labels(self) -> list[str]:
13521352
def _get_nobs(self) -> int:
13531353
if self._format_version >= 118:
13541354
return self._read_uint64()
1355-
else:
1355+
elif self._format_version >= 103:
13561356
return self._read_uint32()
1357+
else:
1358+
return self._read_uint16()
13571359

13581360
def _get_data_label(self) -> str:
13591361
if self._format_version >= 118:
@@ -1393,9 +1395,24 @@ def _get_seek_variable_labels(self) -> int:
13931395

13941396
def _read_old_header(self, first_char: bytes) -> None:
13951397
self._format_version = int(first_char[0])
1396-
if self._format_version not in [103, 104, 105, 108, 110, 111, 113, 114, 115]:
1398+
if self._format_version not in [
1399+
102,
1400+
103,
1401+
104,
1402+
105,
1403+
108,
1404+
110,
1405+
111,
1406+
113,
1407+
114,
1408+
115,
1409+
]:
13971410
raise ValueError(_version_error.format(version=self._format_version))
13981411
self._set_encoding()
1412+
# Note: the 102 format has a zero in this header position, so support
1413+
# relies on little-endian being set whenever this value isn't one,
1414+
# even though for later releases strictly speaking the value should
1415+
# be either one or two to be valid
13991416
self._byteorder = ">" if self._read_int8() == 0x1 else "<"
14001417
self._filetype = self._read_int8()
14011418
self._path_or_buf.read(1) # unused
558 Bytes
Binary file not shown.
778 Bytes
Binary file not shown.

pandas/tests/io/test_stata.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ def test_read_dta4(self, version, datapath):
267267
# stata doesn't save .category metadata
268268
tm.assert_frame_equal(parsed, expected)
269269

270-
@pytest.mark.parametrize("version", [103, 104, 105, 108])
270+
@pytest.mark.parametrize("version", [102, 103, 104, 105, 108])
271271
def test_readold_dta4(self, version, datapath):
272272
# This test is the same as test_read_dta4 above except that the columns
273273
# had to be renamed to match the restrictions in older file format
@@ -2058,6 +2058,19 @@ def test_backward_compat_nodateconversion(version, datapath):
20582058
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
20592059

20602060

2061+
@pytest.mark.parametrize("version", [102])
2062+
def test_backward_compat_nostring(version, datapath):
2063+
# The Stata data format prior to 105 did not support a date format
2064+
# so read the raw values for comparison
2065+
ref = datapath("io", "data", "stata", "stata-compat-118.dta")
2066+
old = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
2067+
expected = read_stata(ref, convert_dates=False)
2068+
# The Stata data format prior to 103 did not support string data
2069+
expected = expected.drop(columns=["s10"])
2070+
old_dta = read_stata(old, convert_dates=False)
2071+
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
2072+
2073+
20612074
@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118])
20622075
def test_bigendian(version, datapath):
20632076
ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
@@ -2067,6 +2080,7 @@ def test_bigendian(version, datapath):
20672080
tm.assert_frame_equal(big_dta, expected)
20682081

20692082

2083+
# Note: 102 format does not support big-endian byte order
20702084
@pytest.mark.parametrize("version", [103, 104])
20712085
def test_bigendian_nodateconversion(version, datapath):
20722086
# The Stata data format prior to 105 did not support a date format

0 commit comments

Comments
 (0)