Skip to content

Commit a5492ee

Browse files
authored
ENH: Restore support for reading Stata 104 format dta files, and add support for 103 (#58555)
1 parent ed4b867 commit a5492ee

File tree

9 files changed

+29
-5
lines changed

9 files changed

+29
-5
lines changed

doc/source/whatsnew/v3.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ Other enhancements
4444
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
4545
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
4646
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
47+
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
4748
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
48-
-
4949

5050
.. ---------------------------------------------------------------------------
5151
.. _whatsnew_300.notable_bug_fixes:

pandas/io/stata.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191

9292
_version_error = (
9393
"Version of given Stata file is {version}. pandas supports importing "
94-
"versions 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), "
94+
"versions 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), "
9595
"114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
9696
"and 119 (Stata 15/16, over 32,767 variables)."
9797
)
@@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int:
13931393

13941394
def _read_old_header(self, first_char: bytes) -> None:
13951395
self._format_version = int(first_char[0])
1396-
if self._format_version not in [104, 105, 108, 110, 111, 113, 114, 115]:
1396+
if self._format_version not in [103, 104, 105, 108, 110, 111, 113, 114, 115]:
13971397
raise ValueError(_version_error.format(version=self._format_version))
13981398
self._set_encoding()
13991399
self._byteorder = ">" if self._read_int8() == 0x1 else "<"
@@ -1405,7 +1405,8 @@ def _read_old_header(self, first_char: bytes) -> None:
14051405

14061406
self._data_label = self._get_data_label()
14071407

1408-
self._time_stamp = self._get_time_stamp()
1408+
if self._format_version >= 105:
1409+
self._time_stamp = self._get_time_stamp()
14091410

14101411
# descriptors
14111412
if self._format_version >= 111:
650 Bytes
Binary file not shown.
647 Bytes
Binary file not shown.
650 Bytes
Binary file not shown.
647 Bytes
Binary file not shown.
780 Bytes
Binary file not shown.
770 Bytes
Binary file not shown.

pandas/tests/io/test_stata.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ def test_read_dta4(self, version, datapath):
267267
# stata doesn't save .category metadata
268268
tm.assert_frame_equal(parsed, expected)
269269

270-
@pytest.mark.parametrize("version", [105, 108])
270+
@pytest.mark.parametrize("version", [103, 104, 105, 108])
271271
def test_readold_dta4(self, version, datapath):
272272
# This test is the same as test_read_dta4 above except that the columns
273273
# had to be renamed to match the restrictions in older file format
@@ -2011,6 +2011,18 @@ def test_backward_compat(version, datapath):
20112011
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
20122012

20132013

2014+
@pytest.mark.parametrize("version", [103, 104])
2015+
def test_backward_compat_nodateconversion(version, datapath):
2016+
# The Stata data format prior to 105 did not support a date format
2017+
# so read the raw values for comparison
2018+
data_base = datapath("io", "data", "stata")
2019+
ref = os.path.join(data_base, "stata-compat-118.dta")
2020+
old = os.path.join(data_base, f"stata-compat-{version}.dta")
2021+
expected = read_stata(ref, convert_dates=False)
2022+
old_dta = read_stata(old, convert_dates=False)
2023+
tm.assert_frame_equal(old_dta, expected, check_dtype=False)
2024+
2025+
20142026
@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118])
20152027
def test_bigendian(version, datapath):
20162028
ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
@@ -2020,6 +2032,17 @@ def test_bigendian(version, datapath):
20202032
tm.assert_frame_equal(big_dta, expected)
20212033

20222034

2035+
@pytest.mark.parametrize("version", [103, 104])
2036+
def test_bigendian_nodateconversion(version, datapath):
2037+
# The Stata data format prior to 105 did not support a date format
2038+
# so read the raw values for comparison
2039+
ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
2040+
big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta")
2041+
expected = read_stata(ref, convert_dates=False)
2042+
big_dta = read_stata(big, convert_dates=False)
2043+
tm.assert_frame_equal(big_dta, expected)
2044+
2045+
20232046
def test_direct_read(datapath, monkeypatch):
20242047
file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
20252048

0 commit comments

Comments
 (0)