diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 865996bdf8892..9cce2a04ed89c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -44,8 +44,8 @@ Other enhancements - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) +- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) -- .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b87ec94b85bb0..4e7bd160a5a52 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -91,7 +91,7 @@ _version_error = ( "Version of given Stata file is {version}. pandas supports importing " - "versions 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), " + "versions 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), " "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," "and 119 (Stata 15/16, over 32,767 variables)." ) @@ -1393,7 +1393,7 @@ def _get_seek_variable_labels(self) -> int: def _read_old_header(self, first_char: bytes) -> None: self._format_version = int(first_char[0]) - if self._format_version not in [104, 105, 108, 110, 111, 113, 114, 115]: + if self._format_version not in [103, 104, 105, 108, 110, 111, 113, 114, 115]: raise ValueError(_version_error.format(version=self._format_version)) self._set_encoding() self._byteorder = ">" if self._read_int8() == 0x1 else "<" @@ -1405,7 +1405,8 @@ def _read_old_header(self, first_char: bytes) -> None: self._data_label = self._get_data_label() - self._time_stamp = self._get_time_stamp() + if self._format_version >= 105: + self._time_stamp = self._get_time_stamp() # descriptors if self._format_version >= 111: diff --git a/pandas/tests/io/data/stata/stata-compat-103.dta b/pandas/tests/io/data/stata/stata-compat-103.dta new file mode 100644 index 0000000000000..adfeb6c672333 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-103.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-104.dta b/pandas/tests/io/data/stata/stata-compat-104.dta new file mode 100644 index 0000000000000..9bc3659afd31c Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-104.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-103.dta b/pandas/tests/io/data/stata/stata-compat-be-103.dta new file mode 100644 index 0000000000000..0e2ef231f91c0 Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-103.dta differ diff --git a/pandas/tests/io/data/stata/stata-compat-be-104.dta b/pandas/tests/io/data/stata/stata-compat-be-104.dta new file mode 100644 index 0000000000000..98185d8ce27dc Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-be-104.dta differ diff --git a/pandas/tests/io/data/stata/stata4_103.dta b/pandas/tests/io/data/stata/stata4_103.dta new file mode 100644 index 0000000000000..3c63935e63df9 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_103.dta differ diff --git a/pandas/tests/io/data/stata/stata4_104.dta b/pandas/tests/io/data/stata/stata4_104.dta new file mode 100644 index 0000000000000..c2517355ebff1 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_104.dta differ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index d5134a3e3afd0..a27448a342a19 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -267,7 +267,7 @@ def test_read_dta4(self, version, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize("version", [105, 108]) + @pytest.mark.parametrize("version", [103, 104, 105, 108]) def test_readold_dta4(self, version, datapath): # This test is the same as test_read_dta4 above except that the columns # had to be renamed to match the restrictions in older file format @@ -2011,6 +2011,18 @@ def test_backward_compat(version, datapath): tm.assert_frame_equal(old_dta, expected, check_dtype=False) +@pytest.mark.parametrize("version", [103, 104]) +def test_backward_compat_nodateconversion(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + data_base = datapath("io", "data", "stata") + ref = os.path.join(data_base, "stata-compat-118.dta") + old = os.path.join(data_base, f"stata-compat-{version}.dta") + expected = read_stata(ref, convert_dates=False) + old_dta = read_stata(old, convert_dates=False) + tm.assert_frame_equal(old_dta, expected, check_dtype=False) + + @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118]) def test_bigendian(version, datapath): ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") @@ -2020,6 +2032,17 @@ def test_bigendian(version, datapath): tm.assert_frame_equal(big_dta, expected) +@pytest.mark.parametrize("version", [103, 104]) +def test_bigendian_nodateconversion(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") + big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta") + expected = read_stata(ref, convert_dates=False) + big_dta = read_stata(big, convert_dates=False) + tm.assert_frame_equal(big_dta, expected) + + def test_direct_read(datapath, monkeypatch): file_path = datapath("io", "data", "stata", "stata-compat-118.dta")