diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e5917c9176c54..8cc41947d43e0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -49,6 +49,7 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) +- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 5146876d20374..dd92b1bbfdba0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -91,9 +91,9 @@ _version_error = ( "Version of given Stata file is {version}. pandas supports importing " - "versions 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), 113 (Stata 8/9), " - "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," - "and 119 (Stata 15/16, over 32,767 variables)." + "versions 102, 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE), " + "113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), " + "118 (Stata 14/15/16), and 119 (Stata 15/16, over 32,767 variables)." ) _statafile_processing_params1 = """\ @@ -1352,8 +1352,10 @@ def _get_variable_labels(self) -> list[str]: def _get_nobs(self) -> int: if self._format_version >= 118: return self._read_uint64() - else: + elif self._format_version >= 103: return self._read_uint32() + else: + return self._read_uint16() def _get_data_label(self) -> str: if self._format_version >= 118: @@ -1393,9 +1395,24 @@ def _get_seek_variable_labels(self) -> int: def _read_old_header(self, first_char: bytes) -> None: self._format_version = int(first_char[0]) - if self._format_version not in [103, 104, 105, 108, 110, 111, 113, 114, 115]: + if self._format_version not in [ + 102, + 103, + 104, + 105, + 108, + 110, + 111, + 113, + 114, + 115, + ]: raise ValueError(_version_error.format(version=self._format_version)) self._set_encoding() + # Note 102 format will have a zero in this header position, so support + # relies on little-endian being set whenever this value isn't one, + # even though for later releases strictly speaking the value should + # be either one or two to be valid self._byteorder = ">" if self._read_int8() == 0x1 else "<" self._filetype = self._read_int8() self._path_or_buf.read(1) # unused diff --git a/pandas/tests/io/data/stata/stata-compat-102.dta b/pandas/tests/io/data/stata/stata-compat-102.dta new file mode 100644 index 0000000000000..424b767b0011c Binary files /dev/null and b/pandas/tests/io/data/stata/stata-compat-102.dta differ diff --git a/pandas/tests/io/data/stata/stata4_102.dta b/pandas/tests/io/data/stata/stata4_102.dta new file mode 100644 index 0000000000000..669fe06c2b492 Binary files /dev/null and b/pandas/tests/io/data/stata/stata4_102.dta differ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 2534df6a82f89..6d6f222fc0660 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -267,7 +267,7 @@ def test_read_dta4(self, version, datapath): # stata doesn't save .category metadata tm.assert_frame_equal(parsed, expected) - @pytest.mark.parametrize("version", [103, 104, 105, 108]) + @pytest.mark.parametrize("version", [102, 103, 104, 105, 108]) def test_readold_dta4(self, version, datapath): # This test is the same as test_read_dta4 above except that the columns # had to be renamed to match the restrictions in older file format @@ -2058,6 +2058,19 @@ def test_backward_compat_nodateconversion(version, datapath): tm.assert_frame_equal(old_dta, expected, check_dtype=False) +@pytest.mark.parametrize("version", [102]) +def test_backward_compat_nostring(version, datapath): + # The Stata data format prior to 105 did not support a date format + # so read the raw values for comparison + ref = datapath("io", "data", "stata", "stata-compat-118.dta") + old = datapath("io", "data", "stata", f"stata-compat-{version}.dta") + expected = read_stata(ref, convert_dates=False) + # The Stata data format prior to 103 did not support string data + expected = expected.drop(columns=["s10"]) + old_dta = read_stata(old, convert_dates=False) + tm.assert_frame_equal(old_dta, expected, check_dtype=False) + + @pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118]) def test_bigendian(version, datapath): ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta") @@ -2067,6 +2080,7 @@ def test_bigendian(version, datapath): tm.assert_frame_equal(big_dta, expected) +# Note: 102 format does not support big-endian byte order @pytest.mark.parametrize("version", [103, 104]) def test_bigendian_nodateconversion(version, datapath): # The Stata data format prior to 105 did not support a date format