diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e71220102cbb4..768b12ba1007f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -584,6 +584,7 @@ I/O - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`) +- Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) Period ^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 03c15d0ab07bb..4be06f93689f2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1817,10 +1817,19 @@ def read( return data def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame: + # missing code for double was different in version 105 and prior + old_missingdouble = float.fromhex("0x1.0p333") + # Check for missing values, and replace if found replacements = {} for i in range(len(data.columns)): fmt = self._typlist[i] + # recode instances of the old missing code to the currently used value + if self._format_version <= 105 and fmt == "d": + data.iloc[:, i] = data.iloc[:, i].replace( + old_missingdouble, self.MISSING_VALUES["d"] + ) + if self._format_version <= 111: if fmt not in self.OLD_VALID_RANGE: continue diff --git a/pandas/tests/io/data/stata/stata1_102.dta b/pandas/tests/io/data/stata/stata1_102.dta new file mode 100644 index 0000000000000..d0ca1b2a8c02d Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_102.dta differ diff --git a/pandas/tests/io/data/stata/stata1_103.dta b/pandas/tests/io/data/stata/stata1_103.dta new file mode 100644 index 0000000000000..98072ba6bd4fc Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_103.dta differ diff --git a/pandas/tests/io/data/stata/stata1_104.dta b/pandas/tests/io/data/stata/stata1_104.dta new file mode 100644 index 0000000000000..a46aeb9128ecf Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_104.dta differ diff --git a/pandas/tests/io/data/stata/stata1_105.dta b/pandas/tests/io/data/stata/stata1_105.dta new file mode 100644 index 0000000000000..ba2c463486dbf Binary files /dev/null and b/pandas/tests/io/data/stata/stata1_105.dta differ diff --git a/pandas/tests/io/data/stata/stata8_102.dta b/pandas/tests/io/data/stata/stata8_102.dta new file mode 100644 index 0000000000000..5d3a4fb171e9c Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_102.dta differ diff --git a/pandas/tests/io/data/stata/stata8_103.dta b/pandas/tests/io/data/stata/stata8_103.dta new file mode 100644 index 0000000000000..623a21e37650f Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_103.dta differ diff --git a/pandas/tests/io/data/stata/stata8_104.dta b/pandas/tests/io/data/stata/stata8_104.dta new file mode 100644 index 0000000000000..df79d6a8af230 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_104.dta differ diff --git a/pandas/tests/io/data/stata/stata8_105.dta b/pandas/tests/io/data/stata/stata8_105.dta new file mode 100644 index 0000000000000..cf01463a83d81 Binary files /dev/null and b/pandas/tests/io/data/stata/stata8_105.dta differ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index fb7182fdefb32..c2c4140fa304d 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -120,9 +120,9 @@ def test_read_index_col_none(self, version, temp_file): expected["a"] = expected["a"].astype(np.int32) tm.assert_frame_equal(read_df, expected, check_index_type=True) - # Note this test starts at format version 108 as the missing code for double - # was different prior to this (see GH 58149) and would therefore fail - @pytest.mark.parametrize("version", [108, 110, 111, 113, 114, 115, 117, 118, 119]) + @pytest.mark.parametrize( + "version", [102, 103, 104, 105, 108, 110, 111, 113, 114, 115, 117, 118, 119] + ) def test_read_dta1(self, version, datapath): file = datapath("io", "data", "stata", f"stata1_{version}.dta") parsed = self.read_dta(file) @@ -918,8 +918,8 @@ def test_missing_value_generator(self, temp_file): ) assert val.string == ".z" - @pytest.mark.parametrize("file", ["stata8_113", "stata8_115", "stata8_117"]) - def test_missing_value_conversion(self, file, datapath): + @pytest.mark.parametrize("version", [113, 115, 117]) + def test_missing_value_conversion(self, version, datapath): columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] smv = StataMissingValue(101) keys = sorted(smv.MISSING_VALUES.keys()) @@ -930,14 +930,13 @@ def test_missing_value_conversion(self, file, datapath): expected = DataFrame(data, columns=columns) parsed = read_stata( - datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, ) tm.assert_frame_equal(parsed, expected) - # Note this test starts at format version 108 as the missing code for double - # was different prior to this (see GH 58149) and would therefore fail - @pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"]) - def test_missing_value_conversion_compat(self, file, datapath): + @pytest.mark.parametrize("version", [104, 105, 108, 110, 111]) + def test_missing_value_conversion_compat(self, version, datapath): columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] smv = StataMissingValue(101) keys = sorted(smv.MISSING_VALUES.keys()) @@ -947,7 +946,25 @@ def test_missing_value_conversion_compat(self, file, datapath): expected = DataFrame(data, columns=columns) parsed = read_stata( - datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, + ) + tm.assert_frame_equal(parsed, expected) + + # The byte type was not supported prior to the 104 format + @pytest.mark.parametrize("version", [102, 103]) + def test_missing_value_conversion_compat_nobyte(self, version, datapath): + columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] + smv = StataMissingValue(101) + keys = sorted(smv.MISSING_VALUES.keys()) + data = [] + row = [StataMissingValue(keys[j * 27]) for j in [1, 1, 2, 3, 4]] + data.append(row) + expected = DataFrame(data, columns=columns) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, ) tm.assert_frame_equal(parsed, expected)