From 6f20675428ed240fd1f886052f6c75dc30217c0e Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Fri, 26 Jul 2024 13:15:45 +0100 Subject: [PATCH 1/2] BUG: Missing value code not recognised for Stata format version 105 and earlier --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/stata.py | 7 ++++ pandas/tests/io/data/stata/stata1_102.dta | Bin 0 -> 362 bytes pandas/tests/io/data/stata/stata1_103.dta | Bin 0 -> 364 bytes pandas/tests/io/data/stata/stata1_104.dta | Bin 0 -> 363 bytes pandas/tests/io/data/stata/stata1_105.dta | Bin 0 -> 409 bytes pandas/tests/io/data/stata/stata8_102.dta | Bin 0 -> 362 bytes pandas/tests/io/data/stata/stata8_103.dta | Bin 0 -> 364 bytes pandas/tests/io/data/stata/stata8_104.dta | Bin 0 -> 363 bytes pandas/tests/io/data/stata/stata8_105.dta | Bin 0 -> 409 bytes pandas/tests/io/test_stata.py | 39 ++++++++++++++++------ 11 files changed, 36 insertions(+), 11 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata1_102.dta create mode 100644 pandas/tests/io/data/stata/stata1_103.dta create mode 100644 pandas/tests/io/data/stata/stata1_104.dta create mode 100644 pandas/tests/io/data/stata/stata1_105.dta create mode 100644 pandas/tests/io/data/stata/stata8_102.dta create mode 100644 pandas/tests/io/data/stata/stata8_103.dta create mode 100644 pandas/tests/io/data/stata/stata8_104.dta create mode 100644 pandas/tests/io/data/stata/stata8_105.dta diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e71220102cbb4..768b12ba1007f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -584,6 +584,7 @@ I/O - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`) +- Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) Period ^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 03c15d0ab07bb..2ef9ed53ae86f 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1821,6 +1821,13 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra replacements = {} for i in range(len(data.columns)): fmt = self._typlist[i] + # missing code for double was different in version 105 and prior + # recode instances of this to the currently used value + if self._format_version <= 105 and fmt == "d": + data.iloc[:, i] = data.iloc[:, i].replace( + float.fromhex("0x1.0p333"), self.MISSING_VALUES["d"] + ) + if self._format_version <= 111: if fmt not in self.OLD_VALID_RANGE: continue diff --git a/pandas/tests/io/data/stata/stata1_102.dta b/pandas/tests/io/data/stata/stata1_102.dta new file mode 100644 index 0000000000000000000000000000000000000000..d0ca1b2a8c02d7053e9dea85f60c070758ceba7a GIT binary patch literal 362 zcmYdeU}RtgVnQG-B{MT8Ej~B1xEQE31;$8%F*0F{92f)HL{&>YgLDQ4RYL<1t!e?` bK`BEcFc(?jy&kj+%J)H6tDU{EzQ0MV)z c5FV5=Gy-#x)ei%9J;Vjy&kj+%J)H6tDU{EzQ0MV)z d5FV5=Gy-#x)ei%9J;VLk}`AB;&U^Li-FoxV2mUfBNN8R wfiaM6QMJ@FNQVlj8XCYksupl20}9*F2qA%N(C}c_L%eYyr2c>X|NsB%0kr-fy8r+H literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_102.dta b/pandas/tests/io/data/stata/stata8_102.dta new file mode 100644 index 0000000000000000000000000000000000000000..5d3a4fb171e9cd58d763080649c593989b4ba18b GIT binary patch literal 362 zcmYdeU}RtgVnQG@Gbb%2Gq1!V9;6b;Ff;?PfDB_J5F;%oKM~9XGt5jtVroo40t{3w e^$gM(81PXrrNe^$Uk`-;|JMUO1HwRXAOrxWY8(>) literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_103.dta b/pandas/tests/io/data/stata/stata8_103.dta new file mode 100644 index 0000000000000000000000000000000000000000..623a21e37650f5a308047b14bb1df86d7abe88a1 GIT binary patch literal 364 zcmYdiVq{=tU}PW+GBb11QZn;OEaE|Gfeb@45DUmKHUcrya`F?wOfbXD1SF=$1SG&f f)l$zOoq+)#1yedK=>PRV`2T-B&@&(m1P4L@tB4#H literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_104.dta b/pandas/tests/io/data/stata/stata8_104.dta new file mode 100644 index 0000000000000000000000000000000000000000..df79d6a8af23018aafb2a2bf2b4fac488bad6d67 GIT binary patch literal 363 zcmc~`Vq{=tU}PW+k}`ABQZn;OEaE|Gfeb@45DUmKHUcrya`F?wOfbXD1SF=$1SG&f g)l$zOoq+)#1yedK==%Tl|NsB52l@qsf#5(00DLtY5dZ)H literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata8_105.dta b/pandas/tests/io/data/stata/stata8_105.dta new file mode 100644 index 0000000000000000000000000000000000000000..cf01463a83d8146fc7736a0ec4db0581bfd393f0 GIT binary patch literal 409 zcmc~~Vq{=tU}PW+49yiBOVbsM3=BnSjL9n1BQrs9Ne7q?5^pY8y6G{r~#^|Nqwm{R6^4a3BN#rLrEr literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index fb7182fdefb32..c2c4140fa304d 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -120,9 +120,9 @@ def test_read_index_col_none(self, version, temp_file): expected["a"] = expected["a"].astype(np.int32) tm.assert_frame_equal(read_df, expected, check_index_type=True) - # Note this test starts at format version 108 as the missing code for double - # was different prior to this (see GH 58149) and would therefore fail - @pytest.mark.parametrize("version", [108, 110, 111, 113, 114, 115, 117, 118, 119]) + @pytest.mark.parametrize( + "version", [102, 103, 104, 105, 108, 110, 111, 113, 114, 115, 117, 118, 119] + ) def test_read_dta1(self, version, datapath): file = datapath("io", "data", "stata", f"stata1_{version}.dta") parsed = self.read_dta(file) @@ -918,8 +918,8 @@ def test_missing_value_generator(self, temp_file): ) assert val.string == ".z" - @pytest.mark.parametrize("file", ["stata8_113", "stata8_115", "stata8_117"]) - def test_missing_value_conversion(self, file, datapath): + @pytest.mark.parametrize("version", [113, 115, 117]) + def test_missing_value_conversion(self, version, datapath): columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] smv = StataMissingValue(101) keys = sorted(smv.MISSING_VALUES.keys()) @@ -930,14 +930,13 @@ def test_missing_value_conversion(self, file, datapath): expected = DataFrame(data, columns=columns) parsed = read_stata( - datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, ) tm.assert_frame_equal(parsed, expected) - # Note this test starts at format version 108 as the missing code for double - # was different prior to this (see GH 58149) and would therefore fail - @pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"]) - def test_missing_value_conversion_compat(self, file, datapath): + @pytest.mark.parametrize("version", [104, 105, 108, 110, 111]) + def test_missing_value_conversion_compat(self, version, datapath): columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] smv = StataMissingValue(101) keys = sorted(smv.MISSING_VALUES.keys()) @@ -947,7 +946,25 @@ def test_missing_value_conversion_compat(self, file, datapath): expected = DataFrame(data, columns=columns) parsed = read_stata( - datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, + ) + tm.assert_frame_equal(parsed, expected) + + # The byte type was not supported prior to the 104 format + @pytest.mark.parametrize("version", [102, 103]) + def test_missing_value_conversion_compat_nobyte(self, version, datapath): + columns = ["int8_", "int16_", "int32_", "float32_", "float64_"] + smv = StataMissingValue(101) + keys = sorted(smv.MISSING_VALUES.keys()) + data = [] + row = [StataMissingValue(keys[j * 27]) for j in [1, 1, 2, 3, 4]] + data.append(row) + expected = DataFrame(data, columns=columns) + + parsed = read_stata( + datapath("io", "data", "stata", f"stata8_{version}.dta"), + convert_missing=True, ) tm.assert_frame_equal(parsed, expected) From 15409f8b43a063bdcacfefa443e067213a05d6a5 Mon Sep 17 00:00:00 2001 From: Chris Charlton Date: Fri, 26 Jul 2024 19:14:14 +0100 Subject: [PATCH 2/2] Move definition of the old missing value constant for the double type out of the loop --- pandas/io/stata.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2ef9ed53ae86f..4be06f93689f2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1817,15 +1817,17 @@ def read( return data def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame: + # missing code for double was different in version 105 and prior + old_missingdouble = float.fromhex("0x1.0p333") + # Check for missing values, and replace if found replacements = {} for i in range(len(data.columns)): fmt = self._typlist[i] - # missing code for double was different in version 105 and prior - # recode instances of this to the currently used value + # recode instances of the old missing code to the currently used value if self._format_version <= 105 and fmt == "d": data.iloc[:, i] = data.iloc[:, i].replace( - float.fromhex("0x1.0p333"), self.MISSING_VALUES["d"] + old_missingdouble, self.MISSING_VALUES["d"] ) if self._format_version <= 111: