From 6f20675428ed240fd1f886052f6c75dc30217c0e Mon Sep 17 00:00:00 2001
From: Chris Charlton <c.charlton@bristol.ac.uk>
Date: Fri, 26 Jul 2024 13:15:45 +0100
Subject: [PATCH 1/2] BUG: Missing value code not recognised for Stata format
 version 105 and earlier

---
 doc/source/whatsnew/v3.0.0.rst            |   1 +
 pandas/io/stata.py                        |   7 ++++
 pandas/tests/io/data/stata/stata1_102.dta | Bin 0 -> 362 bytes
 pandas/tests/io/data/stata/stata1_103.dta | Bin 0 -> 364 bytes
 pandas/tests/io/data/stata/stata1_104.dta | Bin 0 -> 363 bytes
 pandas/tests/io/data/stata/stata1_105.dta | Bin 0 -> 409 bytes
 pandas/tests/io/data/stata/stata8_102.dta | Bin 0 -> 362 bytes
 pandas/tests/io/data/stata/stata8_103.dta | Bin 0 -> 364 bytes
 pandas/tests/io/data/stata/stata8_104.dta | Bin 0 -> 363 bytes
 pandas/tests/io/data/stata/stata8_105.dta | Bin 0 -> 409 bytes
 pandas/tests/io/test_stata.py             |  39 ++++++++++++++++------
 11 files changed, 36 insertions(+), 11 deletions(-)
 create mode 100644 pandas/tests/io/data/stata/stata1_102.dta
 create mode 100644 pandas/tests/io/data/stata/stata1_103.dta
 create mode 100644 pandas/tests/io/data/stata/stata1_104.dta
 create mode 100644 pandas/tests/io/data/stata/stata1_105.dta
 create mode 100644 pandas/tests/io/data/stata/stata8_102.dta
 create mode 100644 pandas/tests/io/data/stata/stata8_103.dta
 create mode 100644 pandas/tests/io/data/stata/stata8_104.dta
 create mode 100644 pandas/tests/io/data/stata/stata8_105.dta

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index e71220102cbb4..768b12ba1007f 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -584,6 +584,7 @@ I/O
 - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
 - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
 - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
+- Bug in :meth:`read_stata` where the missing value code for the double type was not recognised for format versions 105 and prior (:issue:`58149`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 03c15d0ab07bb..2ef9ed53ae86f 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1821,6 +1821,13 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
         replacements = {}
         for i in range(len(data.columns)):
             fmt = self._typlist[i]
+            # missing code for double was different in version 105 and prior
+            # recode instances of this to the currently used value
+            if self._format_version <= 105 and fmt == "d":
+                data.iloc[:, i] = data.iloc[:, i].replace(
+                    float.fromhex("0x1.0p333"), self.MISSING_VALUES["d"]
+                )
+
             if self._format_version <= 111:
                 if fmt not in self.OLD_VALID_RANGE:
                     continue
diff --git a/pandas/tests/io/data/stata/stata1_102.dta b/pandas/tests/io/data/stata/stata1_102.dta
new file mode 100644
index 0000000000000000000000000000000000000000..d0ca1b2a8c02d7053e9dea85f60c070758ceba7a
GIT binary patch
literal 362
zcmYdeU}RtgVnQG-B{MT8Ej~B1xEQE31;$8%F*0F{92f)HL{&>YgLDQ4RYL<1t!e?`
bK`BEcFc(?<Fksh1{BR)Te?1WX|6dOPikBS-

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata1_103.dta b/pandas/tests/io/data/stata/stata1_103.dta
new file mode 100644
index 0000000000000000000000000000000000000000..98072ba6bd4fc1d85cce2f48dce0259c6a530414
GIT binary patch
literal 364
zcmYdiVq{=tU}PW+(o!-rbJF5-GmDFXic?^WBp4$T#>jy&kj+%J)H6tDU{EzQ0MV)z
c5FV5=Gy-#x)ei%9J;V<OLjKnS;s5{j0FSjD3jhEB

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata1_104.dta b/pandas/tests/io/data/stata/stata1_104.dta
new file mode 100644
index 0000000000000000000000000000000000000000..a46aeb9128ecfb85972caacb24f109cb423eef28
GIT binary patch
literal 363
zcmc~`Vq{=tU}PW+(o&K#bJF5-GmDFXic?^WBp4$T#>jy&kj+%J)H6tDU{EzQ0MV)z
d5FV5=Gy-#x)ei%9J;V<OLhAq5|NsBL9sp_F90mXY

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata1_105.dta b/pandas/tests/io/data/stata/stata1_105.dta
new file mode 100644
index 0000000000000000000000000000000000000000..ba2c463486dbfa4d11dd5cb4762653e267d2ab20
GIT binary patch
literal 409
zcmc~~Vq{=tU}PW+3{4aqOVbsM3=EAG3{9*|EE&>Lk}`AB;&U^Li-FoxV2mUfBNN8R
wfiaM6QMJ@FNQVlj8XCYksupl20}9*F2qA%N(C}c_L%eYyr2c>X|NsB%0kr-fy8r+H

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata8_102.dta b/pandas/tests/io/data/stata/stata8_102.dta
new file mode 100644
index 0000000000000000000000000000000000000000..5d3a4fb171e9cd58d763080649c593989b4ba18b
GIT binary patch
literal 362
zcmYdeU}RtgVnQG@Gbb%2Gq1!V9;6b;Ff;?PfDB_J5F;%oKM~9XGt5jtVroo40t{3w
e^$gM(81PXrrNe^$Uk`-;|JMUO1HwRXAOrxWY8(>)

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata8_103.dta b/pandas/tests/io/data/stata/stata8_103.dta
new file mode 100644
index 0000000000000000000000000000000000000000..623a21e37650f5a308047b14bb1df86d7abe88a1
GIT binary patch
literal 364
zcmYdiVq{=tU}PW+GBb11QZn;OEaE|Gfeb@45DUmKHUcrya`F?wOfbXD1SF=$1SG&f
f)l$zOoq+)#1yedK=>PRV`2T-B&@&(m1P4L@tB4#H

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata8_104.dta b/pandas/tests/io/data/stata/stata8_104.dta
new file mode 100644
index 0000000000000000000000000000000000000000..df79d6a8af23018aafb2a2bf2b4fac488bad6d67
GIT binary patch
literal 363
zcmc~`Vq{=tU}PW+k}`ABQZn;OEaE|Gfeb@45DUmKHUcrya`F?wOfbXD1SF=$1SG&f
g)l$zOoq+)#1yedK==%Tl|NsB52l@qsf#5(00DLtY5dZ)H

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/data/stata/stata8_105.dta b/pandas/tests/io/data/stata/stata8_105.dta
new file mode 100644
index 0000000000000000000000000000000000000000..cf01463a83d8146fc7736a0ec4db0581bfd393f0
GIT binary patch
literal 409
zcmc~~Vq{=tU}PW+49yiBOVbsM3=B<xgq5K&LsDi=T1sYKiA6j}H;`dy24Vpj#zr7U
vT26i<m<eW>nSjL9n1BQrs9Ne7q?5^pY8y6G{r~#^|Nqwm{R6^4a3BN#rLrEr

literal 0
HcmV?d00001

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index fb7182fdefb32..c2c4140fa304d 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -120,9 +120,9 @@ def test_read_index_col_none(self, version, temp_file):
         expected["a"] = expected["a"].astype(np.int32)
         tm.assert_frame_equal(read_df, expected, check_index_type=True)
 
-    # Note this test starts at format version 108 as the missing code for double
-    # was different prior to this (see GH 58149) and would therefore fail
-    @pytest.mark.parametrize("version", [108, 110, 111, 113, 114, 115, 117, 118, 119])
+    @pytest.mark.parametrize(
+        "version", [102, 103, 104, 105, 108, 110, 111, 113, 114, 115, 117, 118, 119]
+    )
     def test_read_dta1(self, version, datapath):
         file = datapath("io", "data", "stata", f"stata1_{version}.dta")
         parsed = self.read_dta(file)
@@ -918,8 +918,8 @@ def test_missing_value_generator(self, temp_file):
         )
         assert val.string == ".z"
 
-    @pytest.mark.parametrize("file", ["stata8_113", "stata8_115", "stata8_117"])
-    def test_missing_value_conversion(self, file, datapath):
+    @pytest.mark.parametrize("version", [113, 115, 117])
+    def test_missing_value_conversion(self, version, datapath):
         columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
         smv = StataMissingValue(101)
         keys = sorted(smv.MISSING_VALUES.keys())
@@ -930,14 +930,13 @@ def test_missing_value_conversion(self, file, datapath):
         expected = DataFrame(data, columns=columns)
 
         parsed = read_stata(
-            datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
         )
         tm.assert_frame_equal(parsed, expected)
 
-    # Note this test starts at format version 108 as the missing code for double
-    # was different prior to this (see GH 58149) and would therefore fail
-    @pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"])
-    def test_missing_value_conversion_compat(self, file, datapath):
+    @pytest.mark.parametrize("version", [104, 105, 108, 110, 111])
+    def test_missing_value_conversion_compat(self, version, datapath):
         columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
         smv = StataMissingValue(101)
         keys = sorted(smv.MISSING_VALUES.keys())
@@ -947,7 +946,25 @@ def test_missing_value_conversion_compat(self, file, datapath):
         expected = DataFrame(data, columns=columns)
 
         parsed = read_stata(
-            datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+    # No byte type before format 104, so int8_ expects the same code as int16_
+    @pytest.mark.parametrize("version", [102, 103])
+    def test_missing_value_conversion_compat_nobyte(self, version, datapath):
+        columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
+        smv = StataMissingValue(101)
+        keys = sorted(smv.MISSING_VALUES.keys())
+        data = []
+        row = [StataMissingValue(keys[j * 27]) for j in [1, 1, 2, 3, 4]]
+        data.append(row)
+        expected = DataFrame(data, columns=columns)
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
         )
         tm.assert_frame_equal(parsed, expected)
 

From 15409f8b43a063bdcacfefa443e067213a05d6a5 Mon Sep 17 00:00:00 2001
From: Chris Charlton <c.charlton@bristol.ac.uk>
Date: Fri, 26 Jul 2024 19:14:14 +0100
Subject: [PATCH 2/2] Move definition of the old missing value constant for the
 double type out of the loop

---
 pandas/io/stata.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 2ef9ed53ae86f..4be06f93689f2 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1817,15 +1817,17 @@ def read(
         return data
 
     def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame:
+        # missing value code for double differed in format versions 105 and prior
+        old_missingdouble = float.fromhex("0x1.0p333")
+
         # Check for missing values, and replace if found
         replacements = {}
         for i in range(len(data.columns)):
             fmt = self._typlist[i]
-            # missing code for double was different in version 105 and prior
-            # recode instances of this to the currently used value
+            # recode instances of the old missing code to the currently used value
             if self._format_version <= 105 and fmt == "d":
                 data.iloc[:, i] = data.iloc[:, i].replace(
-                    float.fromhex("0x1.0p333"), self.MISSING_VALUES["d"]
+                    old_missingdouble, self.MISSING_VALUES["d"]
                 )
 
             if self._format_version <= 111: