@@ -120,9 +120,11 @@ def test_read_index_col_none(self, version, temp_file):
120
120
expected ["a" ] = expected ["a" ].astype (np .int32 )
121
121
tm .assert_frame_equal (read_df , expected , check_index_type = True )
122
122
123
- @pytest .mark .parametrize ("file" , ["stata1_114" , "stata1_117" ])
124
- def test_read_dta1 (self , file , datapath ):
125
- file = datapath ("io" , "data" , "stata" , f"{ file } .dta" )
123
+ # Note this test starts at format version 108 as the missing code for double
124
+ # was different prior to this (see GH 58149) and would therefore fail
125
+ @pytest .mark .parametrize ("version" , [108 , 110 , 111 , 113 , 114 , 115 , 117 , 118 , 119 ])
126
+ def test_read_dta1 (self , version , datapath ):
127
+ file = datapath ("io" , "data" , "stata" , f"stata1_{ version } .dta" )
126
128
parsed = self .read_dta (file )
127
129
128
130
# Pandas uses np.nan as missing value.
@@ -136,6 +138,18 @@ def test_read_dta1(self, file, datapath):
136
138
# the casting doesn't fail so need to match stata here
137
139
expected ["float_miss" ] = expected ["float_miss" ].astype (np .float32 )
138
140
141
+ # Column names too long for older Stata formats
142
+ if version <= 108 :
143
+ expected = expected .rename (
144
+ columns = {
145
+ "float_miss" : "f_miss" ,
146
+ "double_miss" : "d_miss" ,
147
+ "byte_miss" : "b_miss" ,
148
+ "int_miss" : "i_miss" ,
149
+ "long_miss" : "l_miss" ,
150
+ }
151
+ )
152
+
139
153
tm .assert_frame_equal (parsed , expected )
140
154
141
155
def test_read_dta2 (self , datapath ):
@@ -920,6 +934,23 @@ def test_missing_value_conversion(self, file, datapath):
920
934
)
921
935
tm .assert_frame_equal (parsed , expected )
922
936
937
# Note this test starts at format version 108 as the missing code for double
# was different prior to this (see GH 58149) and would therefore fail
@pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"])
def test_missing_value_conversion_compat(self, file, datapath):
    """Check ``read_stata(..., convert_missing=True)`` on older-format files.

    The expected frame is a single row of ``StataMissingValue`` objects, one
    per column, built from every 27th entry of the sorted
    ``StataMissingValue.MISSING_VALUES`` keys.

    Parameters
    ----------
    file : str
        Stem of the ``.dta`` file under ``io/data/stata``.
    datapath : fixture
        Resolves path components to a test data file path.
    """
    # Sorted keys of the missing-value table; the stride of 27 selects one
    # key per column (5 columns in total).
    sorted_keys = sorted(StataMissingValue(101).MISSING_VALUES)
    expected = DataFrame(
        [[StataMissingValue(sorted_keys[27 * col]) for col in range(5)]],
        columns=["int8_", "int16_", "int32_", "float32_", "float64_"],
    )

    dta_path = datapath("io", "data", "stata", f"{file}.dta")
    parsed = read_stata(dta_path, convert_missing=True)
    tm.assert_frame_equal(parsed, expected)
953
+
923
954
def test_big_dates (self , datapath , temp_file ):
924
955
yr = [1960 , 2000 , 9999 , 100 , 2262 , 1677 ]
925
956
mo = [1 , 1 , 12 , 1 , 4 , 9 ]
@@ -2035,6 +2066,52 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path):
2035
2066
2036
2067
tm .assert_frame_equal (written_and_read_again .set_index ("index" ), expected )
2037
2068
2069
@pytest.mark.parametrize("version", [113, 114, 115, 117, 118, 119])
def test_read_data_int_validranges(self, version, datapath):
    """Read ``stata_int_validranges_{version}.dta`` and compare exactly.

    The file is expected to contain two rows in the columns ``byte``,
    ``int`` and ``long``, parsed as int8, int16 and int32 respectively.

    Parameters
    ----------
    version : int
        Format version embedded in the data file name.
    datapath : fixture
        Resolves path components to a test data file path.
    """
    byte_col = np.array([-127, 100], dtype=np.int8)
    int_col = np.array([-32767, 32740], dtype=np.int16)
    long_col = np.array([-2147483647, 2147483620], dtype=np.int32)
    expected = DataFrame({"byte": byte_col, "int": int_col, "long": long_col})

    dta_path = datapath(
        "io", "data", "stata", f"stata_int_validranges_{version}.dta"
    )
    parsed = read_stata(dta_path)
    tm.assert_frame_equal(parsed, expected)
2083
+
2084
@pytest.mark.parametrize("version", [104, 105, 108, 110, 111])
def test_read_data_int_validranges_compat(self, version, datapath):
    """Read the older-format ``stata_int_validranges_{version}.dta`` files.

    The expected values differ from those used for format versions 113 and
    later: the minimum of each column is the dtype's lowest value and the
    maximum is two below the dtype's highest value.

    Parameters
    ----------
    version : int
        Format version embedded in the data file name.
    datapath : fixture
        Resolves path components to a test data file path.
    """
    byte_col = np.array([-128, 126], dtype=np.int8)
    int_col = np.array([-32768, 32766], dtype=np.int16)
    long_col = np.array([-2147483648, 2147483646], dtype=np.int32)
    expected = DataFrame({"byte": byte_col, "int": int_col, "long": long_col})

    dta_path = datapath(
        "io", "data", "stata", f"stata_int_validranges_{version}.dta"
    )
    parsed = read_stata(dta_path)
    tm.assert_frame_equal(parsed, expected)
2098
+
2099
# The byte type was not supported prior to the 104 format
@pytest.mark.parametrize("version", [102, 103])
def test_read_data_int_validranges_compat_nobyte(self, version, datapath):
    """Read ``stata_int_validranges_{version}.dta`` for formats without byte.

    Same expected values as the 104-111 variant of this test, except that
    the ``byte`` column is expected to come back as int16 rather than int8.

    Parameters
    ----------
    version : int
        Format version embedded in the data file name.
    datapath : fixture
        Resolves path components to a test data file path.
    """
    # "byte" is int16 here, matching the "int" column's dtype
    byte_col = np.array([-128, 126], dtype=np.int16)
    int_col = np.array([-32768, 32766], dtype=np.int16)
    long_col = np.array([-2147483648, 2147483646], dtype=np.int32)
    expected = DataFrame({"byte": byte_col, "int": int_col, "long": long_col})

    dta_path = datapath(
        "io", "data", "stata", f"stata_int_validranges_{version}.dta"
    )
    parsed = read_stata(dta_path)
    tm.assert_frame_equal(parsed, expected)
2114
+
2038
2115
2039
2116
@pytest .mark .parametrize ("version" , [105 , 108 , 110 , 111 , 113 , 114 ])
2040
2117
def test_backward_compat (version , datapath ):
0 commit comments