Skip to content

Commit d6c9941

Browse files
authored
BUG: Integer values at the top end of the supported range incorrectly… (#59310)
* BUG: Integer values at the top end of the supported range incorrectly interpreted as missing for format versions 111 and prior
* StataMissingValue expects the value passed in to be of float type, so cast to this
* Add type hint to StataParser.MISSING_VALUES to avoid mypy error when constructing StataMissingValue from value
1 parent ecea7c3 commit d6c9941

26 files changed

+112
-9
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,7 @@ I/O
583583
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
584584
- Bug in :meth:`read_json` not validating that the ``typ`` argument is exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
585585
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
586+
- Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
586587

587588
Period
588589
^^^^^^

pandas/io/stata.py

+31-6
Original file line numberDiff line numberDiff line change
@@ -983,6 +983,19 @@ def __init__(self) -> None:
983983
np.float64(struct.unpack("<d", float64_max)[0]),
984984
),
985985
}
986+
self.OLD_VALID_RANGE = {
987+
"b": (-128, 126),
988+
"h": (-32768, 32766),
989+
"l": (-2147483648, 2147483646),
990+
"f": (
991+
np.float32(struct.unpack("<f", float32_min)[0]),
992+
np.float32(struct.unpack("<f", float32_max)[0]),
993+
),
994+
"d": (
995+
np.float64(struct.unpack("<d", float64_min)[0]),
996+
np.float64(struct.unpack("<d", float64_max)[0]),
997+
),
998+
}
986999

9871000
self.OLD_TYPE_MAPPING = {
9881001
98: 251, # byte
@@ -994,7 +1007,7 @@ def __init__(self) -> None:
9941007

9951008
# These missing values are the generic '.' in Stata, and are used
9961009
# to replace nans
997-
self.MISSING_VALUES = {
1010+
self.MISSING_VALUES: dict[str, int | np.float32 | np.float64] = {
9981011
"b": 101,
9991012
"h": 32741,
10001013
"l": 2147483621,
@@ -1808,11 +1821,18 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
18081821
replacements = {}
18091822
for i in range(len(data.columns)):
18101823
fmt = self._typlist[i]
1811-
if fmt not in self.VALID_RANGE:
1812-
continue
1824+
if self._format_version <= 111:
1825+
if fmt not in self.OLD_VALID_RANGE:
1826+
continue
18131827

1814-
fmt = cast(str, fmt) # only strs in VALID_RANGE
1815-
nmin, nmax = self.VALID_RANGE[fmt]
1828+
fmt = cast(str, fmt) # only strs in OLD_VALID_RANGE
1829+
nmin, nmax = self.OLD_VALID_RANGE[fmt]
1830+
else:
1831+
if fmt not in self.VALID_RANGE:
1832+
continue
1833+
1834+
fmt = cast(str, fmt) # only strs in VALID_RANGE
1835+
nmin, nmax = self.VALID_RANGE[fmt]
18161836
series = data.iloc[:, i]
18171837

18181838
# appreciably faster to do this with ndarray instead of Series
@@ -1827,7 +1847,12 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra
18271847
umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
18281848
replacement = Series(series, dtype=object)
18291849
for j, um in enumerate(umissing):
1830-
missing_value = StataMissingValue(um)
1850+
if self._format_version <= 111:
1851+
missing_value = StataMissingValue(
1852+
float(self.MISSING_VALUES[fmt])
1853+
)
1854+
else:
1855+
missing_value = StataMissingValue(um)
18311856

18321857
loc = missing_loc[umissing_loc == j]
18331858
replacement.iloc[loc] = missing_value
703 Bytes
Binary file not shown.
945 Bytes
Binary file not shown.
945 Bytes
Binary file not shown.
945 Bytes
Binary file not shown.
1.1 KB
Binary file not shown.
3.69 KB
Binary file not shown.
3.7 KB
Binary file not shown.
703 Bytes
Binary file not shown.
945 Bytes
Binary file not shown.
945 Bytes
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

pandas/tests/io/test_stata.py

+80-3
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,11 @@ def test_read_index_col_none(self, version, temp_file):
120120
expected["a"] = expected["a"].astype(np.int32)
121121
tm.assert_frame_equal(read_df, expected, check_index_type=True)
122122

123-
@pytest.mark.parametrize("file", ["stata1_114", "stata1_117"])
124-
def test_read_dta1(self, file, datapath):
125-
file = datapath("io", "data", "stata", f"{file}.dta")
123+
# Note this test starts at format version 108 as the missing code for double
124+
# was different prior to this (see GH 58149) and would therefore fail
125+
@pytest.mark.parametrize("version", [108, 110, 111, 113, 114, 115, 117, 118, 119])
126+
def test_read_dta1(self, version, datapath):
127+
file = datapath("io", "data", "stata", f"stata1_{version}.dta")
126128
parsed = self.read_dta(file)
127129

128130
# Pandas uses np.nan as missing value.
@@ -136,6 +138,18 @@ def test_read_dta1(self, file, datapath):
136138
# the casting doesn't fail so need to match stata here
137139
expected["float_miss"] = expected["float_miss"].astype(np.float32)
138140

141+
# Column names too long for older Stata formats
142+
if version <= 108:
143+
expected = expected.rename(
144+
columns={
145+
"float_miss": "f_miss",
146+
"double_miss": "d_miss",
147+
"byte_miss": "b_miss",
148+
"int_miss": "i_miss",
149+
"long_miss": "l_miss",
150+
}
151+
)
152+
139153
tm.assert_frame_equal(parsed, expected)
140154

141155
def test_read_dta2(self, datapath):
@@ -920,6 +934,23 @@ def test_missing_value_conversion(self, file, datapath):
920934
)
921935
tm.assert_frame_equal(parsed, expected)
922936

937+
# Note this test starts at format version 108 as the missing code for double
938+
# was different prior to this (see GH 58149) and would therefore fail
939+
@pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"])
940+
def test_missing_value_conversion_compat(self, file, datapath):
941+
columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
942+
smv = StataMissingValue(101)
943+
keys = sorted(smv.MISSING_VALUES.keys())
944+
data = []
945+
row = [StataMissingValue(keys[j * 27]) for j in range(5)]
946+
data.append(row)
947+
expected = DataFrame(data, columns=columns)
948+
949+
parsed = read_stata(
950+
datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True
951+
)
952+
tm.assert_frame_equal(parsed, expected)
953+
923954
def test_big_dates(self, datapath, temp_file):
924955
yr = [1960, 2000, 9999, 100, 2262, 1677]
925956
mo = [1, 1, 12, 1, 4, 9]
@@ -2035,6 +2066,52 @@ def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path):
20352066

20362067
tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
20372068

2069+
@pytest.mark.parametrize("version", [113, 114, 115, 117, 118, 119])
2070+
def test_read_data_int_validranges(self, version, datapath):
2071+
expected = DataFrame(
2072+
{
2073+
"byte": np.array([-127, 100], dtype=np.int8),
2074+
"int": np.array([-32767, 32740], dtype=np.int16),
2075+
"long": np.array([-2147483647, 2147483620], dtype=np.int32),
2076+
}
2077+
)
2078+
2079+
parsed = read_stata(
2080+
datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta")
2081+
)
2082+
tm.assert_frame_equal(parsed, expected)
2083+
2084+
@pytest.mark.parametrize("version", [104, 105, 108, 110, 111])
2085+
def test_read_data_int_validranges_compat(self, version, datapath):
2086+
expected = DataFrame(
2087+
{
2088+
"byte": np.array([-128, 126], dtype=np.int8),
2089+
"int": np.array([-32768, 32766], dtype=np.int16),
2090+
"long": np.array([-2147483648, 2147483646], dtype=np.int32),
2091+
}
2092+
)
2093+
2094+
parsed = read_stata(
2095+
datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta")
2096+
)
2097+
tm.assert_frame_equal(parsed, expected)
2098+
2099+
# The byte type was not supported prior to the 104 format
2100+
@pytest.mark.parametrize("version", [102, 103])
2101+
def test_read_data_int_validranges_compat_nobyte(self, version, datapath):
2102+
expected = DataFrame(
2103+
{
2104+
"byte": np.array([-128, 126], dtype=np.int16),
2105+
"int": np.array([-32768, 32766], dtype=np.int16),
2106+
"long": np.array([-2147483648, 2147483646], dtype=np.int32),
2107+
}
2108+
)
2109+
2110+
parsed = read_stata(
2111+
datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta")
2112+
)
2113+
tm.assert_frame_equal(parsed, expected)
2114+
20382115

20392116
@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114])
20402117
def test_backward_compat(version, datapath):

0 commit comments

Comments
 (0)