Skip to content

Commit eadeb0b

Browse files
mzeitlin11 authored and luckyvs1 committed
BUG: Fix precise_xstrtod segfault on long exponent (pandas-dev#38789)
1 parent 9f346e7 commit eadeb0b

File tree

4 files changed

+69
-9
lines changed

4 files changed

+69
-9
lines changed

doc/source/whatsnew/v1.2.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ Fixed regressions
2020
- Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`)
2121
- Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`)
2222
- Fixed regression in :meth:`DataFrame.any` and :meth:`DataFrame.all` not returning a result for tz-aware ``datetime64`` columns (:issue:`38723`)
23+
- Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings (:issue:`38753`)
2324
-
2425

2526
.. ---------------------------------------------------------------------------

pandas/_libs/src/parser/tokenizer.c

+7 -4
Original file line numberDiff line numberDiff line change
@@ -1726,7 +1726,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17261726
// Process string of digits.
17271727
num_digits = 0;
17281728
n = 0;
1729-
while (isdigit_ascii(*p)) {
1729+
while (num_digits < max_digits && isdigit_ascii(*p)) {
17301730
n = n * 10 + (*p - '0');
17311731
num_digits++;
17321732
p++;
@@ -1747,10 +1747,13 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17471747
} else if (exponent > 0) {
17481748
number *= e[exponent];
17491749
} else if (exponent < -308) { // Subnormal
1750-
if (exponent < -616) // Prevent invalid array access.
1750+
if (exponent < -616) { // Prevent invalid array access.
17511751
number = 0.;
1752-
number /= e[-308 - exponent];
1753-
number /= e[308];
1752+
} else {
1753+
number /= e[-308 - exponent];
1754+
number /= e[308];
1755+
}
1756+
17541757
} else {
17551758
number /= e[-exponent];
17561759
}

pandas/tests/io/parser/conftest.py

+27
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,33 @@ def python_parser_only(request):
9797
return request.param
9898

9999

100+
def _get_all_parser_float_precision_combinations():
101+
"""
102+
Return all allowable parser and float precision
103+
combinations and corresponding ids.
104+
"""
105+
params = []
106+
ids = []
107+
for parser, parser_id in zip(_all_parsers, _all_parser_ids):
108+
for precision in parser.float_precision_choices:
109+
params.append((parser, precision))
110+
ids.append(f"{parser_id}-{precision}")
111+
112+
return {"params": params, "ids": ids}
113+
114+
115+
@pytest.fixture(
116+
params=_get_all_parser_float_precision_combinations()["params"],
117+
ids=_get_all_parser_float_precision_combinations()["ids"],
118+
)
119+
def all_parsers_all_precisions(request):
120+
"""
121+
Fixture for all allowable combinations of parser
122+
and float precision
123+
"""
124+
return request.param
125+
126+
100127
_utf_values = [8, 16, 32]
101128

102129
_encoding_seps = ["", "-", "_"]

pandas/tests/io/parser/test_common.py

+34 -5
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import pytest
1616

1717
from pandas._libs.tslib import Timestamp
18+
from pandas.compat import is_platform_linux
1819
from pandas.errors import DtypeWarning, EmptyDataError, ParserError
1920
import pandas.util._test_decorators as td
2021

@@ -1259,15 +1260,14 @@ def test_float_parser(all_parsers):
12591260
tm.assert_frame_equal(result, expected)
12601261

12611262

1262-
def test_scientific_no_exponent(all_parsers):
1263+
def test_scientific_no_exponent(all_parsers_all_precisions):
12631264
# see gh-12215
12641265
df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]})
12651266
data = df.to_csv(index=False)
1266-
parser = all_parsers
1267+
parser, precision = all_parsers_all_precisions
12671268

1268-
for precision in parser.float_precision_choices:
1269-
df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
1270-
tm.assert_frame_equal(df_roundtrip, df)
1269+
df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
1270+
tm.assert_frame_equal(df_roundtrip, df)
12711271

12721272

12731273
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
@@ -1351,6 +1351,35 @@ def test_numeric_range_too_wide(all_parsers, exp_data):
13511351
tm.assert_frame_equal(result, expected)
13521352

13531353

1354+
@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999])
1355+
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
1356+
# GH#38753
1357+
parser, precision = all_parsers_all_precisions
1358+
data = f"data\n10E{neg_exp}"
1359+
result = parser.read_csv(StringIO(data), float_precision=precision)
1360+
expected = DataFrame({"data": [0.0]})
1361+
tm.assert_frame_equal(result, expected)
1362+
1363+
1364+
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
1365+
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
1366+
# GH#38753
1367+
parser, precision = all_parsers_all_precisions
1368+
data = f"data\n10E{exp}"
1369+
result = parser.read_csv(StringIO(data), float_precision=precision)
1370+
if precision == "round_trip":
1371+
if exp == 999999999999999999 and is_platform_linux():
1372+
mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
1373+
request.node.add_marker(mark)
1374+
1375+
value = np.inf if exp > 0 else 0.0
1376+
expected = DataFrame({"data": [value]})
1377+
else:
1378+
expected = DataFrame({"data": [f"10E{exp}"]})
1379+
1380+
tm.assert_frame_equal(result, expected)
1381+
1382+
13541383
@pytest.mark.parametrize("iterator", [True, False])
13551384
def test_empty_with_nrows_chunksize(all_parsers, iterator):
13561385
# see gh-9535

0 commit comments

Comments
 (0)