Skip to content

Commit 9faf502

Browse files
mzeitlin11 authored and meeseeksmachine committed
Backport PR pandas-dev#38789: BUG: Fix precise_xstrtod segfault on long exponent
1 parent 46f896d commit 9faf502

File tree

4 files changed: +69 -9 lines changed

doc/source/whatsnew/v1.2.1.rst

+1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ Fixed regressions
1919
- Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`)
2020
- Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`)
2121
- Fixed regression in :meth:`DataFrame.any` and :meth:`DataFrame.all` not returning a result for tz-aware ``datetime64`` columns (:issue:`38723`)
22+
- Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings (:issue:`38753`)
2223
-
2324

2425
.. ---------------------------------------------------------------------------

pandas/_libs/src/parser/tokenizer.c

+7 -4
Original file line number | Diff line number | Diff line change
@@ -1733,7 +1733,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17331733
// Process string of digits.
17341734
num_digits = 0;
17351735
n = 0;
1736-
while (isdigit_ascii(*p)) {
1736+
while (num_digits < max_digits && isdigit_ascii(*p)) {
17371737
n = n * 10 + (*p - '0');
17381738
num_digits++;
17391739
p++;
@@ -1754,10 +1754,13 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
17541754
} else if (exponent > 0) {
17551755
number *= e[exponent];
17561756
} else if (exponent < -308) { // Subnormal
1757-
if (exponent < -616) // Prevent invalid array access.
1757+
if (exponent < -616) { // Prevent invalid array access.
17581758
number = 0.;
1759-
number /= e[-308 - exponent];
1760-
number /= e[308];
1759+
} else {
1760+
number /= e[-308 - exponent];
1761+
number /= e[308];
1762+
}
1763+
17611764
} else {
17621765
number /= e[-exponent];
17631766
}

pandas/tests/io/parser/conftest.py

+27
Original file line number | Diff line number | Diff line change
@@ -97,6 +97,33 @@ def python_parser_only(request):
9797
return request.param
9898

9999

100+
def _get_all_parser_float_precision_combinations():
101+
"""
102+
Return all allowable parser and float precision
103+
combinations and corresponding ids.
104+
"""
105+
params = []
106+
ids = []
107+
for parser, parser_id in zip(_all_parsers, _all_parser_ids):
108+
for precision in parser.float_precision_choices:
109+
params.append((parser, precision))
110+
ids.append(f"{parser_id}-{precision}")
111+
112+
return {"params": params, "ids": ids}
113+
114+
115+
@pytest.fixture(
116+
params=_get_all_parser_float_precision_combinations()["params"],
117+
ids=_get_all_parser_float_precision_combinations()["ids"],
118+
)
119+
def all_parsers_all_precisions(request):
120+
"""
121+
Fixture for all allowable combinations of parser
122+
and float precision
123+
"""
124+
return request.param
125+
126+
100127
_utf_values = [8, 16, 32]
101128

102129
_encoding_seps = ["", "-", "_"]

pandas/tests/io/parser/test_common.py

+34 -5
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
import pytest
1616

1717
from pandas._libs.tslib import Timestamp
18+
from pandas.compat import is_platform_linux
1819
from pandas.errors import DtypeWarning, EmptyDataError, ParserError
1920
import pandas.util._test_decorators as td
2021

@@ -1258,15 +1259,14 @@ def test_float_parser(all_parsers):
12581259
tm.assert_frame_equal(result, expected)
12591260

12601261

1261-
def test_scientific_no_exponent(all_parsers):
1262+
def test_scientific_no_exponent(all_parsers_all_precisions):
12621263
# see gh-12215
12631264
df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]})
12641265
data = df.to_csv(index=False)
1265-
parser = all_parsers
1266+
parser, precision = all_parsers_all_precisions
12661267

1267-
for precision in parser.float_precision_choices:
1268-
df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
1269-
tm.assert_frame_equal(df_roundtrip, df)
1268+
df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
1269+
tm.assert_frame_equal(df_roundtrip, df)
12701270

12711271

12721272
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
@@ -1350,6 +1350,35 @@ def test_numeric_range_too_wide(all_parsers, exp_data):
13501350
tm.assert_frame_equal(result, expected)
13511351

13521352

1353+
@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999])
1354+
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
1355+
# GH#38753
1356+
parser, precision = all_parsers_all_precisions
1357+
data = f"data\n10E{neg_exp}"
1358+
result = parser.read_csv(StringIO(data), float_precision=precision)
1359+
expected = DataFrame({"data": [0.0]})
1360+
tm.assert_frame_equal(result, expected)
1361+
1362+
1363+
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
1364+
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
1365+
# GH#38753
1366+
parser, precision = all_parsers_all_precisions
1367+
data = f"data\n10E{exp}"
1368+
result = parser.read_csv(StringIO(data), float_precision=precision)
1369+
if precision == "round_trip":
1370+
if exp == 999999999999999999 and is_platform_linux():
1371+
mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
1372+
request.node.add_marker(mark)
1373+
1374+
value = np.inf if exp > 0 else 0.0
1375+
expected = DataFrame({"data": [value]})
1376+
else:
1377+
expected = DataFrame({"data": [f"10E{exp}"]})
1378+
1379+
tm.assert_frame_equal(result, expected)
1380+
1381+
13531382
@pytest.mark.parametrize("iterator", [True, False])
13541383
def test_empty_with_nrows_chunksize(all_parsers, iterator):
13551384
# see gh-9535

0 commit comments

Comments
 (0)