Backport PR pandas-dev#38789: BUG: Fix precise_xstrtod segfault on long exponent

mzeitlin11 · meeseeksmachine · commit 9faf5029aae4 · 2020-12-30T18:41:27.000Z
diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst
@@ -19,6 +19,7 @@ Fixed regressions
 - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`)
 - Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`)
 - Fixed regression in :meth:`DataFrame.any` and :meth:`DataFrame.all` not returning a result for tz-aware ``datetime64`` columns (:issue:`38723`)
+- Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings (:issue:`38753`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -1733,7 +1733,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
         // Process string of digits.
         num_digits = 0;
         n = 0;
-        while (isdigit_ascii(*p)) {
+        while (num_digits < max_digits && isdigit_ascii(*p)) {
             n = n * 10 + (*p - '0');
             num_digits++;
             p++;
@@ -1754,10 +1754,13 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
     } else if (exponent > 0) {
         number *= e[exponent];
     } else if (exponent < -308) {  // Subnormal
-        if (exponent < -616)       // Prevent invalid array access.
+        if (exponent < -616) {  // Prevent invalid array access.
             number = 0.;
-        number /= e[-308 - exponent];
-        number /= e[308];
+        } else {
+            number /= e[-308 - exponent];
+            number /= e[308];
+        }
+
     } else {
         number /= e[-exponent];
     }
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
@@ -97,6 +97,33 @@ def python_parser_only(request):
     return request.param
 
 
+def _get_all_parser_float_precision_combinations():
+    """
+    Return all allowable parser and float precision
+    combinations and corresponding ids.
+    """
+    params = []
+    ids = []
+    for parser, parser_id in zip(_all_parsers, _all_parser_ids):
+        for precision in parser.float_precision_choices:
+            params.append((parser, precision))
+            ids.append(f"{parser_id}-{precision}")
+
+    return {"params": params, "ids": ids}
+
+
+@pytest.fixture(
+    params=_get_all_parser_float_precision_combinations()["params"],
+    ids=_get_all_parser_float_precision_combinations()["ids"],
+)
+def all_parsers_all_precisions(request):
+    """
+    Fixture for all allowable combinations of parser
+    and float precision
+    """
+    return request.param
+
+
 _utf_values = [8, 16, 32]
 
 _encoding_seps = ["", "-", "_"]
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
@@ -15,6 +15,7 @@
 import pytest
 
 from pandas._libs.tslib import Timestamp
+from pandas.compat import is_platform_linux
 from pandas.errors import DtypeWarning, EmptyDataError, ParserError
 import pandas.util._test_decorators as td
 
@@ -1258,15 +1259,14 @@ def test_float_parser(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-def test_scientific_no_exponent(all_parsers):
+def test_scientific_no_exponent(all_parsers_all_precisions):
     # see gh-12215
     df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]})
     data = df.to_csv(index=False)
-    parser = all_parsers
+    parser, precision = all_parsers_all_precisions
 
-    for precision in parser.float_precision_choices:
-        df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
-        tm.assert_frame_equal(df_roundtrip, df)
+    df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision)
+    tm.assert_frame_equal(df_roundtrip, df)
 
 
 @pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
@@ -1350,6 +1350,35 @@ def test_numeric_range_too_wide(all_parsers, exp_data):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999])
+def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
+    # GH#38753
+    parser, precision = all_parsers_all_precisions
+    data = f"data\n10E{neg_exp}"
+    result = parser.read_csv(StringIO(data), float_precision=precision)
+    expected = DataFrame({"data": [0.0]})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
+def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
+    # GH#38753
+    parser, precision = all_parsers_all_precisions
+    data = f"data\n10E{exp}"
+    result = parser.read_csv(StringIO(data), float_precision=precision)
+    if precision == "round_trip":
+        if exp == 999999999999999999 and is_platform_linux():
+            mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
+            request.node.add_marker(mark)
+
+        value = np.inf if exp > 0 else 0.0
+        expected = DataFrame({"data": [value]})
+    else:
+        expected = DataFrame({"data": [f"10E{exp}"]})
+
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize("iterator", [True, False])
 def test_empty_with_nrows_chunksize(all_parsers, iterator):
     # see gh-9535

Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@ Fixed regressions`
`19`	`19`	- Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`)
`20`	`20`	- Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`)
`21`	`21`	- Fixed regression in :meth:`DataFrame.any` and :meth:`DataFrame.all` not returning a result for tz-aware ``datetime64`` columns (:issue:`38723`)
	`22`	+- Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings (:issue:`38753`)
`22`	`23`	`-`
`23`	`24`
`24`	`25`	`.. ---------------------------------------------------------------------------`