From 9faf5029aae49013023e55c1c90604aba1afb0c3 Mon Sep 17 00:00:00 2001 From: mzeitlin11 <37011898+mzeitlin11@users.noreply.github.com> Date: Wed, 30 Dec 2020 13:40:59 -0500 Subject: [PATCH] Backport PR #38789: BUG: Fix precise_xstrtod segfault on long exponent --- doc/source/whatsnew/v1.2.1.rst | 1 + pandas/_libs/src/parser/tokenizer.c | 11 +++++--- pandas/tests/io/parser/conftest.py | 27 +++++++++++++++++++ pandas/tests/io/parser/test_common.py | 39 +++++++++++++++++++++++---- 4 files changed, 69 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 0bc01c683e0ad..31c5b770b1f35 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -19,6 +19,7 @@ Fixed regressions - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`) - Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`) - Fixed regression in :meth:`DataFrame.any` and :meth:`DataFrame.all` not returning a result for tz-aware ``datetime64`` columns (:issue:`38723`) +- Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings (:issue:`38753`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 88144330c1fe9..4ddbd6cf3ae60 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1733,7 +1733,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, // Process string of digits. num_digits = 0; n = 0; - while (isdigit_ascii(*p)) { + while (num_digits < max_digits && isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -1754,10 +1754,13 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, } else if (exponent > 0) { number *= e[exponent]; } else if (exponent < -308) { // Subnormal - if (exponent < -616) // Prevent invalid array access. + if (exponent < -616) { // Prevent invalid array access. number = 0.; - number /= e[-308 - exponent]; - number /= e[308]; + } else { + number /= e[-308 - exponent]; + number /= e[308]; + } + } else { number /= e[-exponent]; } diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index e8893b4c02238..ec098353960d7 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -97,6 +97,33 @@ def python_parser_only(request): return request.param +def _get_all_parser_float_precision_combinations(): + """ + Return all allowable parser and float precision + combinations and corresponding ids. + """ + params = [] + ids = [] + for parser, parser_id in zip(_all_parsers, _all_parser_ids): + for precision in parser.float_precision_choices: + params.append((parser, precision)) + ids.append(f"{parser_id}-{precision}") + + return {"params": params, "ids": ids} + + +@pytest.fixture( + params=_get_all_parser_float_precision_combinations()["params"], + ids=_get_all_parser_float_precision_combinations()["ids"], +) +def all_parsers_all_precisions(request): + """ + Fixture for all allowable combinations of parser + and float precision + """ + return request.param + + _utf_values = [8, 16, 32] _encoding_seps = ["", "-", "_"] diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index c8ed0d75b13a2..d42bd7a004584 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -15,6 +15,7 @@ import pytest from pandas._libs.tslib import Timestamp +from pandas.compat import is_platform_linux from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td @@ -1258,15 +1259,14 @@ def test_float_parser(all_parsers): tm.assert_frame_equal(result, expected) -def test_scientific_no_exponent(all_parsers): +def test_scientific_no_exponent(all_parsers_all_precisions): # see gh-12215 df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) data = df.to_csv(index=False) - parser = all_parsers + parser, precision = all_parsers_all_precisions - for precision in parser.float_precision_choices: - df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) - tm.assert_frame_equal(df_roundtrip, df) + df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) + tm.assert_frame_equal(df_roundtrip, df) @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) @@ -1350,6 +1350,35 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): + # GH#38753 + parser, precision = all_parsers_all_precisions + data = f"data\n10E{neg_exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + expected = DataFrame({"data": [0.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) +def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): + # GH#38753 + parser, precision = all_parsers_all_precisions + data = f"data\n10E{exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + if precision == "round_trip": + if exp == 999999999999999999 and is_platform_linux(): + mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") + request.node.add_marker(mark) + + value = np.inf if exp > 0 else 0.0 + expected = DataFrame({"data": [value]}) + else: + expected = DataFrame({"data": [f"10E{exp}"]}) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535