From f42b65ad7fa3345f03e6cb45907de08cee60cccf Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 29 Dec 2020 15:33:18 -0500 Subject: [PATCH 1/8] WIP --- pandas/_libs/src/parser/tokenizer.c | 11 +++++++---- pandas/tests/io/parser/test_common.py | 24 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 965fece370721..1b229171ea879 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1726,7 +1726,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, // Process string of digits. num_digits = 0; n = 0; - while (isdigit_ascii(*p)) { + while (num_digits < max_digits && isdigit_ascii(*p)) { n = n * 10 + (*p - '0'); num_digits++; p++; @@ -1747,10 +1747,13 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, } else if (exponent > 0) { number *= e[exponent]; } else if (exponent < -308) { // Subnormal - if (exponent < -616) // Prevent invalid array access. + if (exponent < -616) { // Prevent invalid array access. number = 0.; - number /= e[-308 - exponent]; - number /= e[308]; + } else { + number /= e[-308 - exponent]; + number /= e[308]; + } + } else { number /= e[-exponent]; } diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index ce3557e098bfd..e85ebd22ffe81 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1351,6 +1351,30 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +def test_very_negative_exponent(all_parsers, neg_exp): + # GH#38753 + parser = all_parsers + data = f"data\n10E{neg_exp}" + for precision in parser.float_precision_choices: + result = parser.read_csv(StringIO(data), float_precision=precision) + expected = DataFrame({"data": [0.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) +def test_too_many_exponent_digits(all_parsers, exp): + # GH#38753 + parser = all_parsers + data = f"data\n10E{exp}" + for precision in parser.float_precision_choices: + if precision == "round_trip": + continue + result = parser.read_csv(StringIO(data), float_precision=precision) + expected = DataFrame({"data": [f"10E{exp}"]}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535 From bb77ab213b5dcf084cb5441e30cf767246d30bc2 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 29 Dec 2020 15:52:16 -0500 Subject: [PATCH 2/8] BUG: precise_xstrtod segfault --- doc/source/whatsnew/v1.2.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 649b17e255f3d..b0dd7175680f9 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -17,6 +17,7 @@ Fixed regressions - The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`) - :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (issue:`38714`) - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`) +- Bug in :meth:`read_csv` with `float_precision="high"` caused segfault or wrong parsing of long exponents strings (:issue:`38753`) - .. --------------------------------------------------------------------------- From 3a0ac2e5c86ab017bcfa80bb2fe53ac5af01ac85 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 29 Dec 2020 15:57:49 -0500 Subject: [PATCH 3/8] Fix typo --- doc/source/whatsnew/v1.2.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index b0dd7175680f9..85d2aa0ecd262 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -17,7 +17,7 @@ Fixed regressions - The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`) - :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (issue:`38714`) - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`) -- Bug in :meth:`read_csv` with `float_precision="high"` caused segfault or wrong parsing of long exponents strings (:issue:`38753`) +- Bug in :meth:`read_csv` with `float_precision="high"` caused segfault or wrong parsing of long exponent strings (:issue:`38753`) - .. --------------------------------------------------------------------------- From 7b0cc6226eaeb84c905d2c11f138b65150a44f93 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 29 Dec 2020 16:08:35 -0500 Subject: [PATCH 4/8] Fix whatsnew --- doc/source/whatsnew/v1.2.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 85d2aa0ecd262..2e9c5ed8e6839 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -17,7 +17,7 @@ Fixed regressions - The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`) - :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (issue:`38714`) - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`) -- Bug in :meth:`read_csv` with `float_precision="high"` caused segfault or wrong parsing of long exponent strings (:issue:`38753`) +- Bug in :meth:`read_csv` with ``float_precision``="high" caused segfault or wrong parsing of long exponent strings (:issue:`38753`) - .. --------------------------------------------------------------------------- From 39dd79c0b677fd1342818d675e2d5ccfe1e970fc Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 29 Dec 2020 20:42:29 -0500 Subject: [PATCH 5/8] Fix quotes --- doc/source/whatsnew/v1.2.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index dfe7823f08c6e..5e6ef82f78ce1 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -19,7 +19,7 @@ Fixed regressions - Fixed a regression in ``groupby().rolling()`` where :class:`MultiIndex` levels were dropped (:issue:`38523`) - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`) - Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`) -- Bug in :meth:`read_csv` with ``float_precision``="high" caused segfault or wrong parsing of long exponent strings (:issue:`38753`) +- Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings (:issue:`38753`) .. --------------------------------------------------------------------------- From 58bbed5b72a1f4396d5669830354e9d0df8985ba Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 29 Dec 2020 21:04:37 -0500 Subject: [PATCH 6/8] xfail inconsistent test --- pandas/tests/io/parser/test_common.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index c7ef653533357..6bc778f50937e 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -15,6 +15,7 @@ import pytest from pandas._libs.tslib import Timestamp +from pandas.compat import is_platform_linux from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td @@ -1361,12 +1362,16 @@ def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): @pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) -def test_too_many_exponent_digits(all_parsers_all_precisions, exp): +def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): # GH#38753 parser, precision = all_parsers_all_precisions data = f"data\n10E{exp}" result = parser.read_csv(StringIO(data), float_precision=precision) if precision == "round_trip": + if exp == 999999999999999999 and is_platform_linux(): + mark = pytest.mark.xfail(reason="On Linux gives object result") + request.node.add_marker(mark) + value = np.inf if exp > 0 else 0.0 expected = DataFrame({"data": [value]}) else: From 0f5f1033cea21383bf33173b67eb43cbe97d4169 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 30 Dec 2020 10:46:47 -0500 Subject: [PATCH 7/8] Add issue number to xfail --- pandas/tests/io/parser/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 6bc778f50937e..31f1581a6184b 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1369,7 +1369,7 @@ def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): result = parser.read_csv(StringIO(data), float_precision=precision) if precision == "round_trip": if exp == 999999999999999999 and is_platform_linux(): - mark = pytest.mark.xfail(reason="On Linux gives object result") + mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") request.node.add_marker(mark) value = np.inf if exp > 0 else 0.0 From 0a48ed843efdc03a40b2611b255473617fdaeb26 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 30 Dec 2020 10:49:29 -0500 Subject: [PATCH 8/8] Keep extra line --- doc/source/whatsnew/v1.2.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 97ea9b11b884e..3ecea674fd34c 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`) - Fixed regression in :meth:`DataFrame.any` and :meth:`DataFrame.all` not returning a result for tz-aware ``datetime64`` columns (:issue:`38723`) - Bug in :meth:`read_csv` with ``float_precision="high"`` caused segfault or wrong parsing of long exponent strings (:issue:`38753`) +- .. ---------------------------------------------------------------------------