From 28ac8e4efbc9c6f85cbb71adc95547b4627fc697 Mon Sep 17 00:00:00 2001 From: Sergei Ivko Date: Tue, 27 Aug 2019 21:49:16 +0300 Subject: [PATCH 1/5] ENH: Enable read_csv to interpret 'Infinity' as a floating point value (issue #10065) --- doc/source/whatsnew/v0.25.2.rst | 2 +- pandas/_libs/parsers.pyx | 18 ++++++++++++++---- pandas/tests/io/parser/test_common.py | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 6974c7521a237..8f4b278a014a9 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -61,7 +61,7 @@ Missing I/O ^^^ - +- Improve infinity parsing. ``pd.read_csv()`` will now interpret ``Infinity``, ``+Infinity``, ``-Infinity`` as floating point values (:issue:`10065`). - - - diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6cc9dd22ce7c9..bead57119b482 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1693,6 +1693,10 @@ cdef: char* cposinf = b'+inf' char* cneginf = b'-inf' + char* cinfty = b'Infinity' + char* cposinfty = b'+Infinity' + char* cneginfty = b'-Infinity' + cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, @@ -1772,9 +1776,12 @@ cdef inline int _try_double_nogil(parser_t *parser, if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or - strcasecmp(word, cposinf) == 0): + strcasecmp(word, cposinf) == 0 or + strcasecmp(word, cinfty) == 0 or + strcasecmp(word, cposinfty) == 0): data[0] = INF - elif strcasecmp(word, cneginf) == 0: + elif (strcasecmp(word, cneginf) == 0 or + strcasecmp(word, cneginfty) == 0 ): data[0] = NEGINF else: return 1 @@ -1793,9 +1800,12 @@ cdef inline int _try_double_nogil(parser_t *parser, if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or - strcasecmp(word, cposinf) == 0): + strcasecmp(word, cposinf) == 0 or + strcasecmp(word, cinfty) == 0 or + 
strcasecmp(word, cposinfty) == 0): data[0] = INF - elif strcasecmp(word, cneginf) == 0: + elif (strcasecmp(word, cneginf) == 0 or + strcasecmp(word, cneginfty) == 0): data[0] = NEGINF else: return 1 diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e04535df56663..7780f3afd1a9e 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1865,6 +1865,23 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("na_filter", [True, False]) +def test_infinity_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,Infinity +b,-Infinity +c,+Infinity +""" + expected = DataFrame( + {"A": [float("inf"), float("-inf"), float("inf")]}, + index=["a", "b", "c"], + ) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers From 059808a0fd45d948f65f3b5e4ef43fd8c0447a8b Mon Sep 17 00:00:00 2001 From: Sergei Ivko Date: Wed, 28 Aug 2019 21:03:14 +0300 Subject: [PATCH 2/5] Add missing files --- pandas/_libs/src/parse_helper.h | 19 ++++++++++++++++++- pandas/tests/io/parser/test_common.py | 3 +-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 1db1878a8a773..1db4c813bb493 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -50,7 +50,7 @@ int floatify(PyObject *str, double *result, int *maybe_int) { status = to_double(data, result, sci, dec, maybe_int); if (!status) { - /* handle inf/-inf */ + /* handle inf/-inf infinity/-infinity */ if (strlen(data) == 3) { if (0 == strcasecmp(data, "inf")) { *result = HUGE_VAL; @@ -68,6 +68,23 @@ int floatify(PyObject *str, double *result, int *maybe_int) { } else { goto 
parsingerror; } + } else if (strlen(data) == 8) { + if (0 == strcasecmp(data, "infinity")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } + } else if (strlen(data) == 9) { + if (0 == strcasecmp(data, "-infinity")) { + *result = -HUGE_VAL; + *maybe_int = 0; + } else if (0 == strcasecmp(data, "+infinity")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } } else { goto parsingerror; } diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 7780f3afd1a9e..a3473e634c468 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1875,8 +1875,7 @@ def test_infinity_parsing(all_parsers, na_filter): c,+Infinity """ expected = DataFrame( - {"A": [float("inf"), float("-inf"), float("inf")]}, - index=["a", "b", "c"], + {"A": [float("inf"), float("-inf"), float("inf")]}, index=["a", "b", "c"] ) result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) tm.assert_frame_equal(result, expected) From 2592575958f2182e1fb61af69af894981e560930 Mon Sep 17 00:00:00 2001 From: Sergei Ivko Date: Wed, 28 Aug 2019 23:03:39 +0300 Subject: [PATCH 3/5] Fix formatting --- pandas/_libs/parsers.pyx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index bead57119b482..62a3568932def 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1776,12 +1776,12 @@ cdef inline int _try_double_nogil(parser_t *parser, if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or - strcasecmp(word, cposinf) == 0 or - strcasecmp(word, cinfty) == 0 or - strcasecmp(word, cposinfty) == 0): + strcasecmp(word, cposinf) == 0 or + strcasecmp(word, cinfty) == 0 or + strcasecmp(word, cposinfty) == 0): data[0] = INF - elif (strcasecmp(word, cneginf) == 0 or - strcasecmp(word, cneginfty) == 0 ): + elif (strcasecmp(word, cneginf) == 0 or + 
strcasecmp(word, cneginfty) == 0 ): data[0] = NEGINF else: return 1 @@ -1804,8 +1804,8 @@ cdef inline int _try_double_nogil(parser_t *parser, strcasecmp(word, cinfty) == 0 or strcasecmp(word, cposinfty) == 0): data[0] = INF - elif (strcasecmp(word, cneginf) == 0 or - strcasecmp(word, cneginfty) == 0): + elif (strcasecmp(word, cneginf) == 0 or + strcasecmp(word, cneginfty) == 0): data[0] = NEGINF else: return 1 From 215477f0f3b3918ddd0fd2ffe19dfe247a5d4ccd Mon Sep 17 00:00:00 2001 From: Sergei Ivko Date: Fri, 30 Aug 2019 22:52:10 +0300 Subject: [PATCH 4/5] Move whatsnew to v1.0.0 --- doc/source/whatsnew/v0.25.2.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 8f4b278a014a9..6974c7521a237 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -61,7 +61,7 @@ Missing I/O ^^^ -- Improve infinity parsing. ``pd.read_csv()`` will now interpret ``Infinity``, ``+Infinity``, ``-Infinity`` as floating point values (:issue:`10065`). + - - - diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 7a10447e3ad40..e5f46103e0f04 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -161,7 +161,7 @@ I/O - :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) - Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`) -- +- Improve infinity parsing. 
:meth:`read_csv` now interprets ``Infinity``, ``+Infinity``, ``-Infinity`` as floating point values (:issue:`10065`) Plotting ^^^^^^^^ From a77f7083f97872f4920ad54dbfe88d7b9ae44f2b Mon Sep 17 00:00:00 2001 From: Sergei Ivko Date: Fri, 30 Aug 2019 23:04:50 +0300 Subject: [PATCH 5/5] Clarify test case --- pandas/tests/io/parser/test_common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index a3473e634c468..0586593c87cc5 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1875,7 +1875,8 @@ def test_infinity_parsing(all_parsers, na_filter): c,+Infinity """ expected = DataFrame( - {"A": [float("inf"), float("-inf"), float("inf")]}, index=["a", "b", "c"] + {"A": [float("infinity"), float("-infinity"), float("+infinity")]}, + index=["a", "b", "c"], ) result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) tm.assert_frame_equal(result, expected)