From f37b130d3219dde0413ba98632fde36c57a13517 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 25 May 2016 00:18:02 +0100 Subject: [PATCH] BUG, ENH: Improve infinity parsing in read_csv 1) Python infinity parsing bug Initially an attempt to fix a Python parsing bug of mixed-case infinity strings, the bug was traced back via lib.maybe_convert_numeric to the 'floatify' method in pandas/src/parse_helper.h. In addition to correcting the bug and adding tests for it, this commit also moves the infinity-parsing test from CParser-only to common. 2) Interpret '+inf' as positive infinity This is consistent with the Python API, where float('+inf') is interpreted as positive infinity. --- doc/source/whatsnew/v0.18.2.txt | 2 ++ pandas/io/tests/parser/c_parser_only.py | 22 -------------- pandas/io/tests/parser/common.py | 24 +++++++++++++++ pandas/parser.pyx | 5 ++-- pandas/src/parse_helper.h | 33 +++++++++++++++------ pandas/tests/test_lib.py | 39 +++++++++++++++++++++++++ 6 files changed, 92 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index ebae54f292e3c..9d53394ce70c9 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -78,6 +78,7 @@ Other enhancements - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) +- Consistent with the Python API, ``pd.read_csv`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) .. 
_whatsnew_0182.api: @@ -257,3 +258,4 @@ Bug Fixes - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) +- Bug in ``pd.read_csv`` for the Python engine in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 325418f87af6a..aeee77bb02e98 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -447,25 +447,3 @@ def test_empty_header_read(count): for count in range(1, 101): test_empty_header_read(count) - - def test_inf_parsing(self): - data = """\ -,A -a,inf -b,-inf -c,Inf -d,-Inf -e,INF -f,-INF -g,INf -h,-INf -i,inF -j,-inF""" - inf = float('inf') - expected = Series([inf, -inf] * 5) - - df = self.read_csv(StringIO(data), index_col=0) - tm.assert_almost_equal(df['A'].values, expected.values) - - df = self.read_csv(StringIO(data), index_col=0, na_filter=False) - tm.assert_almost_equal(df['A'].values, expected.values) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 8c4bf3644127e..3912bbbf11e53 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1300,3 +1300,27 @@ def test_read_duplicate_names(self): expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=['a', 'b', 'a.1']) tm.assert_frame_equal(df, expected) + + def test_inf_parsing(self): + data = """\ +,A +a,inf +b,-inf +c,+Inf +d,-Inf +e,INF +f,-INF +g,+INf +h,-INf +i,inF +j,-inF""" + inf = float('inf') + expected = Series([inf, -inf] * 5) + + df = self.read_csv(StringIO(data), index_col=0) + tm.assert_almost_equal(df['A'].values, expected.values) + + if self.engine == 'c': + # TODO: remove condition when 'na_filter' is supported for Python + df = self.read_csv(StringIO(data), index_col=0, na_filter=False) + tm.assert_almost_equal(df['A'].values, expected.values) diff --git a/pandas/parser.pyx 
b/pandas/parser.pyx index 94d7f36f4f205..729e5af528b80 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1501,6 +1501,7 @@ cdef inline void _to_fw_string_nogil(parser_t *parser, int col, int line_start, data += width cdef char* cinf = b'inf' +cdef char* cposinf = b'+inf' cdef char* cneginf = b'-inf' cdef _try_double(parser_t *parser, int col, int line_start, int line_end, @@ -1562,7 +1563,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: - if strcasecmp(word, cinf) == 0: + if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0: data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF @@ -1581,7 +1582,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: - if strcasecmp(word, cinf) == 0: + if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0: data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h index d47e448700029..fd5089dd8963d 100644 --- a/pandas/src/parse_helper.h +++ b/pandas/src/parse_helper.h @@ -1,5 +1,6 @@ #include <errno.h> #include <float.h> +#include "headers/portable.h" static double xstrtod(const char *p, char **q, char decimal, char sci, int skip_trailing, int *maybe_int); @@ -39,22 +40,36 @@ int floatify(PyObject* str, double *result, int *maybe_int) { if (!status) { /* handle inf/-inf */ - if (0 == strcmp(data, "-inf")) { - *result = -HUGE_VAL; - *maybe_int = 0; - } else if (0 == strcmp(data, "inf")) { - *result = HUGE_VAL; - *maybe_int = 0; + if (strlen(data) == 3) { + if (0 == strcasecmp(data, "inf")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } + } else if (strlen(data) == 4) { + if
(0 == strcasecmp(data, "-inf")) { + *result = -HUGE_VAL; + *maybe_int = 0; + } else if (0 == strcasecmp(data, "+inf")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } } else { - PyErr_SetString(PyExc_ValueError, "Unable to parse string"); - Py_XDECREF(tmp); - return -1; + goto parsingerror; } } Py_XDECREF(tmp); return 0; +parsingerror: + PyErr_SetString(PyExc_ValueError, "Unable to parse string"); + Py_XDECREF(tmp); + return -1; + /* #if PY_VERSION_HEX >= 0x03000000 return PyFloat_FromString(str); diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 6912e3a7ff68c..2aa31063df446 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -188,6 +188,45 @@ def test_isinf_scalar(self): self.assertFalse(lib.isneginf_scalar(1)) self.assertFalse(lib.isneginf_scalar('a')) + def test_maybe_convert_numeric_infinities(self): + # see gh-13274 + infinities = ['inf', 'inF', 'iNf', 'Inf', + 'iNF', 'InF', 'INf', 'INF'] + na_values = set(['', 'NULL', 'nan']) + + pos = np.array(['inf'], dtype=np.float64) + neg = np.array(['-inf'], dtype=np.float64) + + msg = "Unable to parse string" + + for infinity in infinities: + for maybe_int in (True, False): + out = lib.maybe_convert_numeric( + np.array([infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['-' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, neg) + + out = lib.maybe_convert_numeric( + np.array([u(infinity)], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['+' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + # too many characters + with tm.assertRaisesRegexp(ValueError, msg): + lib.maybe_convert_numeric( + np.array(['foo_' + infinity], dtype=object), + na_values, maybe_int) + class Testisscalar(tm.TestCase):