BUG, ENH: Improve infinity parsing in read_csv

gfyoung · gfyoung · commit f37b130d3219 · 2016-05-25T15:58:02.000+01:00
1) Python infinity parsing bug

This began as an attempt to fix a bug in the Python
parser's handling of mixed-case infinity strings. The
bug was traced via lib.maybe_convert_numeric to the
'floatify' function in pandas/src/parse_helper.h.

In addition to correcting the bug and adding tests for
it, this commit also moves the infinity-parsing test
from CParser-only to common.

2) Interpret '+inf' as positive infinity

This is consistent with the Python API, where
float('+inf') is interpreted as positive infinity.
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -78,6 +78,7 @@ Other enhancements
 
 - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
 - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
+- Consistent with the Python API, ``pd.read_csv`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
 
 
 .. _whatsnew_0182.api:
@@ -257,3 +258,4 @@ Bug Fixes
 
 
 - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
+- Bug in ``pd.read_csv`` for the Python engine in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`)
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -447,25 +447,3 @@ def test_empty_header_read(count):
 
         for count in range(1, 101):
             test_empty_header_read(count)
-
-    def test_inf_parsing(self):
-        data = """\
-,A
-a,inf
-b,-inf
-c,Inf
-d,-Inf
-e,INF
-f,-INF
-g,INf
-h,-INf
-i,inF
-j,-inF"""
-        inf = float('inf')
-        expected = Series([inf, -inf] * 5)
-
-        df = self.read_csv(StringIO(data), index_col=0)
-        tm.assert_almost_equal(df['A'].values, expected.values)
-
-        df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
-        tm.assert_almost_equal(df['A'].values, expected.values)
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -1300,3 +1300,27 @@ def test_read_duplicate_names(self):
         expected = DataFrame([[0, 1, 2], [3, 4, 5]],
                              columns=['a', 'b', 'a.1'])
         tm.assert_frame_equal(df, expected)
+
+    def test_inf_parsing(self):
+        data = """\
+,A
+a,inf
+b,-inf
+c,+Inf
+d,-Inf
+e,INF
+f,-INF
+g,+INf
+h,-INf
+i,inF
+j,-inF"""
+        inf = float('inf')
+        expected = Series([inf, -inf] * 5)
+
+        df = self.read_csv(StringIO(data), index_col=0)
+        tm.assert_almost_equal(df['A'].values, expected.values)
+
+        if self.engine == 'c':
+            # TODO: remove condition when 'na_filter' is supported for Python
+            df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
+            tm.assert_almost_equal(df['A'].values, expected.values)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -1501,6 +1501,7 @@ cdef inline void _to_fw_string_nogil(parser_t *parser, int col, int line_start,
         data += width
 
 cdef char* cinf = b'inf'
+cdef char* cposinf = b'+inf'
 cdef char* cneginf = b'-inf'
 
 cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
@@ -1562,7 +1563,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int
                 data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
                                            parser.thousands, 1)
                 if errno != 0 or p_end[0] or p_end == word:
-                    if strcasecmp(word, cinf) == 0:
+                    if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0:
                         data[0] = INF
                     elif strcasecmp(word, cneginf) == 0:
                         data[0] = NEGINF
@@ -1581,7 +1582,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int
             data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
                                        parser.thousands, 1)
             if errno != 0 or p_end[0] or p_end == word:
-                if strcasecmp(word, cinf) == 0:
+                if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0:
                     data[0] = INF
                 elif strcasecmp(word, cneginf) == 0:
                     data[0] = NEGINF
diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h
@@ -1,5 +1,6 @@
 #include <errno.h>
 #include <float.h>
+#include "headers/portable.h"
 
 static double xstrtod(const char *p, char **q, char decimal, char sci,
                       int skip_trailing, int *maybe_int);
@@ -39,22 +40,36 @@ int floatify(PyObject* str, double *result, int *maybe_int) {
 
     if (!status) {
         /* handle inf/-inf */
-        if (0 == strcmp(data, "-inf")) {
-            *result = -HUGE_VAL;
-            *maybe_int = 0;
-        } else if (0 == strcmp(data, "inf")) {
-            *result = HUGE_VAL;
-            *maybe_int = 0;
+        if (strlen(data) == 3) {
+            if (0 == strcasecmp(data, "inf")) {
+                *result = HUGE_VAL;
+                *maybe_int = 0;
+            } else {
+                goto parsingerror;
+            }
+        } else if (strlen(data) == 4) {
+            if (0 == strcasecmp(data, "-inf")) {
+                *result = -HUGE_VAL;
+                *maybe_int = 0;
+            } else if (0 == strcasecmp(data, "+inf")) {
+                *result = HUGE_VAL;
+                *maybe_int = 0;
+            } else {
+                goto parsingerror;
+            }
         } else {
-            PyErr_SetString(PyExc_ValueError, "Unable to parse string");
-            Py_XDECREF(tmp);
-            return -1;
+            goto parsingerror;
         }
     }
 
     Py_XDECREF(tmp);
     return 0;
 
+parsingerror:
+    PyErr_SetString(PyExc_ValueError, "Unable to parse string");
+    Py_XDECREF(tmp);
+    return -1;
+
 /*
 #if PY_VERSION_HEX >= 0x03000000
   return PyFloat_FromString(str);
diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
@@ -188,6 +188,45 @@ def test_isinf_scalar(self):
         self.assertFalse(lib.isneginf_scalar(1))
         self.assertFalse(lib.isneginf_scalar('a'))
 
+    def test_maybe_convert_numeric_infinities(self):
+        # see gh-13274
+        infinities = ['inf', 'inF', 'iNf', 'Inf',
+                      'iNF', 'InF', 'INf', 'INF']
+        na_values = set(['', 'NULL', 'nan'])
+
+        pos = np.array(['inf'], dtype=np.float64)
+        neg = np.array(['-inf'], dtype=np.float64)
+
+        msg = "Unable to parse string"
+
+        for infinity in infinities:
+            for maybe_int in (True, False):
+                out = lib.maybe_convert_numeric(
+                    np.array([infinity], dtype=object),
+                    na_values, maybe_int)
+                tm.assert_numpy_array_equal(out, pos)
+
+                out = lib.maybe_convert_numeric(
+                    np.array(['-' + infinity], dtype=object),
+                    na_values, maybe_int)
+                tm.assert_numpy_array_equal(out, neg)
+
+                out = lib.maybe_convert_numeric(
+                    np.array([u(infinity)], dtype=object),
+                    na_values, maybe_int)
+                tm.assert_numpy_array_equal(out, pos)
+
+                out = lib.maybe_convert_numeric(
+                    np.array(['+' + infinity], dtype=object),
+                    na_values, maybe_int)
+                tm.assert_numpy_array_equal(out, pos)
+
+                # too many characters
+                with tm.assertRaisesRegexp(ValueError, msg):
+                    lib.maybe_convert_numeric(
+                        np.array(['foo_' + infinity], dtype=object),
+                        na_values, maybe_int)
+
 
 class Testisscalar(tm.TestCase):