Skip to content

BUG, ENH: Improve infinity parsing for read_csv #13274

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ Other enhancements

- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
- Consistent with the Python API, ``pd.read_csv`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)


.. _whatsnew_0182.api:
Expand Down Expand Up @@ -257,3 +258,4 @@ Bug Fixes


- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
- Bug in ``pd.read_csv`` for the Python engine in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`)
22 changes: 0 additions & 22 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,25 +447,3 @@ def test_empty_header_read(count):

for count in range(1, 101):
test_empty_header_read(count)

def test_inf_parsing(self):
    # Every capitalisation of "inf" / "-inf" in column A must be read
    # back as positive / negative floating-point infinity.
    data = """\
,A
a,inf
b,-inf
c,Inf
d,-Inf
e,INF
f,-INF
g,INf
h,-INf
i,inF
j,-inF"""
    pos_inf = float('inf')
    expected = Series([pos_inf, -pos_inf] * 5)

    # First pass uses the reader defaults; second pass disables NA
    # filtering. Both must yield the same alternating +/-inf values.
    for extra_kwargs in ({}, {'na_filter': False}):
        parsed = self.read_csv(StringIO(data), index_col=0,
                               **extra_kwargs)
        tm.assert_almost_equal(parsed['A'].values, expected.values)
24 changes: 24 additions & 0 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1300,3 +1300,27 @@ def test_read_duplicate_names(self):
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
columns=['a', 'b', 'a.1'])
tm.assert_frame_equal(df, expected)

def test_inf_parsing(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you change anything for the Python engine? (Or is floatify called there, so it's already fixed?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to change anything for Python engine. floatify fix fixes that issue 😄

data = """\
,A
a,inf
b,-inf
c,+Inf
d,-Inf
e,INF
f,-INF
g,+INf
h,-INf
i,inF
j,-inF"""
inf = float('inf')
expected = Series([inf, -inf] * 5)

df = self.read_csv(StringIO(data), index_col=0)
tm.assert_almost_equal(df['A'].values, expected.values)

if self.engine == 'c':
# TODO: remove condition when 'na_filter' is supported for Python
df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
tm.assert_almost_equal(df['A'].values, expected.values)
5 changes: 3 additions & 2 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1501,6 +1501,7 @@ cdef inline void _to_fw_string_nogil(parser_t *parser, int col, int line_start,
data += width

cdef char* cinf = b'inf'
cdef char* cposinf = b'+inf'
cdef char* cneginf = b'-inf'

cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
Expand Down Expand Up @@ -1562,7 +1563,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
parser.thousands, 1)
if errno != 0 or p_end[0] or p_end == word:
if strcasecmp(word, cinf) == 0:
if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0:
data[0] = INF
elif strcasecmp(word, cneginf) == 0:
data[0] = NEGINF
Expand All @@ -1581,7 +1582,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
parser.thousands, 1)
if errno != 0 or p_end[0] or p_end == word:
if strcasecmp(word, cinf) == 0:
if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0:
data[0] = INF
elif strcasecmp(word, cneginf) == 0:
data[0] = NEGINF
Expand Down
33 changes: 24 additions & 9 deletions pandas/src/parse_helper.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <errno.h>
#include <float.h>
#include "headers/portable.h"

static double xstrtod(const char *p, char **q, char decimal, char sci,
int skip_trailing, int *maybe_int);
Expand Down Expand Up @@ -39,22 +40,36 @@ int floatify(PyObject* str, double *result, int *maybe_int) {

if (!status) {
/* handle inf/-inf */
if (0 == strcmp(data, "-inf")) {
*result = -HUGE_VAL;
*maybe_int = 0;
} else if (0 == strcmp(data, "inf")) {
*result = HUGE_VAL;
*maybe_int = 0;
if (strlen(data) == 3) {
if (0 == strcasecmp(data, "inf")) {
*result = HUGE_VAL;
*maybe_int = 0;
} else {
goto parsingerror;
}
} else if (strlen(data) == 4) {
if (0 == strcasecmp(data, "-inf")) {
*result = -HUGE_VAL;
*maybe_int = 0;
} else if (0 == strcasecmp(data, "+inf")) {
*result = HUGE_VAL;
*maybe_int = 0;
} else {
goto parsingerror;
}
} else {
PyErr_SetString(PyExc_ValueError, "Unable to parse string");
Py_XDECREF(tmp);
return -1;
goto parsingerror;
}
}

Py_XDECREF(tmp);
return 0;

parsingerror:
PyErr_SetString(PyExc_ValueError, "Unable to parse string");
Py_XDECREF(tmp);
return -1;

/*
#if PY_VERSION_HEX >= 0x03000000
return PyFloat_FromString(str);
Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,45 @@ def test_isinf_scalar(self):
self.assertFalse(lib.isneginf_scalar(1))
self.assertFalse(lib.isneginf_scalar('a'))

def test_maybe_convert_numeric_infinities(self):
    # see gh-13274
    #
    # Each upper/lower-case spelling of "inf" -- bare, '-'-prefixed,
    # '+'-prefixed, or passed through u() -- must convert to the matching
    # float infinity, while a longer string that merely ends in "inf"
    # must raise ValueError.
    spellings = ['inf', 'inF', 'iNf', 'Inf',
                 'iNF', 'InF', 'INf', 'INF']
    na_values = set(['', 'NULL', 'nan'])

    pos = np.array(['inf'], dtype=np.float64)
    neg = np.array(['-inf'], dtype=np.float64)

    msg = "Unable to parse string"

    def convert(value, maybe_int):
        # Wrap the single value in an object array, as the converter
        # is called with in every case below.
        return lib.maybe_convert_numeric(
            np.array([value], dtype=object),
            na_values, maybe_int)

    for spelling in spellings:
        for maybe_int in (True, False):
            cases = [
                (spelling, pos),
                ('-' + spelling, neg),
                (u(spelling), pos),
                ('+' + spelling, pos),
            ]
            for value, wanted in cases:
                tm.assert_numpy_array_equal(
                    convert(value, maybe_int), wanted)

            # too many characters
            with tm.assertRaisesRegexp(ValueError, msg):
                convert('foo_' + spelling, maybe_int)


class Testisscalar(tm.TestCase):

Expand Down