Skip to content

Commit f37b130

Browse files
committed
BUG, ENH: Improve infinity parsing in read_csv
1) Python infinity parsing bug: This began as an attempt to fix a Python parsing bug with mixed-case infinity strings; the bug was traced back via lib.maybe_convert_numeric to the 'floatify' function in pandas/src/parse_helper.h. In addition to correcting the bug and adding tests for it, this commit also moves the infinity-parsing test from CParser-only to common. 2) Interpret '+inf' as positive infinity: This is consistent with the Python API, where float('+inf') is interpreted as positive infinity.
1 parent 8749273 commit f37b130

File tree

6 files changed

+92
-33
lines changed

6 files changed

+92
-33
lines changed

doc/source/whatsnew/v0.18.2.txt

+2
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ Other enhancements
7878

7979
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
8080
- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
81+
- Consistent with the Python API, ``pd.read_csv`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)
8182

8283

8384
.. _whatsnew_0182.api:
@@ -257,3 +258,4 @@ Bug Fixes
257258

258259

259260
- Bug in ``Categorical.remove_unused_categories()`` where the ``.codes`` dtype was changed to platform int (:issue:`13261`)
261+
- Bug in ``pd.read_csv`` for the Python engine in which mixed-case forms of infinity were not being parsed as infinity (:issue:`13274`)

pandas/io/tests/parser/c_parser_only.py

-22
Original file line numberDiff line numberDiff line change
@@ -447,25 +447,3 @@ def test_empty_header_read(count):
447447

448448
for count in range(1, 101):
449449
test_empty_header_read(count)
450-
451-
def test_inf_parsing(self):
452-
data = """\
453-
,A
454-
a,inf
455-
b,-inf
456-
c,Inf
457-
d,-Inf
458-
e,INF
459-
f,-INF
460-
g,INf
461-
h,-INf
462-
i,inF
463-
j,-inF"""
464-
inf = float('inf')
465-
expected = Series([inf, -inf] * 5)
466-
467-
df = self.read_csv(StringIO(data), index_col=0)
468-
tm.assert_almost_equal(df['A'].values, expected.values)
469-
470-
df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
471-
tm.assert_almost_equal(df['A'].values, expected.values)

pandas/io/tests/parser/common.py

+24
Original file line numberDiff line numberDiff line change
@@ -1300,3 +1300,27 @@ def test_read_duplicate_names(self):
13001300
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
13011301
columns=['a', 'b', 'a.1'])
13021302
tm.assert_frame_equal(df, expected)
1303+
1304+
def test_inf_parsing(self):
1305+
data = """\
1306+
,A
1307+
a,inf
1308+
b,-inf
1309+
c,+Inf
1310+
d,-Inf
1311+
e,INF
1312+
f,-INF
1313+
g,+INf
1314+
h,-INf
1315+
i,inF
1316+
j,-inF"""
1317+
inf = float('inf')
1318+
expected = Series([inf, -inf] * 5)
1319+
1320+
df = self.read_csv(StringIO(data), index_col=0)
1321+
tm.assert_almost_equal(df['A'].values, expected.values)
1322+
1323+
if self.engine == 'c':
1324+
# TODO: remove condition when 'na_filter' is supported for Python
1325+
df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
1326+
tm.assert_almost_equal(df['A'].values, expected.values)

pandas/parser.pyx

+3-2
Original file line numberDiff line numberDiff line change
@@ -1501,6 +1501,7 @@ cdef inline void _to_fw_string_nogil(parser_t *parser, int col, int line_start,
15011501
data += width
15021502

15031503
cdef char* cinf = b'inf'
1504+
cdef char* cposinf = b'+inf'
15041505
cdef char* cneginf = b'-inf'
15051506

15061507
cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
@@ -1562,7 +1563,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int
15621563
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
15631564
parser.thousands, 1)
15641565
if errno != 0 or p_end[0] or p_end == word:
1565-
if strcasecmp(word, cinf) == 0:
1566+
if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0:
15661567
data[0] = INF
15671568
elif strcasecmp(word, cneginf) == 0:
15681569
data[0] = NEGINF
@@ -1581,7 +1582,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int
15811582
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
15821583
parser.thousands, 1)
15831584
if errno != 0 or p_end[0] or p_end == word:
1584-
if strcasecmp(word, cinf) == 0:
1585+
if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0:
15851586
data[0] = INF
15861587
elif strcasecmp(word, cneginf) == 0:
15871588
data[0] = NEGINF

pandas/src/parse_helper.h

+24-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include <errno.h>
22
#include <float.h>
3+
#include "headers/portable.h"
34

45
static double xstrtod(const char *p, char **q, char decimal, char sci,
56
int skip_trailing, int *maybe_int);
@@ -39,22 +40,36 @@ int floatify(PyObject* str, double *result, int *maybe_int) {
3940

4041
if (!status) {
4142
/* handle inf/-inf */
42-
if (0 == strcmp(data, "-inf")) {
43-
*result = -HUGE_VAL;
44-
*maybe_int = 0;
45-
} else if (0 == strcmp(data, "inf")) {
46-
*result = HUGE_VAL;
47-
*maybe_int = 0;
43+
if (strlen(data) == 3) {
44+
if (0 == strcasecmp(data, "inf")) {
45+
*result = HUGE_VAL;
46+
*maybe_int = 0;
47+
} else {
48+
goto parsingerror;
49+
}
50+
} else if (strlen(data) == 4) {
51+
if (0 == strcasecmp(data, "-inf")) {
52+
*result = -HUGE_VAL;
53+
*maybe_int = 0;
54+
} else if (0 == strcasecmp(data, "+inf")) {
55+
*result = HUGE_VAL;
56+
*maybe_int = 0;
57+
} else {
58+
goto parsingerror;
59+
}
4860
} else {
49-
PyErr_SetString(PyExc_ValueError, "Unable to parse string");
50-
Py_XDECREF(tmp);
51-
return -1;
61+
goto parsingerror;
5262
}
5363
}
5464

5565
Py_XDECREF(tmp);
5666
return 0;
5767

68+
parsingerror:
69+
PyErr_SetString(PyExc_ValueError, "Unable to parse string");
70+
Py_XDECREF(tmp);
71+
return -1;
72+
5873
/*
5974
#if PY_VERSION_HEX >= 0x03000000
6075
return PyFloat_FromString(str);

pandas/tests/test_lib.py

+39
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,45 @@ def test_isinf_scalar(self):
188188
self.assertFalse(lib.isneginf_scalar(1))
189189
self.assertFalse(lib.isneginf_scalar('a'))
190190

191+
def test_maybe_convert_numeric_infinities(self):
192+
# see gh-13274
193+
infinities = ['inf', 'inF', 'iNf', 'Inf',
194+
'iNF', 'InF', 'INf', 'INF']
195+
na_values = set(['', 'NULL', 'nan'])
196+
197+
pos = np.array(['inf'], dtype=np.float64)
198+
neg = np.array(['-inf'], dtype=np.float64)
199+
200+
msg = "Unable to parse string"
201+
202+
for infinity in infinities:
203+
for maybe_int in (True, False):
204+
out = lib.maybe_convert_numeric(
205+
np.array([infinity], dtype=object),
206+
na_values, maybe_int)
207+
tm.assert_numpy_array_equal(out, pos)
208+
209+
out = lib.maybe_convert_numeric(
210+
np.array(['-' + infinity], dtype=object),
211+
na_values, maybe_int)
212+
tm.assert_numpy_array_equal(out, neg)
213+
214+
out = lib.maybe_convert_numeric(
215+
np.array([u(infinity)], dtype=object),
216+
na_values, maybe_int)
217+
tm.assert_numpy_array_equal(out, pos)
218+
219+
out = lib.maybe_convert_numeric(
220+
np.array(['+' + infinity], dtype=object),
221+
na_values, maybe_int)
222+
tm.assert_numpy_array_equal(out, pos)
223+
224+
# too many characters
225+
with tm.assertRaisesRegexp(ValueError, msg):
226+
lib.maybe_convert_numeric(
227+
np.array(['foo_' + infinity], dtype=object),
228+
na_values, maybe_int)
229+
191230

192231
class Testisscalar(tm.TestCase):
193232

0 commit comments

Comments
 (0)