Skip to content

Commit 38876df

Browse files
evanpw authored and jreback committed
BUG: Strings with exponent but no decimal point parsed as integers in python csv engine (GH 9565)
1 parent b206548 commit 38876df

File tree

4 files changed

+68
-39
lines changed

4 files changed

+68
-39
lines changed

doc/source/whatsnew/v0.16.2.txt

+2
Original file line number | Diff line number | Diff line change
@@ -151,6 +151,8 @@ Bug Fixes
151151

152152
- Bug in `plot` not defaulting to matplotlib `axes.grid` setting (:issue:`9792`)
153153

154+
- Bug causing strings that contain an exponent but no decimal point (e.g. ``45e-1``) to be parsed as integers instead of floats in the Python CSV parser (:issue:`9565`)
155+
154156
- Bug in ``Series.align`` resets ``name`` when ``fill_value`` is specified (:issue:`10067`)
155157
- Bug in ``read_csv`` causing index name not to be set on an empty DataFrame (:issue:`10184`)
156158
- Bug in ``SparseSeries.abs`` resets ``name`` (:issue:`10241`)

pandas/io/tests/test_parsers.py

+42-18
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@
3535
from numpy.testing.decorators import slow
3636
from numpy.testing import assert_array_equal
3737

38-
from pandas.parser import OverflowError, CParserError
38+
import pandas.parser
3939

4040

4141
class ParserTests(object):
@@ -1649,7 +1649,7 @@ def test_read_table_buglet_4x_multiindex(self):
16491649
# Temporarily copied to TestPythonParser.
16501650
# Here test that CParserError is raised:
16511651

1652-
with tm.assertRaises(CParserError):
1652+
with tm.assertRaises(pandas.parser.CParserError):
16531653
text = """ A B C D E
16541654
one two three four
16551655
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
@@ -2301,6 +2301,46 @@ def test_empty_with_index(self):
23012301
expected = DataFrame([], columns=['y'], index=Index([], name='x'))
23022302
tm.assert_frame_equal(result, expected)
23032303

2304+
def test_float_parser(self):
2305+
# GH 9565
2306+
data = '45e-1,4.5,45.,inf,-inf'
2307+
result = self.read_csv(StringIO(data), header=None)
2308+
expected = pd.DataFrame([[float(s) for s in data.split(',')]])
2309+
tm.assert_frame_equal(result, expected)
2310+
2311+
def test_int64_overflow(self):
2312+
data = """ID
2313+
00013007854817840016671868
2314+
00013007854817840016749251
2315+
00013007854817840016754630
2316+
00013007854817840016781876
2317+
00013007854817840017028824
2318+
00013007854817840017963235
2319+
00013007854817840018860166"""
2320+
2321+
result = self.read_csv(StringIO(data))
2322+
self.assertTrue(result['ID'].dtype == object)
2323+
2324+
self.assertRaises((OverflowError, pandas.parser.OverflowError),
2325+
self.read_csv, StringIO(data),
2326+
converters={'ID' : np.int64})
2327+
2328+
# Just inside int64 range: parse as integer
2329+
i_max = np.iinfo(np.int64).max
2330+
i_min = np.iinfo(np.int64).min
2331+
for x in [i_max, i_min]:
2332+
result = pd.read_csv(StringIO(str(x)), header=None)
2333+
expected = pd.DataFrame([x])
2334+
tm.assert_frame_equal(result, expected)
2335+
2336+
# Just outside int64 range: parse as string
2337+
too_big = i_max + 1
2338+
too_small = i_min - 1
2339+
for x in [too_big, too_small]:
2340+
result = pd.read_csv(StringIO(str(x)), header=None)
2341+
expected = pd.DataFrame([str(x)])
2342+
tm.assert_frame_equal(result, expected)
2343+
23042344

23052345
class TestPythonParser(ParserTests, tm.TestCase):
23062346
def test_negative_skipfooter_raises(self):
@@ -3575,22 +3615,6 @@ def test_disable_bool_parsing(self):
35753615
result = read_csv(StringIO(data), dtype=object, na_filter=False)
35763616
self.assertEqual(result['B'][2], '')
35773617

3578-
def test_int64_overflow(self):
3579-
data = """ID
3580-
00013007854817840016671868
3581-
00013007854817840016749251
3582-
00013007854817840016754630
3583-
00013007854817840016781876
3584-
00013007854817840017028824
3585-
00013007854817840017963235
3586-
00013007854817840018860166"""
3587-
3588-
result = read_csv(StringIO(data))
3589-
self.assertTrue(result['ID'].dtype == object)
3590-
3591-
self.assertRaises(OverflowError, read_csv, StringIO(data),
3592-
dtype='i8')
3593-
35943618
def test_euro_decimal_format(self):
35953619
data = """Id;Number1;Number2;Text1;Text2;Number3
35963620
1;1521,1541;187101,9543;ABC;poi;4,738797819

pandas/src/inference.pyx

+12-15
Original file line number | Diff line number | Diff line change
@@ -514,11 +514,10 @@ def is_period_array(ndarray[object] values):
514514

515515

516516
cdef extern from "parse_helper.h":
517-
inline int floatify(object, double *result) except -1
518-
519-
cdef double fINT64_MAX = <double> INT64_MAX
520-
cdef double fINT64_MIN = <double> INT64_MIN
517+
inline int floatify(object, double *result, int *maybe_int) except -1
521518

519+
cdef int64_t iINT64_MAX = <int64_t> INT64_MAX
520+
cdef int64_t iINT64_MIN = <int64_t> INT64_MIN
522521

523522
def maybe_convert_numeric(object[:] values, set na_values,
524523
bint convert_empty=True, bint coerce_numeric=False):
@@ -527,7 +526,7 @@ def maybe_convert_numeric(object[:] values, set na_values,
527526
convert to proper dtype array
528527
'''
529528
cdef:
530-
int status
529+
int status, maybe_int
531530
Py_ssize_t i, n = values.size
532531
ndarray[float64_t] floats = np.empty(n, dtype='f8')
533532
ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
@@ -569,18 +568,16 @@ def maybe_convert_numeric(object[:] values, set na_values,
569568
seen_complex = True
570569
else:
571570
try:
572-
status = floatify(val, &fval)
571+
status = floatify(val, &fval, &maybe_int)
573572
floats[i] = fval
574573
if not seen_float:
575-
if '.' in val or fval == INF or fval == NEGINF:
576-
seen_float = True
577-
elif 'inf' in val: # special case to handle +/-inf
578-
seen_float = True
579-
elif fval < fINT64_MAX and fval > fINT64_MIN:
580-
try:
581-
ints[i] = int(val)
582-
except ValueError:
583-
ints[i] = <int64_t> fval
574+
if maybe_int:
575+
as_int = int(val)
576+
577+
if as_int <= iINT64_MAX and as_int >= iINT64_MIN:
578+
ints[i] = as_int
579+
else:
580+
raise ValueError('integer out of range')
584581
else:
585582
seen_float = True
586583
except:

pandas/src/parse_helper.h

+12-6
Original file line number | Diff line number | Diff line change
@@ -2,13 +2,13 @@
22
#include <float.h>
33

44
static double xstrtod(const char *p, char **q, char decimal, char sci,
5-
int skip_trailing);
5+
int skip_trailing, int *maybe_int);
66

7-
int to_double(char *item, double *p_value, char sci, char decimal)
7+
int to_double(char *item, double *p_value, char sci, char decimal, int *maybe_int)
88
{
99
char *p_end;
1010

11-
*p_value = xstrtod(item, &p_end, decimal, sci, 1);
11+
*p_value = xstrtod(item, &p_end, decimal, sci, 1, maybe_int);
1212

1313
return (errno == 0) && (!*p_end);
1414
}
@@ -18,7 +18,7 @@ int to_double(char *item, double *p_value, char sci, char decimal)
1818
#define PyBytes_AS_STRING PyString_AS_STRING
1919
#endif
2020

21-
int floatify(PyObject* str, double *result) {
21+
int floatify(PyObject* str, double *result, int *maybe_int) {
2222
int status;
2323
char *data;
2424
PyObject* tmp = NULL;
@@ -35,14 +35,16 @@ int floatify(PyObject* str, double *result) {
3535
return -1;
3636
}
3737

38-
status = to_double(data, result, sci, dec);
38+
status = to_double(data, result, sci, dec, maybe_int);
3939

4040
if (!status) {
4141
/* handle inf/-inf */
4242
if (0 == strcmp(data, "-inf")) {
4343
*result = -HUGE_VAL;
44+
*maybe_int = 0;
4445
} else if (0 == strcmp(data, "inf")) {
4546
*result = HUGE_VAL;
47+
*maybe_int = 0;
4648
} else {
4749
PyErr_SetString(PyExc_ValueError, "Unable to parse string");
4850
Py_XDECREF(tmp);
@@ -117,7 +119,7 @@ PANDAS_INLINE void uppercase(char *p) {
117119

118120

119121
static double xstrtod(const char *str, char **endptr, char decimal,
120-
char sci, int skip_trailing)
122+
char sci, int skip_trailing, int *maybe_int)
121123
{
122124
double number;
123125
int exponent;
@@ -129,6 +131,7 @@ static double xstrtod(const char *str, char **endptr, char decimal,
129131
int num_decimals;
130132

131133
errno = 0;
134+
*maybe_int = 1;
132135

133136
// Skip leading whitespace
134137
while (isspace(*p)) p++;
@@ -157,6 +160,7 @@ static double xstrtod(const char *str, char **endptr, char decimal,
157160
// Process decimal part
158161
if (*p == decimal)
159162
{
163+
*maybe_int = 0;
160164
p++;
161165

162166
while (isdigit(*p))
@@ -182,6 +186,8 @@ static double xstrtod(const char *str, char **endptr, char decimal,
182186
// Process an exponent string
183187
if (toupper(*p) == toupper(sci))
184188
{
189+
*maybe_int = 0;
190+
185191
// Handle optional sign
186192
negative = 0;
187193
switch (*++p)

0 commit comments

Comments
 (0)