Skip to content

BUG: Strings like '2E' are incorrectly parsed as valid floats #12215

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,7 @@ Bug Fixes
- Bug in ``read_excel`` failing to read data with one column when ``squeeze=True`` (:issue:`12157`)
- Bug in ``.groupby`` where a ``KeyError`` was not raised for a wrong column if there was only one row in the dataframe (:issue:`11741`)
- Bug in ``.read_csv`` with dtype specified on empty data producing an error (:issue:`12048`)
- Bug in ``.read_csv`` where strings like ``'2E'`` are treated as valid floats (:issue:`12237`)
- Bug in building *pandas* with debugging symbols (:issue:`12123`)


Expand Down
19 changes: 19 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import pandas.util.testing as tm
import pandas as pd

from pandas.core.common import AbstractMethodError
from pandas.compat import parse_date
import pandas.lib as lib
from pandas import compat
Expand Down Expand Up @@ -2495,6 +2496,18 @@ def test_float_parser(self):
expected = pd.DataFrame([[float(s) for s in data.split(',')]])
tm.assert_frame_equal(result, expected)

def float_precision_choices(self):
raise AbstractMethodError(self)

def test_scientific_no_exponent(self):
# See PR 12215
df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']),
('y', ['42e']), ('z', ['632E'])])
data = df.to_csv(index=False)
for prec in self.float_precision_choices():
df_roundtrip = self.read_csv(StringIO(data), float_precision=prec)
tm.assert_frame_equal(df_roundtrip, df)

def test_int64_overflow(self):
data = """ID
00013007854817840016671868
Expand Down Expand Up @@ -2651,6 +2664,9 @@ def read_table(self, *args, **kwds):
kwds['engine'] = 'python'
return read_table(*args, **kwds)

def float_precision_choices(self):
return [None]

def test_sniff_delimiter(self):
text = """index|A|B|C
foo|1|2|3
Expand Down Expand Up @@ -3409,6 +3425,9 @@ def test_variable_width_unicode(self):
class CParserTests(ParserTests):
""" base class for CParser Testsing """

def float_precision_choices(self):
return [None, 'high', 'round_trip']

def test_buffer_overflow(self):
# GH9205
# test certain malformed input files that cause buffer overflows in
Expand Down
6 changes: 6 additions & 0 deletions pandas/src/parse_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -197,17 +197,23 @@ static double xstrtod(const char *str, char **endptr, char decimal,
}

// Process string of digits
num_digits = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this need a declaration as to type?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No --- it's re-using the num_digits variable declared earlier (line 130).

n = 0;
while (isdigit(*p))
{
n = n * 10 + (*p - '0');
num_digits++;
p++;
}

if (negative)
exponent -= n;
else
exponent += n;

// If no digits, after the 'e'/'E', un-consume it
if (num_digits == 0)
p--;
}


Expand Down
12 changes: 12 additions & 0 deletions pandas/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -2225,17 +2225,23 @@ double xstrtod(const char *str, char **endptr, char decimal,
}

// Process string of digits
num_digits = 0;
n = 0;
while (isdigit(*p))
{
n = n * 10 + (*p - '0');
num_digits++;
p++;
}

if (negative)
exponent -= n;
else
exponent += n;

// If no digits, after the 'e'/'E', un-consume it
if (num_digits == 0)
p--;
}


Expand Down Expand Up @@ -2396,17 +2402,23 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
}

// Process string of digits
num_digits = 0;
n = 0;
while (isdigit(*p))
{
n = n * 10 + (*p - '0');
num_digits++;
p++;
}

if (negative)
exponent -= n;
else
exponent += n;

// If no digits, after the 'e'/'E', un-consume it
if (num_digits == 0)
p--;
}

if (exponent > 308)
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/test_tseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,13 @@ def test_convert_infs():
assert (result.dtype == np.float64)


def test_scientific_no_exponent():
# See PR 12215
arr = np.array(['42E', '2E', '99e', '6e'], dtype='O')
result = lib.maybe_convert_numeric(arr, set(), False, True)
assert np.all(np.isnan(result))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ahh, you fixed floatify, I get it now

use self.assertTrue(.....)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. (There are plenty of other bare asserts in that file though; shall I create an issue to update them?)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just pls create another issue, we shouldn't have any bare asserts.



def test_convert_objects_ints():
# test that we can detect many kinds of integers
dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']
Expand Down