Skip to content

Commit 517c559

Browse files
bennorth authored and jreback committed
BUG: Strings like '2E' are incorrectly parsed as valid floats
DataFrame({'x': [2.5], 'y': [42], 'z': ['2E']}) does not round-trip correctly. The string '2E' is interpreted as a valid float, but it should not be. This PR changes the three variants of `xstrtod()` to reject a string where no digits follow the 'e' or 'E', and includes tests for this case. Author: Ben North <[email protected]> Closes #12215 from bennorth/BUG-float-parsing and squashes the following commits: 8d2b583 [Ben North] BUG: Reject empty-exponent strings as non-floats
1 parent fe201a2 commit 517c559

File tree

5 files changed

+45
-0
lines changed

5 files changed

+45
-0
lines changed

doc/source/whatsnew/v0.18.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -784,6 +784,7 @@ Bug Fixes
784784
- Bug in ``read_excel`` failing to read data with one column when ``squeeze=True`` (:issue:`12157`)
785785
- Bug in ``.groupby`` where a ``KeyError`` was not raised for a wrong column if there was only one row in the dataframe (:issue:`11741`)
786786
- Bug in ``.read_csv`` with dtype specified on empty data producing an error (:issue:`12048`)
787+
- Bug in ``.read_csv`` where strings like ``'2E'`` are treated as valid floats (:issue:`12237`)
787788
- Bug in building *pandas* with debugging symbols (:issue:`12123`)
788789

789790

pandas/io/tests/test_parsers.py

+19
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import pandas.util.testing as tm
3030
import pandas as pd
3131

32+
from pandas.core.common import AbstractMethodError
3233
from pandas.compat import parse_date
3334
import pandas.lib as lib
3435
from pandas import compat
@@ -2495,6 +2496,18 @@ def test_float_parser(self):
24952496
expected = pd.DataFrame([[float(s) for s in data.split(',')]])
24962497
tm.assert_frame_equal(result, expected)
24972498

2499+
def float_precision_choices(self):
2500+
raise AbstractMethodError(self)
2501+
2502+
def test_scientific_no_exponent(self):
2503+
# See PR 12215
2504+
df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']),
2505+
('y', ['42e']), ('z', ['632E'])])
2506+
data = df.to_csv(index=False)
2507+
for prec in self.float_precision_choices():
2508+
df_roundtrip = self.read_csv(StringIO(data), float_precision=prec)
2509+
tm.assert_frame_equal(df_roundtrip, df)
2510+
24982511
def test_int64_overflow(self):
24992512
data = """ID
25002513
00013007854817840016671868
@@ -2651,6 +2664,9 @@ def read_table(self, *args, **kwds):
26512664
kwds['engine'] = 'python'
26522665
return read_table(*args, **kwds)
26532666

2667+
def float_precision_choices(self):
2668+
return [None]
2669+
26542670
def test_sniff_delimiter(self):
26552671
text = """index|A|B|C
26562672
foo|1|2|3
@@ -3409,6 +3425,9 @@ def test_variable_width_unicode(self):
34093425
class CParserTests(ParserTests):
34103426
""" base class for CParser Testsing """
34113427

3428+
def float_precision_choices(self):
3429+
return [None, 'high', 'round_trip']
3430+
34123431
def test_buffer_overflow(self):
34133432
# GH9205
34143433
# test certain malformed input files that cause buffer overflows in

pandas/src/parse_helper.h

+6
Original file line numberDiff line numberDiff line change
@@ -197,17 +197,23 @@ static double xstrtod(const char *str, char **endptr, char decimal,
197197
}
198198

199199
// Process string of digits
200+
num_digits = 0;
200201
n = 0;
201202
while (isdigit(*p))
202203
{
203204
n = n * 10 + (*p - '0');
205+
num_digits++;
204206
p++;
205207
}
206208

207209
if (negative)
208210
exponent -= n;
209211
else
210212
exponent += n;
213+
214+
// If no digits, after the 'e'/'E', un-consume it
215+
if (num_digits == 0)
216+
p--;
211217
}
212218

213219

pandas/src/parser/tokenizer.c

+12
Original file line numberDiff line numberDiff line change
@@ -2225,17 +2225,23 @@ double xstrtod(const char *str, char **endptr, char decimal,
22252225
}
22262226

22272227
// Process string of digits
2228+
num_digits = 0;
22282229
n = 0;
22292230
while (isdigit(*p))
22302231
{
22312232
n = n * 10 + (*p - '0');
2233+
num_digits++;
22322234
p++;
22332235
}
22342236

22352237
if (negative)
22362238
exponent -= n;
22372239
else
22382240
exponent += n;
2241+
2242+
// If no digits, after the 'e'/'E', un-consume it
2243+
if (num_digits == 0)
2244+
p--;
22392245
}
22402246

22412247

@@ -2396,17 +2402,23 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
23962402
}
23972403

23982404
// Process string of digits
2405+
num_digits = 0;
23992406
n = 0;
24002407
while (isdigit(*p))
24012408
{
24022409
n = n * 10 + (*p - '0');
2410+
num_digits++;
24032411
p++;
24042412
}
24052413

24062414
if (negative)
24072415
exponent -= n;
24082416
else
24092417
exponent += n;
2418+
2419+
// If no digits, after the 'e'/'E', un-consume it
2420+
if (num_digits == 0)
2421+
p--;
24102422
}
24112423

24122424
if (exponent > 308)

pandas/tests/test_tseries.py

+7
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,13 @@ def test_convert_infs():
337337
assert (result.dtype == np.float64)
338338

339339

340+
def test_scientific_no_exponent():
341+
# See PR 12215
342+
arr = np.array(['42E', '2E', '99e', '6e'], dtype='O')
343+
result = lib.maybe_convert_numeric(arr, set(), False, True)
344+
assert np.all(np.isnan(result))
345+
346+
340347
def test_convert_objects_ints():
341348
# test that we can detect many kinds of integers
342349
dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']

0 commit comments

Comments
 (0)