BUG: Strings like '2E' are incorrectly parsed as valid floats

bennorth · jreback · commit 517c559bc08e · 2016-02-09T17:01:53.000-05:00
DataFrame({'x': [2.5], 'y': [42], 'z': ['2E']}) does not round-trip correctly. The string '2E' is interpreted as a valid float, but it should not be. This PR changes the three variants of `xstrtod()` to reject a string where no digits follow the 'e' or 'E', and includes tests for this case. Author: Ben North <ben@redfrontdoor.org> Closes #12215 from bennorth/BUG-float-parsing and squashes the following commits: 8d2b583 [Ben North] BUG: Reject empty-exponent strings as non-floats
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -784,6 +784,7 @@ Bug Fixes
 - Bug in ``read_excel`` failing to read data with one column when ``squeeze=True`` (:issue:`12157`)
 - Bug in ``.groupby`` where a ``KeyError`` was not raised for a wrong column if there was only one row in the dataframe (:issue:`11741`)
 - Bug in ``.read_csv`` with dtype specified on empty data producing an error (:issue:`12048`)
+- Bug in ``.read_csv`` where strings like ``'2E'`` are treated as valid floats (:issue:`12237`)
 - Bug in building *pandas* with debugging symbols (:issue:`12123`)
 
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -29,6 +29,7 @@
 import pandas.util.testing as tm
 import pandas as pd
 
+from pandas.core.common import AbstractMethodError
 from pandas.compat import parse_date
 import pandas.lib as lib
 from pandas import compat
@@ -2495,6 +2496,18 @@ def test_float_parser(self):
         expected = pd.DataFrame([[float(s) for s in data.split(',')]])
         tm.assert_frame_equal(result, expected)
 
+    def float_precision_choices(self):
+        raise AbstractMethodError(self)
+
+    def test_scientific_no_exponent(self):
+        # See PR 12215
+        df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']),
+                                   ('y', ['42e']), ('z', ['632E'])])
+        data = df.to_csv(index=False)
+        for prec in self.float_precision_choices():
+            df_roundtrip = self.read_csv(StringIO(data), float_precision=prec)
+            tm.assert_frame_equal(df_roundtrip, df)
+
     def test_int64_overflow(self):
         data = """ID
 00013007854817840016671868
@@ -2651,6 +2664,9 @@ def read_table(self, *args, **kwds):
         kwds['engine'] = 'python'
         return read_table(*args, **kwds)
 
+    def float_precision_choices(self):
+        return [None]
+
     def test_sniff_delimiter(self):
         text = """index|A|B|C
 foo|1|2|3
@@ -3409,6 +3425,9 @@ def test_variable_width_unicode(self):
 class CParserTests(ParserTests):
     """ base class for CParser Testsing """
 
+    def float_precision_choices(self):
+        return [None, 'high', 'round_trip']
+
     def test_buffer_overflow(self):
         # GH9205
         # test certain malformed input files that cause buffer overflows in
diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h
@@ -197,17 +197,23 @@ static double xstrtod(const char *str, char **endptr, char decimal,
     }
 
     // Process string of digits
+    num_digits = 0;
     n = 0;
     while (isdigit(*p))
     {
       n = n * 10 + (*p - '0');
+      num_digits++;
       p++;
     }
 
     if (negative)
       exponent -= n;
     else
       exponent += n;
+
+    // If no digits, after the 'e'/'E', un-consume it
+    if (num_digits == 0)
+        p--;
   }
 
 
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -2225,17 +2225,23 @@ double xstrtod(const char *str, char **endptr, char decimal,
     }
 
     // Process string of digits
+    num_digits = 0;
     n = 0;
     while (isdigit(*p))
     {
       n = n * 10 + (*p - '0');
+      num_digits++;
       p++;
     }
 
     if (negative)
       exponent -= n;
     else
       exponent += n;
+
+    // If no digits, after the 'e'/'E', un-consume it
+    if (num_digits == 0)
+        p--;
   }
 
 
@@ -2396,17 +2402,23 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
         }
 
         // Process string of digits
+        num_digits = 0;
         n = 0;
         while (isdigit(*p))
         {
             n = n * 10 + (*p - '0');
+            num_digits++;
             p++;
         }
 
         if (negative)
             exponent -= n;
         else
             exponent += n;
+
+        // If no digits, after the 'e'/'E', un-consume it
+        if (num_digits == 0)
+            p--;
     }
 
     if (exponent > 308)
diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py
@@ -337,6 +337,13 @@ def test_convert_infs():
     assert (result.dtype == np.float64)
 
 
+def test_scientific_no_exponent():
+    # See PR 12215
+    arr = np.array(['42E', '2E', '99e', '6e'], dtype='O')
+    result = lib.maybe_convert_numeric(arr, set(), False, True)
+    assert np.all(np.isnan(result))
+
+
 def test_convert_objects_ints():
     # test that we can detect many kinds of integers
     dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']

Original file line number	Diff line number	Diff line change
`@@ -197,17 +197,23 @@ static double xstrtod(const char *str, char **endptr, char decimal,`
`197`	`197`	`}`
`198`	`198`
`199`	`199`	`// Process string of digits`
	`200`	`+ num_digits = 0;`
`200`	`201`	`n = 0;`
`201`	`202`	`while (isdigit(*p))`
`202`	`203`	`{`
`203`	`204`	`n = n * 10 + (*p - '0');`
	`205`	`+ num_digits++;`
`204`	`206`	`p++;`
`205`	`207`	`}`
`206`	`208`
`207`	`209`	`if (negative)`
`208`	`210`	`exponent -= n;`
`209`	`211`	`else`
`210`	`212`	`exponent += n;`
	`213`	`+`
	`214`	`+ // If no digits, after the 'e'/'E', un-consume it`
	`215`	`+ if (num_digits == 0)`
	`216`	`+ p--;`
`211`	`217`	`}`
`212`	`218`
`213`	`219`
Original file line number	Diff line number	Diff line change
`@@ -2225,17 +2225,23 @@ double xstrtod(const char *str, char **endptr, char decimal,`
`2225`	`2225`	`}`
`2226`	`2226`
`2227`	`2227`	`// Process string of digits`
	`2228`	`+ num_digits = 0;`
`2228`	`2229`	`n = 0;`
`2229`	`2230`	`while (isdigit(*p))`
`2230`	`2231`	`{`
`2231`	`2232`	`n = n * 10 + (*p - '0');`
	`2233`	`+ num_digits++;`
`2232`	`2234`	`p++;`
`2233`	`2235`	`}`
`2234`	`2236`
`2235`	`2237`	`if (negative)`
`2236`	`2238`	`exponent -= n;`
`2237`	`2239`	`else`
`2238`	`2240`	`exponent += n;`
	`2241`	`+`
	`2242`	`+ // If no digits, after the 'e'/'E', un-consume it`
	`2243`	`+ if (num_digits == 0)`
	`2244`	`+ p--;`
`2239`	`2245`	`}`
`2240`	`2246`
`2241`	`2247`
`@@ -2396,17 +2402,23 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,`
`2396`	`2402`	`}`
`2397`	`2403`
`2398`	`2404`	`// Process string of digits`
	`2405`	`+ num_digits = 0;`
`2399`	`2406`	`n = 0;`
`2400`	`2407`	`while (isdigit(*p))`
`2401`	`2408`	`{`
`2402`	`2409`	`n = n * 10 + (*p - '0');`
	`2410`	`+ num_digits++;`
`2403`	`2411`	`p++;`
`2404`	`2412`	`}`
`2405`	`2413`
`2406`	`2414`	`if (negative)`
`2407`	`2415`	`exponent -= n;`
`2408`	`2416`	`else`
`2409`	`2417`	`exponent += n;`
	`2418`	`+`
	`2419`	`+ // If no digits, after the 'e'/'E', un-consume it`
	`2420`	`+ if (num_digits == 0)`
	`2421`	`+ p--;`
`2410`	`2422`	`}`
`2411`	`2423`
`2412`	`2424`	`if (exponent > 308)`