pandas-dev · bennorth · Feb 2, 2016 · jreback · Feb 3, 2016 · bennorth
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -784,6 +784,7 @@ Bug Fixes
 - Bug in ``read_excel`` failing to read data with one column when ``squeeze=True`` (:issue:`12157`)
 - Bug in ``.groupby`` where a ``KeyError`` was not raised for a wrong column if there was only one row in the dataframe (:issue:`11741`)
 - Bug in ``.read_csv`` with dtype specified on empty data producing an error (:issue:`12048`)
+- Bug in ``.read_csv`` where strings like ``'2E'`` were treated as valid floats (:issue:`12237`)
 - Bug in building *pandas* with debugging symbols (:issue:`12123`)
 
 

diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -29,6 +29,7 @@
 import pandas.util.testing as tm
 import pandas as pd
 
+from pandas.core.common import AbstractMethodError
 from pandas.compat import parse_date
 import pandas.lib as lib
 from pandas import compat
@@ -2495,6 +2496,18 @@ def test_float_parser(self):
         expected = pd.DataFrame([[float(s) for s in data.split(',')]])
         tm.assert_frame_equal(result, expected)
 
+    def float_precision_choices(self):
+        raise AbstractMethodError(self)
+
+    def test_scientific_no_exponent(self):
+        # GH 12237 (see PR 12215)
+        df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']),
+                                   ('y', ['42e']), ('z', ['632E'])])
+        data = df.to_csv(index=False)
+        for prec in self.float_precision_choices():
+            df_roundtrip = self.read_csv(StringIO(data), float_precision=prec)
+            tm.assert_frame_equal(df_roundtrip, df)
+
     def test_int64_overflow(self):
         data = """ID
 00013007854817840016671868
@@ -2651,6 +2664,9 @@ def read_table(self, *args, **kwds):
         kwds['engine'] = 'python'
         return read_table(*args, **kwds)
 
+    def float_precision_choices(self):
+        return [None]
+
     def test_sniff_delimiter(self):
         text = """index|A|B|C
 foo|1|2|3
@@ -3409,6 +3425,9 @@ def test_variable_width_unicode(self):
 class CParserTests(ParserTests):
     """ base class for CParser Testsing """
 
+    def float_precision_choices(self):
+        return [None, 'high', 'round_trip']
+
     def test_buffer_overflow(self):
         # GH9205
         # test certain malformed input files that cause buffer overflows in

diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h
@@ -197,17 +197,23 @@ static double xstrtod(const char *str, char **endptr, char decimal,
     }
 
     // Process string of digits
+    num_digits = 0;
     n = 0;
     while (isdigit(*p))
     {
       n = n * 10 + (*p - '0');
+      num_digits++;
       p++;
     }
 
     if (negative)
       exponent -= n;
     else
       exponent += n;
+
+    // If no digits follow the 'e'/'E', un-consume it
+    if (num_digits == 0)
+        p--;
   }
 
 

diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -2225,17 +2225,23 @@ double xstrtod(const char *str, char **endptr, char decimal,
     }
 
     // Process string of digits
+    num_digits = 0;
     n = 0;
     while (isdigit(*p))
     {
       n = n * 10 + (*p - '0');
+      num_digits++;
       p++;
     }
 
     if (negative)
       exponent -= n;
     else
       exponent += n;
+
+    // If no digits follow the 'e'/'E', un-consume it
+    if (num_digits == 0)
+        p--;
   }
 
 
@@ -2396,17 +2402,23 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
         }
 
         // Process string of digits
+        num_digits = 0;
         n = 0;
         while (isdigit(*p))
         {
             n = n * 10 + (*p - '0');
+            num_digits++;
             p++;
         }
 
         if (negative)
             exponent -= n;
         else
             exponent += n;
+
+        // If no digits follow the 'e'/'E', un-consume it
+        if (num_digits == 0)
+            p--;
     }
 
     if (exponent > 308)

diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py
@@ -337,6 +337,13 @@ def test_convert_infs():
     assert (result.dtype == np.float64)
 
 
+def test_scientific_no_exponent():
+    # GH 12237 (see PR 12215)
+    arr = np.array(['42E', '2E', '99e', '6e'], dtype='O')
+    result = lib.maybe_convert_numeric(arr, set(), False, True)
+    assert np.all(np.isnan(result))
+
+
 def test_convert_objects_ints():
     # test that we can detect many kinds of integers
     dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']