From 8d2b58351ec126322d2127170fa444f0ccd8aa72 Mon Sep 17 00:00:00 2001
From: Ben North <ben@redfrontdoor.org>
Date: Tue, 2 Feb 2016 21:54:23 +0000
Subject: [PATCH] BUG: Reject empty-exponent strings as non-floats

The man page for strtod(3) says: "A decimal exponent consists of an 'E'
or 'e', followed by an optional plus or minus sign, followed by a
NONEMPTY sequence of decimal digits".  (Emphasis on 'nonempty' added.)
Currently, Pandas parses the string '2E' as a valid float, interpreting
it as '2E0', i.e., 2.0.  It should reject '2E'.

Update the functions

    precise_xstrtod()
    xstrtod() (two copies)

such that they require at least one digit after the 'e' or 'E'.  If
there are no digits, then there is not a valid exponent, and in that
case, we rewind the 'next character' pointer back to point to the 'e' or
'E'.

Add tests:

    test_scientific_no_exponent() in tests/test_tseries.py

    ParserTests.test_scientific_no_exponent in io/tests/test_parsers.py
    (tests behaviour under C and Python engines; and for the three
    float_precision variants under the C engine)
---
 doc/source/whatsnew/v0.18.0.txt |  1 +
 pandas/io/tests/test_parsers.py | 19 +++++++++++++++++++
 pandas/src/parse_helper.h       |  6 ++++++
 pandas/src/parser/tokenizer.c   | 12 ++++++++++++
 pandas/tests/test_tseries.py    |  7 +++++++
 5 files changed, 45 insertions(+)

diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
index 421822380c2da..ac6267a15b513 100644
--- a/doc/source/whatsnew/v0.18.0.txt
+++ b/doc/source/whatsnew/v0.18.0.txt
@@ -784,6 +784,7 @@ Bug Fixes
 - Bug in ``read_excel`` failing to read data with one column when ``squeeze=True`` (:issue:`12157`)
 - Bug in ``.groupby`` where a ``KeyError`` was not raised for a wrong column if there was only one row in the dataframe (:issue:`11741`)
 - Bug in ``.read_csv`` with dtype specified on empty data producing an error (:issue:`12048`)
+- Bug in ``.read_csv`` where strings like ``'2E'`` are treated as valid floats (:issue:`12237`)
 - Bug in building *pandas* with debugging symbols (:issue:`12123`)
 
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 7c68a44874631..d3020e337322b 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -29,6 +29,7 @@
 import pandas.util.testing as tm
 import pandas as pd
 
+from pandas.core.common import AbstractMethodError
 from pandas.compat import parse_date
 import pandas.lib as lib
 from pandas import compat
@@ -2495,6 +2496,18 @@ def test_float_parser(self):
         expected = pd.DataFrame([[float(s) for s in data.split(',')]])
         tm.assert_frame_equal(result, expected)
 
+    def float_precision_choices(self):
+        raise AbstractMethodError(self)
+
+    def test_scientific_no_exponent(self):
+        # See PR 12215
+        df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']),
+                                   ('y', ['42e']), ('z', ['632E'])])
+        data = df.to_csv(index=False)
+        for prec in self.float_precision_choices():
+            df_roundtrip = self.read_csv(StringIO(data), float_precision=prec)
+            tm.assert_frame_equal(df_roundtrip, df)
+
     def test_int64_overflow(self):
         data = """ID
 00013007854817840016671868
@@ -2651,6 +2664,9 @@ def read_table(self, *args, **kwds):
         kwds['engine'] = 'python'
         return read_table(*args, **kwds)
 
+    def float_precision_choices(self):
+        return [None]
+
     def test_sniff_delimiter(self):
         text = """index|A|B|C
 foo|1|2|3
@@ -3409,6 +3425,9 @@ def test_variable_width_unicode(self):
 class CParserTests(ParserTests):
     """ base class for CParser Testsing """
 
+    def float_precision_choices(self):
+        return [None, 'high', 'round_trip']
+
     def test_buffer_overflow(self):
         # GH9205
         # test certain malformed input files that cause buffer overflows in
diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h
index 2cb1a7f017c62..d47e448700029 100644
--- a/pandas/src/parse_helper.h
+++ b/pandas/src/parse_helper.h
@@ -197,10 +197,12 @@ static double xstrtod(const char *str, char **endptr, char decimal,
     }
 
     // Process string of digits
+    num_digits = 0;
     n = 0;
     while (isdigit(*p))
     {
       n = n * 10 + (*p - '0');
+      num_digits++;
       p++;
     }
 
@@ -208,6 +210,10 @@ static double xstrtod(const char *str, char **endptr, char decimal,
       exponent -= n;
     else
       exponent += n;
+
+    // If no digits after the 'e'/'E', un-consume it
+    if (num_digits == 0)
+        p--;
   }
 
 
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
index 2e4a804a577b5..8fd3674047301 100644
--- a/pandas/src/parser/tokenizer.c
+++ b/pandas/src/parser/tokenizer.c
@@ -2225,10 +2225,12 @@ double xstrtod(const char *str, char **endptr, char decimal,
     }
 
     // Process string of digits
+    num_digits = 0;
     n = 0;
     while (isdigit(*p))
     {
       n = n * 10 + (*p - '0');
+      num_digits++;
       p++;
     }
 
@@ -2236,6 +2238,10 @@ double xstrtod(const char *str, char **endptr, char decimal,
       exponent -= n;
     else
       exponent += n;
+
+    // If no digits after the 'e'/'E', un-consume it
+    if (num_digits == 0)
+        p--;
   }
 
 
@@ -2396,10 +2402,12 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
         }
 
         // Process string of digits
+        num_digits = 0;
         n = 0;
         while (isdigit(*p))
         {
             n = n * 10 + (*p - '0');
+            num_digits++;
             p++;
         }
 
@@ -2407,6 +2415,10 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
             exponent -= n;
         else
             exponent += n;
+
+        // If no digits after the 'e'/'E', un-consume it
+        if (num_digits == 0)
+            p--;
     }
 
     if (exponent > 308)
diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py
index 8422759192cc3..f3784a246eb4b 100644
--- a/pandas/tests/test_tseries.py
+++ b/pandas/tests/test_tseries.py
@@ -337,6 +337,13 @@ def test_convert_infs():
     assert (result.dtype == np.float64)
 
 
+def test_scientific_no_exponent():
+    # See PR 12215
+    arr = np.array(['42E', '2E', '99e', '6e'], dtype='O')
+    result = lib.maybe_convert_numeric(arr, set(), False, True)
+    assert np.all(np.isnan(result))
+
+
 def test_convert_objects_ints():
     # test that we can detect many kinds of integers
     dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']