From 8d2b58351ec126322d2127170fa444f0ccd8aa72 Mon Sep 17 00:00:00 2001 From: Ben North Date: Tue, 2 Feb 2016 21:54:23 +0000 Subject: [PATCH] BUG: Reject empty-exponent strings as non-floats The man page for strtod(3) says: "A decimal exponent consists of an 'E' or 'e', followed by an optional plus or minus sign, followed by a NONEMPTY sequence of decimal digits". (Emphasis on 'nonempty' added.) Currently, Pandas parses the string '2E' as a valid float, interpreting it as '2E0', i.e., 2.0. It should reject '2E'. Update the functions precise_xstrtod() and xstrtod() (two copies) such that they require at least one digit after the 'e' or 'E'. If there are no digits, then there is not a valid exponent, and in that case, we rewind the 'next character' pointer back to point to the 'e' or 'E'. Add tests: test_scientific_no_exponent() in tests/test_tseries.py ParserTests.test_scientific_no_exponent in io/tests/test_parsers.py (tests behaviour under C and Python engines; and for the three float_precision variants under the C engine) --- doc/source/whatsnew/v0.18.0.txt | 1 + pandas/io/tests/test_parsers.py | 19 +++++++++++++++++++ pandas/src/parse_helper.h | 6 ++++++ pandas/src/parser/tokenizer.c | 12 ++++++++++++ pandas/tests/test_tseries.py | 7 +++++++ 5 files changed, 45 insertions(+) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 421822380c2da..ac6267a15b513 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -784,6 +784,7 @@ Bug Fixes - Bug in ``read_excel`` failing to read data with one column when ``squeeze=True`` (:issue:`12157`) - Bug in ``.groupby`` where a ``KeyError`` was not raised for a wrong column if there was only one row in the dataframe (:issue:`11741`) - Bug in ``.read_csv`` with dtype specified on empty data producing an error (:issue:`12048`) +- Bug in ``.read_csv`` where strings like ``'2E'`` are treated as valid floats (:issue:`12237`) - Bug in building *pandas* with debugging symbols 
(:issue:`12123`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 7c68a44874631..d3020e337322b 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -29,6 +29,7 @@ import pandas.util.testing as tm import pandas as pd +from pandas.core.common import AbstractMethodError from pandas.compat import parse_date import pandas.lib as lib from pandas import compat @@ -2495,6 +2496,18 @@ def test_float_parser(self): expected = pd.DataFrame([[float(s) for s in data.split(',')]]) tm.assert_frame_equal(result, expected) + def float_precision_choices(self): + raise AbstractMethodError(self) + + def test_scientific_no_exponent(self): + # See PR 12215 + df = DataFrame.from_items([('w', ['2e']), ('x', ['3E']), + ('y', ['42e']), ('z', ['632E'])]) + data = df.to_csv(index=False) + for prec in self.float_precision_choices(): + df_roundtrip = self.read_csv(StringIO(data), float_precision=prec) + tm.assert_frame_equal(df_roundtrip, df) + def test_int64_overflow(self): data = """ID 00013007854817840016671868 @@ -2651,6 +2664,9 @@ def read_table(self, *args, **kwds): kwds['engine'] = 'python' return read_table(*args, **kwds) + def float_precision_choices(self): + return [None] + def test_sniff_delimiter(self): text = """index|A|B|C foo|1|2|3 @@ -3409,6 +3425,9 @@ def test_variable_width_unicode(self): class CParserTests(ParserTests): """ base class for CParser Testsing """ + def float_precision_choices(self): + return [None, 'high', 'round_trip'] + def test_buffer_overflow(self): # GH9205 # test certain malformed input files that cause buffer overflows in diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h index 2cb1a7f017c62..d47e448700029 100644 --- a/pandas/src/parse_helper.h +++ b/pandas/src/parse_helper.h @@ -197,10 +197,12 @@ static double xstrtod(const char *str, char **endptr, char decimal, } // Process string of digits + num_digits = 0; n = 0; while (isdigit(*p)) { n = n * 10 + (*p - '0'); + 
num_digits++; p++; } @@ -208,6 +210,10 @@ static double xstrtod(const char *str, char **endptr, char decimal, exponent -= n; else exponent += n; + + // If no digits, after the 'e'/'E', un-consume it + if (num_digits == 0) + p--; } diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 2e4a804a577b5..8fd3674047301 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -2225,10 +2225,12 @@ double xstrtod(const char *str, char **endptr, char decimal, } // Process string of digits + num_digits = 0; n = 0; while (isdigit(*p)) { n = n * 10 + (*p - '0'); + num_digits++; p++; } @@ -2236,6 +2238,10 @@ double xstrtod(const char *str, char **endptr, char decimal, exponent -= n; else exponent += n; + + // If no digits, after the 'e'/'E', un-consume it + if (num_digits == 0) + p--; } @@ -2396,10 +2402,12 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, } // Process string of digits + num_digits = 0; n = 0; while (isdigit(*p)) { n = n * 10 + (*p - '0'); + num_digits++; p++; } @@ -2407,6 +2415,10 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, exponent -= n; else exponent += n; + + // If no digits, after the 'e'/'E', un-consume it + if (num_digits == 0) + p--; } if (exponent > 308) diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index 8422759192cc3..f3784a246eb4b 100644 --- a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -337,6 +337,13 @@ def test_convert_infs(): assert (result.dtype == np.float64) +def test_scientific_no_exponent(): + # See PR 12215 + arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False, True) + assert np.all(np.isnan(result)) + + def test_convert_objects_ints(): # test that we can detect many kinds of integers dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8']