diff --git a/doc/source/io.rst b/doc/source/io.rst index 273cbd5daae7d..b467e6243399a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -176,7 +176,12 @@ They can take a number of arguments: - ``mangle_dupe_cols``: boolean, default True, then duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X' - ``tupleize_cols``: boolean, default False, if False, convert a list of tuples - to a multi-index of columns, otherwise, leave the column index as a list of tuples + to a multi-index of columns, otherwise, leave the column index as a list of + tuples + - ``float_precision`` : string, default None. Specifies which converter the C + engine should use for floating-point values. The options are None for the + ordinary converter, 'high' for the high-precision converter, and + 'round_trip' for the round-trip converter. .. ipython:: python :suppress: @@ -512,6 +517,23 @@ data columns: specify `index_col` as a column label rather then as an index on the resulting frame. +Specifying method for floating-point conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The parameter ``float_precision`` can be specified in order to use +a specific floating-point converter during parsing with the C engine. +The options are the ordinary converter, the high-precision converter, and +the round-trip converter (which is guaranteed to round-trip values after +writing to a file). For example: + +.. 
ipython:: python + + val = '0.3066101993807095471566981359501369297504425048828125' + data = 'a,b,c\n1,2,{0}'.format(val) + abs(pd.read_csv(StringIO(data), engine='c', float_precision=None)['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', float_precision='high')['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', float_precision='round_trip')['c'][0] - float(val)) + + Date Parsing Functions ~~~~~~~~~~~~~~~~~~~~~~ Finally, the parser allows you can specify a custom ``date_parser`` function to diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 74cffa7859a1d..6eaeccf89059d 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -631,6 +631,8 @@ Enhancements - Added support for ``c``, ``colormap`` and ``colorbar`` arguments for ``DataFrame.plot`` with ``kind='scatter'`` (:issue:`7780`) +- ``read_csv`` now has a keyword parameter ``float_precision`` which specifies which floating-point + converter the C engine should use during parsing, see :ref:`io` (:issue:`8002`, :issue:`8044`) - ``PeriodIndex`` supports ``resolution`` as the same as ``DatetimeIndex`` (:issue:`7708`) - ``pandas.tseries.holiday`` has added support for additional holidays and ways to observe holidays (:issue:`7070`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 22f076d3aabca..e0243964c78ae 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -303,7 +303,8 @@ def _read(filepath_or_buffer, kwds): 'error_bad_lines': True, 'warn_bad_lines': True, 'dtype': None, - 'decimal': b'.' 
+ 'decimal': b'.', + 'float_precision': None } _fwf_defaults = { @@ -369,6 +370,7 @@ def parser_f(filepath_or_buffer, date_parser=None, memory_map=False, + float_precision=None, nrows=None, iterator=False, chunksize=None, @@ -437,6 +439,7 @@ def parser_f(filepath_or_buffer, encoding=encoding, squeeze=squeeze, memory_map=memory_map, + float_precision=float_precision, na_filter=na_filter, compact_ints=compact_ints, @@ -1264,6 +1267,11 @@ def TextParser(*args, **kwds): If True and `parse_dates` is True for a column, try to infer the datetime format based on the first datetime string. If the format can be inferred, there often will be a large parsing speed-up. + float_precision : string, default None + Specifies which converter the C engine should use for floating-point + values. The options are None for the ordinary converter, + 'high' for the high-precision converter, and 'round_trip' for the + round-trip converter. """ kwds['engine'] = 'python' return TextFileReader(*args, **kwds) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index f2b9a9447e8fb..a381e1802d29c 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2523,6 +2523,12 @@ def test_verbose_import(self): finally: sys.stdout = sys.__stdout__ + def test_float_precision_specified(self): + # Should raise an error if float_precision (C parser option) is specified + with tm.assertRaisesRegexp(ValueError, "The 'float_precision' option " + "is not supported with the 'python' engine"): + self.read_csv(StringIO('a,b,c\n1,2,3'), float_precision='high') + def test_iteration_open_handle(self): if PY3: raise nose.SkipTest("won't work in Python 3 {0}".format(sys.version_info)) @@ -3088,6 +3094,25 @@ def test_compact_ints(self): ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) self.assertEqual(result.dtype, ex_dtype) + def test_precise_conversion(self): + # GH #8002 + from decimal import Decimal + normal_errors = [] + precise_errors = [] + for num in 
np.linspace(1., 2., num=500): # test numbers between 1 and 2 + text = 'a\n{0:.25}'.format(num) # 25 decimal digits of precision + normal_val = float(self.read_csv(StringIO(text))['a'][0]) + precise_val = float(self.read_csv(StringIO(text), float_precision='high')['a'][0]) + roundtrip_val = float(self.read_csv(StringIO(text), float_precision='round_trip')['a'][0]) + actual_val = Decimal(text[2:]) + def error(val): + return abs(Decimal('{0:.100}'.format(val)) - actual_val) + normal_errors.append(error(normal_val)) + precise_errors.append(error(precise_val)) + self.assertEqual(roundtrip_val, float(text[2:])) # round-trip should match float() + self.assertTrue(sum(precise_errors) < sum(normal_errors)) + self.assertTrue(max(precise_errors) < max(normal_errors)) + def test_pass_dtype(self): data = """\ one,two diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 199d4ab44abfa..5905fada0cbfb 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -62,6 +62,9 @@ cdef extern from "headers/stdint.h": cdef extern from "headers/portable.h": pass +cdef extern from "errno.h": + int errno + try: basestring except NameError: @@ -155,6 +158,7 @@ cdef extern from "parser/tokenizer.h": void *skipset int skip_footer + double (*converter)(const char *, char **, char, char, char, int) # error handling char *warn_msg @@ -189,8 +193,13 @@ cdef extern from "parser/tokenizer.h": int64_t int_max, int *error, char tsep) uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error) - inline int to_double(char *item, double *p_value, - char sci, char decimal, char thousands) + double xstrtod(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing) + double precise_xstrtod(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing) + double round_trip(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing) + inline int to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal) inline int 
to_longlong(char *item, long long *p_value) @@ -315,7 +324,8 @@ cdef class TextReader: skip_footer=0, verbose=False, mangle_dupe_cols=True, - tupleize_cols=False): + tupleize_cols=False, + float_precision=None): self.parser = parser_new() self.parser.chunksize = tokenize_chunksize @@ -415,6 +425,11 @@ cdef class TextReader: self.verbose = verbose self.low_memory = low_memory + self.parser.converter = xstrtod + if float_precision == 'high': + self.parser.converter = precise_xstrtod + elif float_precision == 'round_trip': + self.parser.converter = round_trip # encoding if encoding is not None: @@ -1018,7 +1033,7 @@ cdef class TextReader: elif dtype[1] == 'f': result, na_count = _try_double(self.parser, i, start, end, - na_filter, na_hashset, na_flist) + na_filter, na_hashset, na_flist) if dtype[1:] != 'f8': result = result.astype(dtype) @@ -1415,12 +1430,14 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, size_t i, lines coliter_t it char *word + char *p_end double *data double NA = na_values[np.float64] ndarray result khiter_t k bint use_na_flist = len(na_flist) > 0 + global errno lines = line_end - line_start result = np.empty(lines, dtype=np.float64) data = result.data @@ -1436,8 +1453,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, na_count += 1 data[0] = NA else: - error = to_double(word, data, parser.sci, parser.decimal, parser.thousands) - if error != 1: + data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, + parser.thousands, 1) + if errno != 0 or p_end[0] or p_end == word: if strcasecmp(word, cinf) == 0: data[0] = INF elif strcasecmp(word, cneginf) == 0: @@ -1452,8 +1470,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, else: for i in range(lines): word = COLITER_NEXT(it) - error = to_double(word, data, parser.sci, parser.decimal, parser.thousands) - if error != 1: + data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, + parser.thousands, 
1) + if errno != 0 or p_end[0] or p_end == word: if strcasecmp(word, cinf) == 0: data[0] = INF elif strcasecmp(word, cneginf) == 0: diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index b30706f85894b..79d854dd07674 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1689,10 +1689,6 @@ void test_count_lines(char *fname) { -// forward declaration -static double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); - - P_INLINE void lowercase(char *p) { for ( ; *p; ++p) *p = tolower(*p); } @@ -1702,32 +1698,6 @@ P_INLINE void uppercase(char *p) { } -/* - * `item` must be the nul-terminated string that is to be - * converted to a double. - * - * To be successful, to_double() must use *all* the characters - * in `item`. E.g. "1.q25" will fail. Leading and trailing - * spaces are allowed. - * - * `sci` is the scientific notation exponent character, usually - * either 'E' or 'D'. Case is ignored. - * - * `decimal` is the decimal point character, usually either - * '.' or ','. 
- * - */ - -int to_double(char *item, double *p_value, char sci, char decimal, char tsep) -{ - char *p_end; - - *p_value = xstrtod(item, &p_end, decimal, sci, tsep, TRUE); - - return (errno == 0) && (!*p_end); -} - - int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal) { char *p_end; @@ -1917,7 +1887,7 @@ int main(int argc, char *argv[]) // * Add tsep argument for thousands separator // -static double xstrtod(const char *str, char **endptr, char decimal, +double xstrtod(const char *str, char **endptr, char decimal, char sci, char tsep, int skip_trailing) { double number; @@ -2048,6 +2018,171 @@ static double xstrtod(const char *str, char **endptr, char decimal, return number; } +double precise_xstrtod(const char *str, char **endptr, char decimal, + char sci, char tsep, int skip_trailing) +{ + double number; + int exponent; + int negative; + char *p = (char *) str; + int num_digits; + int num_decimals; + int max_digits = 17; + int n; + // Cache powers of 10 in memory + static double e[] = {1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, + 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, + 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, + 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, + 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, + 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60, + 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, 1e70, + 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, 1e80, + 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, 1e90, + 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, 1e100, + 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, 1e110, + 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, 1e120, + 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130, + 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140, + 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 
1e148, 1e149, 1e150, + 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, 1e160, + 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, 1e170, + 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, 1e180, + 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, 1e190, + 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, 1e200, + 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, 1e210, + 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220, + 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230, + 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240, + 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, 1e250, + 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, 1e260, + 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, 1e270, + 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, 1e280, + 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, 1e290, + 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, + 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; + errno = 0; + + // Skip leading whitespace + while (isspace(*p)) p++; + + // Handle optional sign + negative = 0; + switch (*p) + { + case '-': negative = 1; // Fall through to increment position + case '+': p++; + } + + number = 0.; + exponent = 0; + num_digits = 0; + num_decimals = 0; + + // Process string of digits + while (isdigit(*p)) + { + if (num_digits < max_digits) + { + number = number * 10. + (*p - '0'); + num_digits++; + } + else + ++exponent; + + p++; + p += (tsep != '\0' & *p == tsep); + } + + // Process decimal part + if (*p == decimal) + { + p++; + + while (num_digits < max_digits && isdigit(*p)) + { + number = number * 10. 
+ (*p - '0'); + p++; + num_digits++; + num_decimals++; + } + + if (num_digits >= max_digits) // consume extra decimal digits + while (isdigit(*p)) + ++p; + + exponent -= num_decimals; + } + + if (num_digits == 0) + { + errno = ERANGE; + return 0.0; + } + + // Correct for sign + if (negative) number = -number; + + // Process an exponent string + if (toupper(*p) == toupper(sci)) + { + // Handle optional sign + negative = 0; + switch (*++p) + { + case '-': negative = 1; // Fall through to increment pos + case '+': p++; + } + + // Process string of digits + n = 0; + while (isdigit(*p)) + { + n = n * 10 + (*p - '0'); + p++; + } + + if (negative) + exponent -= n; + else + exponent += n; + } + + if (exponent > 308) + { + errno = ERANGE; + return HUGE_VAL; + } + else if (exponent > 0) + number *= e[exponent]; + else if (exponent < -308) // subnormal + { + if (exponent < -616) // prevent invalid array access + number = 0.; + number /= e[-308 - exponent]; + number /= e[308]; + } + else + number /= e[-exponent]; + + if (number == HUGE_VAL || number == -HUGE_VAL) + errno = ERANGE; + + if (skip_trailing) { + // Skip trailing whitespace + while (isspace(*p)) p++; + } + + if (endptr) *endptr = p; + return number; +} + +double round_trip(const char *p, char **q, char decimal, char sci, + char tsep, int skip_trailing) +{ + return strtod(p, q); +} + /* float strtof(const char *str, char **endptr) { diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 6af63c07f1104..62e890f60e43d 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -202,6 +202,7 @@ typedef struct parser_t { void *skipset; int skip_footer; + double (*converter)(const char *, char **, char, char, char, int); // error handling char *warn_msg; @@ -257,7 +258,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep); uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error); -int P_INLINE to_double(char 
*item, double *p_value, char sci, char decimal, char tsep); +double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); +double precise_xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); +double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal); int P_INLINE to_longlong(char *item, long long *p_value); int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep); diff --git a/vb_suite/parser_vb.py b/vb_suite/parser_vb.py index 50d37f37708e7..96da3fac2de5e 100644 --- a/vb_suite/parser_vb.py +++ b/vb_suite/parser_vb.py @@ -79,3 +79,22 @@ cmd = "read_table(StringIO(data), sep=',', header=None, parse_dates=[1])" sdate = datetime(2012, 5, 7) read_table_multiple_date_baseline = Benchmark(cmd, setup, start_date=sdate) + +setup = common_setup + """ +from cStringIO import StringIO +data = '''\ +0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336 +0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285 +0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126 +0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394 +0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020 +''' +data = data * 200 +""" +cmd = "read_csv(StringIO(data), sep=',', header=None, float_precision=None)" +sdate = datetime(2014, 8, 20) +read_csv_default_converter = Benchmark(cmd, setup, start_date=sdate) +cmd = "read_csv(StringIO(data), sep=',', header=None, float_precision='high')" +read_csv_precise_converter = Benchmark(cmd, setup, start_date=sdate) +cmd = "read_csv(StringIO(data), sep=',', header=None, float_precision='round_trip')" +read_csv_roundtrip_converter = Benchmark(cmd, setup, 
start_date=sdate)