Skip to content

Commit 25810c6

Browse files
committed
Merge pull request #8044 from amras1/new-float-conversion
Added parameter float_precision to CSV parser
2 parents 61e8f5a + d6e2c75 commit 25810c6

File tree

8 files changed

+275
-42
lines changed

8 files changed

+275
-42
lines changed

doc/source/io.rst

+23-1
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,12 @@ They can take a number of arguments:
176176
- ``mangle_dupe_cols``: boolean, default True. If True, duplicate columns will be specified
177177
as 'X.0'...'X.N', rather than 'X'...'X'
178178
- ``tupleize_cols``: boolean, default False, if False, convert a list of tuples
179-
to a multi-index of columns, otherwise, leave the column index as a list of tuples
179+
to a multi-index of columns, otherwise, leave the column index as a list of
180+
tuples
181+
- ``float_precision`` : string, default None. Specifies which converter the C
182+
engine should use for floating-point values. The options are None for the
183+
ordinary converter, 'high' for the high-precision converter, and
184+
'round_trip' for the round-trip converter.
180185

181186
.. ipython:: python
182187
:suppress:
@@ -512,6 +517,23 @@ data columns:
512517
specify `index_col` as a column label rather than as an index on the resulting frame.
513518

514519

520+
Specifying method for floating-point conversion
521+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
522+
The parameter ``float_precision`` can be specified in order to use
523+
a specific floating-point converter during parsing with the C engine.
524+
The options are the ordinary converter, the high-precision converter, and
525+
the round-trip converter (which is guaranteed to round-trip values after
526+
writing to a file). For example:
527+
528+
.. ipython:: python
529+
530+
val = '0.3066101993807095471566981359501369297504425048828125'
531+
data = 'a,b,c\n1,2,{0}'.format(val)
532+
abs(pd.read_csv(StringIO(data), engine='c', float_precision=None)['c'][0] - float(val))
533+
abs(pd.read_csv(StringIO(data), engine='c', float_precision='high')['c'][0] - float(val))
534+
abs(pd.read_csv(StringIO(data), engine='c', float_precision='round_trip')['c'][0] - float(val))
535+
536+
515537
Date Parsing Functions
516538
~~~~~~~~~~~~~~~~~~~~~~
517539
Finally, the parser allows you to specify a custom ``date_parser`` function to

doc/source/v0.15.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,8 @@ Enhancements
631631
- Added support for ``c``, ``colormap`` and ``colorbar`` arguments for
632632
``DataFrame.plot`` with ``kind='scatter'`` (:issue:`7780`)
633633

634+
- ``read_csv`` now has a keyword parameter ``float_precision`` which specifies which floating-point
635+
converter the C engine should use during parsing, see :ref:`io` (:issue:`8002`, :issue:`8044`)
634636

635637
- ``PeriodIndex`` supports ``resolution`` in the same way as ``DatetimeIndex`` (:issue:`7708`)
636638
- ``pandas.tseries.holiday`` has added support for additional holidays and ways to observe holidays (:issue:`7070`)

pandas/io/parsers.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,8 @@ def _read(filepath_or_buffer, kwds):
303303
'error_bad_lines': True,
304304
'warn_bad_lines': True,
305305
'dtype': None,
306-
'decimal': b'.'
306+
'decimal': b'.',
307+
'float_precision': None
307308
}
308309

309310
_fwf_defaults = {
@@ -369,6 +370,7 @@ def parser_f(filepath_or_buffer,
369370
date_parser=None,
370371

371372
memory_map=False,
373+
float_precision=None,
372374
nrows=None,
373375
iterator=False,
374376
chunksize=None,
@@ -437,6 +439,7 @@ def parser_f(filepath_or_buffer,
437439
encoding=encoding,
438440
squeeze=squeeze,
439441
memory_map=memory_map,
442+
float_precision=float_precision,
440443

441444
na_filter=na_filter,
442445
compact_ints=compact_ints,
@@ -1264,6 +1267,11 @@ def TextParser(*args, **kwds):
12641267
If True and `parse_dates` is True for a column, try to infer the
12651268
datetime format based on the first datetime string. If the format
12661269
can be inferred, there often will be a large parsing speed-up.
1270+
float_precision : string, default None
1271+
Specifies which converter the C engine should use for floating-point
1272+
values. The options are None for the ordinary converter,
1273+
'high' for the high-precision converter, and 'round_trip' for the
1274+
round-trip converter.
12671275
"""
12681276
kwds['engine'] = 'python'
12691277
return TextFileReader(*args, **kwds)

pandas/io/tests/test_parsers.py

+25
Original file line numberDiff line numberDiff line change
@@ -2523,6 +2523,12 @@ def test_verbose_import(self):
25232523
finally:
25242524
sys.stdout = sys.__stdout__
25252525

2526+
def test_float_precision_specified(self):
2527+
# Should raise an error if float_precision (C parser option) is specified
2528+
with tm.assertRaisesRegexp(ValueError, "The 'float_precision' option "
2529+
"is not supported with the 'python' engine"):
2530+
self.read_csv(StringIO('a,b,c\n1,2,3'), float_precision='high')
2531+
25262532
def test_iteration_open_handle(self):
25272533
if PY3:
25282534
raise nose.SkipTest("won't work in Python 3 {0}".format(sys.version_info))
@@ -3088,6 +3094,25 @@ def test_compact_ints(self):
30883094
ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
30893095
self.assertEqual(result.dtype, ex_dtype)
30903096

3097+
def test_precise_conversion(self):
3098+
# GH #8002
3099+
from decimal import Decimal
3100+
normal_errors = []
3101+
precise_errors = []
3102+
for num in np.linspace(1., 2., num=500): # test numbers between 1 and 2
3103+
text = 'a\n{0:.25}'.format(num) # 25 decimal digits of precision
3104+
normal_val = float(self.read_csv(StringIO(text))['a'][0])
3105+
precise_val = float(self.read_csv(StringIO(text), float_precision='high')['a'][0])
3106+
roundtrip_val = float(self.read_csv(StringIO(text), float_precision='round_trip')['a'][0])
3107+
actual_val = Decimal(text[2:])
3108+
def error(val):
3109+
return abs(Decimal('{0:.100}'.format(val)) - actual_val)
3110+
normal_errors.append(error(normal_val))
3111+
precise_errors.append(error(precise_val))
3112+
self.assertEqual(roundtrip_val, float(text[2:])) # round-trip should match float()
3113+
self.assertTrue(sum(precise_errors) < sum(normal_errors))
3114+
self.assertTrue(max(precise_errors) < max(normal_errors))
3115+
30913116
def test_pass_dtype(self):
30923117
data = """\
30933118
one,two

pandas/parser.pyx

+27-8
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ cdef extern from "headers/stdint.h":
6262
cdef extern from "headers/portable.h":
6363
pass
6464

65+
cdef extern from "errno.h":
66+
int errno
67+
6568
try:
6669
basestring
6770
except NameError:
@@ -155,6 +158,7 @@ cdef extern from "parser/tokenizer.h":
155158

156159
void *skipset
157160
int skip_footer
161+
double (*converter)(const char *, char **, char, char, char, int)
158162

159163
# error handling
160164
char *warn_msg
@@ -189,8 +193,13 @@ cdef extern from "parser/tokenizer.h":
189193
int64_t int_max, int *error, char tsep)
190194
uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
191195

192-
inline int to_double(char *item, double *p_value,
193-
char sci, char decimal, char thousands)
196+
double xstrtod(const char *p, char **q, char decimal, char sci,
197+
char tsep, int skip_trailing)
198+
double precise_xstrtod(const char *p, char **q, char decimal, char sci,
199+
char tsep, int skip_trailing)
200+
double round_trip(const char *p, char **q, char decimal, char sci,
201+
char tsep, int skip_trailing)
202+
194203
inline int to_complex(char *item, double *p_real,
195204
double *p_imag, char sci, char decimal)
196205
inline int to_longlong(char *item, long long *p_value)
@@ -315,7 +324,8 @@ cdef class TextReader:
315324
skip_footer=0,
316325
verbose=False,
317326
mangle_dupe_cols=True,
318-
tupleize_cols=False):
327+
tupleize_cols=False,
328+
float_precision=None):
319329

320330
self.parser = parser_new()
321331
self.parser.chunksize = tokenize_chunksize
@@ -415,6 +425,11 @@ cdef class TextReader:
415425

416426
self.verbose = verbose
417427
self.low_memory = low_memory
428+
self.parser.converter = xstrtod
429+
if float_precision == 'high':
430+
self.parser.converter = precise_xstrtod
431+
elif float_precision == 'round_trip':
432+
self.parser.converter = round_trip
418433

419434
# encoding
420435
if encoding is not None:
@@ -1018,7 +1033,7 @@ cdef class TextReader:
10181033

10191034
elif dtype[1] == 'f':
10201035
result, na_count = _try_double(self.parser, i, start, end,
1021-
na_filter, na_hashset, na_flist)
1036+
na_filter, na_hashset, na_flist)
10221037

10231038
if dtype[1:] != 'f8':
10241039
result = result.astype(dtype)
@@ -1415,12 +1430,14 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
14151430
size_t i, lines
14161431
coliter_t it
14171432
char *word
1433+
char *p_end
14181434
double *data
14191435
double NA = na_values[np.float64]
14201436
ndarray result
14211437
khiter_t k
14221438
bint use_na_flist = len(na_flist) > 0
14231439

1440+
global errno
14241441
lines = line_end - line_start
14251442
result = np.empty(lines, dtype=np.float64)
14261443
data = <double *> result.data
@@ -1436,8 +1453,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
14361453
na_count += 1
14371454
data[0] = NA
14381455
else:
1439-
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
1440-
if error != 1:
1456+
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
1457+
parser.thousands, 1)
1458+
if errno != 0 or p_end[0] or p_end == word:
14411459
if strcasecmp(word, cinf) == 0:
14421460
data[0] = INF
14431461
elif strcasecmp(word, cneginf) == 0:
@@ -1452,8 +1470,9 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
14521470
else:
14531471
for i in range(lines):
14541472
word = COLITER_NEXT(it)
1455-
error = to_double(word, data, parser.sci, parser.decimal, parser.thousands)
1456-
if error != 1:
1473+
data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
1474+
parser.thousands, 1)
1475+
if errno != 0 or p_end[0] or p_end == word:
14571476
if strcasecmp(word, cinf) == 0:
14581477
data[0] = INF
14591478
elif strcasecmp(word, cneginf) == 0:

0 commit comments

Comments
 (0)