Skip to content

Commit d4cd7ef

Browse files
Dr-IrvKevin D Smith
authored and
Kevin D Smith
committed
Change default of float_precision for read_csv and read_table to "high" (pandas-dev#36228)
1 parent 5a8aa44 commit d4cd7ef

File tree

4 files changed

+46
-8
lines changed

4 files changed

+46
-8
lines changed

doc/source/whatsnew/v1.2.0.rst

+13
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,19 @@ For example:
9696
buffer = io.BytesIO()
9797
data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")
9898
99+
.. _whatsnew_read_csv_table_precision_default:
100+
101+
Change in default floating precision for ``read_csv`` and ``read_table``
102+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
103+
104+
For the C parsing engine, the methods :meth:`read_csv` and :meth:`read_table` previously defaulted to a parser that
105+
could read floating point numbers slightly incorrectly with respect to the last bit in precision.
106+
The option ``float_precision="high"`` has always been available to avoid this issue.
107+
Beginning with this version, the default is now to use the more accurate parser by making
108+
``float_precision=None`` correspond to the high precision parser, and the new option
109+
``float_precision="legacy"`` to use the legacy parser. The change to using the higher precision
110+
parser by default should have no impact on performance. (:issue:`17154`)
111+
99112
.. _whatsnew_120.enhancements.other:
100113

101114
Other enhancements

pandas/_libs/parsers.pyx

+5-2
Original file line numberDiff line numberDiff line change
@@ -476,10 +476,13 @@ cdef class TextReader:
476476
if float_precision == "round_trip":
477477
# see gh-15140
478478
self.parser.double_converter = round_trip
479-
elif float_precision == "high":
479+
elif float_precision == "legacy":
480+
self.parser.double_converter = xstrtod
481+
elif float_precision == "high" or float_precision is None:
480482
self.parser.double_converter = precise_xstrtod
481483
else:
482-
self.parser.double_converter = xstrtod
484+
raise ValueError(f'Unrecognized float_precision option: '
485+
f'{float_precision}')
483486

484487
if isinstance(dtype, dict):
485488
dtype = {k: pandas_dtype(dtype[k])

pandas/io/parsers.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -338,9 +338,9 @@
338338
option can improve performance because there is no longer any I/O overhead.
339339
float_precision : str, optional
340340
Specifies which converter the C engine should use for floating-point
341-
values. The options are `None` for the ordinary converter,
342-
`high` for the high-precision converter, and `round_trip` for the
343-
round-trip converter.
341+
values. The options are `None` or `high` for the ordinary converter,
342+
`legacy` for the original lower precision pandas converter, and
343+
`round_trip` for the round-trip converter.
344344
345345
Returns
346346
-------
@@ -2284,6 +2284,7 @@ def TextParser(*args, **kwds):
22842284
values. The options are None for the ordinary converter,
22852285
'high' for the high-precision converter, and 'round_trip' for the
22862286
round-trip converter.
2287+
.. versionchanged:: 1.2
22872288
"""
22882289
kwds["engine"] = "python"
22892290
return TextFileReader(*args, **kwds)

pandas/tests/io/parser/test_c_parser_only.py

+24-3
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,9 @@ def test_precise_conversion(c_parser_only):
160160
# 25 decimal digits of precision
161161
text = f"a\n{num:.25}"
162162

163-
normal_val = float(parser.read_csv(StringIO(text))["a"][0])
163+
normal_val = float(
164+
parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
165+
)
164166
precise_val = float(
165167
parser.read_csv(StringIO(text), float_precision="high")["a"][0]
166168
)
@@ -608,7 +610,7 @@ def test_unix_style_breaks(c_parser_only):
608610
tm.assert_frame_equal(result, expected)
609611

610612

611-
@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
613+
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
612614
@pytest.mark.parametrize(
613615
"data,thousands,decimal",
614616
[
@@ -646,7 +648,7 @@ def test_1000_sep_with_decimal(
646648
tm.assert_frame_equal(result, expected)
647649

648650

649-
@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
651+
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
650652
@pytest.mark.parametrize(
651653
"value,expected",
652654
[
@@ -702,3 +704,22 @@ def test_1000_sep_decimal_float_precision(
702704
)
703705
val = df.iloc[0, 0]
704706
assert val == expected
707+
708+
709+
def test_float_precision_options(c_parser_only):
710+
# GH 17154, 36228
711+
parser = c_parser_only
712+
s = "foo\n243.164\n"
713+
df = parser.read_csv(StringIO(s))
714+
df2 = parser.read_csv(StringIO(s), float_precision="high")
715+
716+
tm.assert_frame_equal(df, df2)
717+
718+
df3 = parser.read_csv(StringIO(s), float_precision="legacy")
719+
720+
assert not df.iloc[0, 0] == df3.iloc[0, 0]
721+
722+
msg = "Unrecognized float_precision option: junk"
723+
724+
with pytest.raises(ValueError, match=msg):
725+
parser.read_csv(StringIO(s), float_precision="junk")

0 commit comments

Comments
 (0)