Skip to content

Commit 993fe55

Browse files
anthonydelagemax-sixty
authored and committed
Use general float format when writing to CSV buffer to prevent numerical overflow (#193)
* Write to CSV stream with general float format.
* Specify number of significant digits for float format.
* Change format to '%.15g' and add tests.
* Fix style errors.
* Define string as unicode in to_gbq's float test.
* Update Changelog.
1 parent 7652ac6 commit 993fe55

File tree

3 files changed

+25
-1
lines changed

3 files changed

+25
-1
lines changed

docs/source/changelog.rst

+9
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
Changelog
22
=========
33

4+
.. _changelog-0.5.1:
5+
6+
0.5.1 / (Unreleased)
7+
--------------------
8+
9+
- Use general float with 15 decimal digit precision when writing to local
10+
CSV buffer in ``to_gbq``. This prevents numerical overflow in certain
11+
edge cases. (:issue:`192`)
12+
413
.. _changelog-0.5.0:
514

615
0.5.0 / 2018-06-15

pandas_gbq/load.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def encode_chunk(dataframe):
1515
csv_buffer = six.StringIO()
1616
dataframe.to_csv(
1717
csv_buffer, index=False, header=False, encoding='utf-8',
18-
date_format='%Y-%m-%d %H:%M:%S.%f')
18+
float_format='%.15g', date_format='%Y-%m-%d %H:%M:%S.%f')
1919

2020
# Convert to a BytesIO buffer so that unicode text is properly handled.
2121
# See: https://github.com/pydata/pandas-gbq/issues/106

tests/unit/test_load.py

+15
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pandas
55

66
from pandas_gbq import load
7+
from io import StringIO
78

89

910
def test_encode_chunk_with_unicode():
@@ -20,6 +21,20 @@ def test_encode_chunk_with_unicode():
2021
assert u'信用卡' in csv_string
2122

2223

24+
def test_encode_chunk_with_floats():
25+
"""Test that floats in a dataframe are encoded with at most 15 significant
26+
figures.
27+
28+
See: https://github.com/pydata/pandas-gbq/issues/192
29+
"""
30+
input_csv = StringIO(u'01/01/17 23:00,1.05148,1.05153,1.05148,1.05153,4')
31+
df = pandas.read_csv(input_csv, header=None)
32+
csv_buffer = load.encode_chunk(df)
33+
csv_bytes = csv_buffer.read()
34+
csv_string = csv_bytes.decode('utf-8')
35+
assert '1.05153' in csv_string
36+
37+
2338
def test_encode_chunks_splits_dataframe():
2439
df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6))
2540
chunks = list(load.encode_chunks(df, chunksize=2))

0 commit comments

Comments
 (0)