Skip to content

Commit 993fe55

Browse files
anthonydelagemax-sixty
authored and committed
Use general float format when writing to CSV buffer to prevent numerical overflow (#193)
* Write to CSV stream with general float format.
* Specify number of significant digits for float format.
* Change format to '%.15g' and add tests.
* Fix style errors.
* Define string as unicode in to_gbq's float test.
* Update Changelog.
1 parent 7652ac6 commit 993fe55

File tree

3 files changed

+25
-1
lines changed

3 files changed

+25
-1
lines changed

docs/source/changelog.rst

+9
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
Changelog
22
=========
33

4+
.. _changelog-0.5.1:
5+
6+
0.5.1 / (Unreleased)
7+
--------------------
8+
9+
- Use general float with 15 decimal digit precision when writing to local
10+
CSV buffer in ``to_gbq``. This prevents numerical overflow in certain
11+
edge cases. (:issue:`192`)
12+
413
.. _changelog-0.5.0:
514

615
0.5.0 / 2018-06-15

pandas_gbq/load.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def encode_chunk(dataframe):
1515
csv_buffer = six.StringIO()
1616
dataframe.to_csv(
1717
csv_buffer, index=False, header=False, encoding='utf-8',
18-
date_format='%Y-%m-%d %H:%M:%S.%f')
18+
float_format='%.15g', date_format='%Y-%m-%d %H:%M:%S.%f')
1919

2020
# Convert to a BytesIO buffer so that unicode text is properly handled.
2121
# See: https://github.com/pydata/pandas-gbq/issues/106

tests/unit/test_load.py

+15
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pandas
55

66
from pandas_gbq import load
7+
from io import StringIO
78

89

910
def test_encode_chunk_with_unicode():
@@ -20,6 +21,20 @@ def test_encode_chunk_with_unicode():
2021
assert u'信用卡' in csv_string
2122

2223

24+
def test_encode_chunk_with_floats():
25+
"""Test that floats in a dataframe are encoded with at most 15 significant
26+
figures.
27+
28+
See: https://github.com/pydata/pandas-gbq/issues/192
29+
"""
30+
input_csv = StringIO(u'01/01/17 23:00,1.05148,1.05153,1.05148,1.05153,4')
31+
df = pandas.read_csv(input_csv, header=None)
32+
csv_buffer = load.encode_chunk(df)
33+
csv_bytes = csv_buffer.read()
34+
csv_string = csv_bytes.decode('utf-8')
35+
assert '1.05153' in csv_string
36+
37+
2338
def test_encode_chunks_splits_dataframe():
2439
df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6))
2540
chunks = list(load.encode_chunks(df, chunksize=2))

0 commit comments

Comments
 (0)