Allow newlines in data passed to to_gbq()

cbandy · cbandy · commit 045cecfc3923 · 2018-10-24T12:15:32.000-05:00
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -1,6 +1,8 @@
 Changelog
 =========
 
+- Allow newlines in data passed to ``to_gbq``. (:issue:`180`)
+
 .. _changelog-0.7.0:
 
 0.7.0 / 2018-10-19
diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py
@@ -61,6 +61,7 @@ def load_chunks(
     job_config = bigquery.LoadJobConfig()
     job_config.write_disposition = "WRITE_APPEND"
     job_config.source_format = "CSV"
+    job_config.allow_quoted_newlines = True
 
     if schema is None:
         schema = pandas_gbq.schema.generate_bq_schema(dataframe)
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
@@ -1167,6 +1167,39 @@ def test_upload_mixed_float_and_int(self, project_id):
 
         assert len(result_df) == test_size
 
+    def test_upload_data_with_newlines(self, project_id):
+        """Upload a string with an embedded newline and read it back."""
+        test_id = "data_with_newlines"
+        test_size = 2
+        df = DataFrame({"s": ["abcd", "ef\ngh"]})
+
+        gbq.to_gbq(
+            df,
+            self.destination_table + test_id,
+            project_id=project_id,
+            private_key=self.credentials,
+        )
+
+        result_df = gbq.read_gbq(
+            "SELECT * FROM {0}".format(self.destination_table + test_id),
+            project_id=project_id,
+            private_key=self.credentials,
+            dialect="legacy",
+        )
+
+        assert len(result_df) == test_size
+
+        # Row count is still checked on Python 2; only the value comparison
+        # is skipped there.  The reason is positional because its keyword
+        # name differs across pytest versions (``msg`` vs ``reason``).
+        if sys.version_info.major < 3:
+            pytest.skip("Unicode comparison in Py2 not working")
+
+        result = result_df["s"].sort_values()
+        expected = df["s"].sort_values()
+
+        tm.assert_numpy_array_equal(expected.values, result.values)
+
     def test_upload_data_flexible_column_order(self, project_id):
         test_id = "13"
         test_size = 10
diff --git a/tests/unit/test_load.py b/tests/unit/test_load.py
@@ -37,6 +37,22 @@ def test_encode_chunk_with_floats():
     assert "1.05153" in csv_string
 
 
+def test_encode_chunk_with_newlines():
+    """Embedded newlines are kept inside quoted CSV fields.
+
+    See: https://github.com/pydata/pandas-gbq/issues/180
+    """
+    frame = pandas.DataFrame({"s": ["abcd", "ef\ngh", "ij\r\nkl"]})
+
+    encoded = load.encode_chunk(frame).read().decode("utf-8")
+
+    assert "abcd" in encoded
+    # A value holding a line break (LF or CRLF) has to be wrapped in double
+    # quotes so it stays a single CSV field.
+    assert '"ef\ngh"' in encoded
+    assert '"ij\r\nkl"' in encoded
+
+
 def test_encode_chunks_splits_dataframe():
     df = pandas.DataFrame(numpy.random.randn(6, 4), index=range(6))
     chunks = list(load.encode_chunks(df, chunksize=2))