ENH: Allow users to specify whether gbq should use standard SQL

parthea · jreback · commit 97de42abbea5 · 2016-08-03T06:30:00.000-04:00
closes pandas-dev#13615 Author: Anthonios Partheniou <apartheniou@electricalengineer.ca> Closes pandas-dev#13850 from parthea/gbq-enable-standard-sql-dialect and squashes the following commits: e1fbb07 [Anthonios Partheniou] Allow users to specify whether gbq should use standard SQL pandas-dev#13615
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -4482,6 +4482,13 @@ destination DataFrame as well as a preferred column order as follows:
 
    You can toggle the verbose output via the ``verbose`` flag which defaults to ``True``.
 
+.. note::
+
+    The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL
+    or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``. For more information
+    on BigQuery's standard SQL, see `BigQuery SQL Reference
+    <https://cloud.google.com/bigquery/sql-reference/>`__.
+
 .. _io.bigquery_writer:
 
 
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -301,6 +301,12 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci
 
 ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
 
+.. _whatsnew_0190.gbq:
+
+Google BigQuery Enhancements
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- The :func:`pandas.io.gbq.read_gbq` function has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs <io.bigquery_reader>` for more details (:issue:`13615`).
+
 .. _whatsnew_0190.enhancements.other:
 
 Other enhancements
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
@@ -145,13 +145,14 @@ class GbqConnector(object):
     scope = 'https://www.googleapis.com/auth/bigquery'
 
     def __init__(self, project_id, reauth=False, verbose=False,
-                 private_key=None):
+                 private_key=None, dialect='legacy'):
         _check_google_client_version()
         _test_google_api_imports()
         self.project_id = project_id
         self.reauth = reauth
         self.verbose = verbose
         self.private_key = private_key
+        self.dialect = dialect
         self.credentials = self.get_credentials()
         self.service = self.get_service()
 
@@ -334,7 +335,8 @@ def run_query(self, query):
         job_data = {
             'configuration': {
                 'query': {
-                    'query': query
+                    'query': query,
+                    'useLegacySql': self.dialect == 'legacy'
                     # 'allowLargeResults', 'createDisposition',
                     # 'preserveNulls', destinationTable, useQueryCache
                 }
@@ -563,7 +565,7 @@ def _parse_entry(field_value, field_type):
 
 
 def read_gbq(query, project_id=None, index_col=None, col_order=None,
-             reauth=False, verbose=True, private_key=None):
+             reauth=False, verbose=True, private_key=None, dialect='legacy'):
     """Load data from Google BigQuery.
 
     THIS IS AN EXPERIMENTAL LIBRARY
@@ -602,6 +604,17 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
         or string contents. This is useful for remote server
         authentication (eg. jupyter iPython notebook on remote host)
 
+        .. versionadded:: 0.18.1
+
+    dialect : {'legacy', 'standard'}, default 'legacy'
+        'legacy' : Use BigQuery's legacy SQL dialect.
+        'standard' : Use BigQuery's standard SQL (beta), which is
+        compliant with the SQL 2011 standard. For more information
+        see `BigQuery SQL Reference
+        <https://cloud.google.com/bigquery/sql-reference/>`__
+
+        .. versionadded:: 0.19.0
+
     Returns
     -------
     df: DataFrame
@@ -612,8 +625,12 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
     if not project_id:
         raise TypeError("Missing required parameter: project_id")
 
+    if dialect not in ('legacy', 'standard'):
+        raise ValueError("'{0}' is not valid for dialect".format(dialect))
+
     connector = GbqConnector(project_id, reauth=reauth, verbose=verbose,
-                             private_key=private_key)
+                             private_key=private_key,
+                             dialect=dialect)
     schema, pages = connector.run_query(query)
     dataframe_list = []
     while len(pages) > 0:
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
@@ -557,6 +557,50 @@ def test_zero_rows(self):
         expected_result = DataFrame(page_array, columns=['title', 'id'])
         self.assert_frame_equal(df, expected_result)
 
+    def test_legacy_sql(self):
+        legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10"
+
+        # Test that a legacy sql statement fails when
+        # setting dialect='standard'
+        with tm.assertRaises(gbq.GenericGBQException):
+            gbq.read_gbq(legacy_sql, project_id=PROJECT_ID,
+                         dialect='standard')
+
+        # Test that a legacy sql statement succeeds when
+        # setting dialect='legacy'
+        df = gbq.read_gbq(legacy_sql, project_id=PROJECT_ID,
+                          dialect='legacy')
+        self.assertEqual(len(df.drop_duplicates()), 10)
+
+    def test_standard_sql(self):
+        standard_sql = "SELECT DISTINCT id FROM " \
+                       "`publicdata.samples.wikipedia` LIMIT 10"
+
+        # Test that a standard sql statement fails when using
+        # the legacy SQL dialect (default value)
+        with tm.assertRaises(gbq.GenericGBQException):
+            gbq.read_gbq(standard_sql, project_id=PROJECT_ID)
+
+        # Test that a standard sql statement succeeds when
+        # setting dialect='standard'
+        df = gbq.read_gbq(standard_sql, project_id=PROJECT_ID,
+                          dialect='standard')
+        self.assertEqual(len(df.drop_duplicates()), 10)
+
+    def test_invalid_option_for_sql_dialect(self):
+        sql_statement = "SELECT DISTINCT id FROM " \
+                        "`publicdata.samples.wikipedia` LIMIT 10"
+
+        # Test that an invalid option for `dialect` raises ValueError
+        with tm.assertRaises(ValueError):
+            gbq.read_gbq(sql_statement, project_id=PROJECT_ID,
+                         dialect='invalid')
+
+        # Test that a correct option for dialect succeeds
+        # to make sure ValueError was due to invalid dialect
+        gbq.read_gbq(sql_statement, project_id=PROJECT_ID,
+                     dialect='standard')
+
 
 class TestToGBQIntegration(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015