Skip to content

Commit 97de42a

Browse files
parthea authored and jreback committed
ENH: Allow users to specify whether gbq should use standard SQL
closes pandas-dev#13615 Author: Anthonios Partheniou <[email protected]> Closes pandas-dev#13850 from parthea/gbq-enable-standard-sql-dialect and squashes the following commits: e1fbb07 [Anthonios Partheniou] Allow users to specify whether gbq should use standard SQL pandas-dev#13615
1 parent 3db3365 commit 97de42a

File tree

4 files changed

+78
-4
lines changed

4 files changed

+78
-4
lines changed

doc/source/io.rst

+7
Original file line number | Diff line number | Diff line change
@@ -4482,6 +4482,13 @@ destination DataFrame as well as a preferred column order as follows:
44824482

44834483
You can toggle the verbose output via the ``verbose`` flag which defaults to ``True``.
44844484

4485+
.. note::
4486+
4487+
The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL
4488+
or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``. For more information
4489+
on BigQuery's standard SQL, see `BigQuery SQL Reference
4490+
<https://cloud.google.com/bigquery/sql-reference/>`__.
4491+
44854492
.. _io.bigquery_writer:
44864493

44874494

doc/source/whatsnew/v0.19.0.txt

+6
Original file line number | Diff line number | Diff line change
@@ -301,6 +301,12 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci
301301

302302
``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
303303

304+
.. _whatsnew_0190.gbq:
305+
306+
Google BigQuery Enhancements
307+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
308+
- The :func:`pandas.io.gbq.read_gbq` function has gained a ``dialect`` argument that lets users specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs <io.bigquery_reader>` for more details (:issue:`13615`).
309+
304310
.. _whatsnew_0190.enhancements.other:
305311

306312
Other enhancements

pandas/io/gbq.py

+21-4
Original file line number | Diff line number | Diff line change
@@ -145,13 +145,14 @@ class GbqConnector(object):
145145
scope = 'https://www.googleapis.com/auth/bigquery'
146146

147147
def __init__(self, project_id, reauth=False, verbose=False,
148-
private_key=None):
148+
private_key=None, dialect='legacy'):
149149
_check_google_client_version()
150150
_test_google_api_imports()
151151
self.project_id = project_id
152152
self.reauth = reauth
153153
self.verbose = verbose
154154
self.private_key = private_key
155+
self.dialect = dialect
155156
self.credentials = self.get_credentials()
156157
self.service = self.get_service()
157158

@@ -334,7 +335,8 @@ def run_query(self, query):
334335
job_data = {
335336
'configuration': {
336337
'query': {
337-
'query': query
338+
'query': query,
339+
'useLegacySql': self.dialect == 'legacy'
338340
# 'allowLargeResults', 'createDisposition',
339341
# 'preserveNulls', destinationTable, useQueryCache
340342
}
@@ -563,7 +565,7 @@ def _parse_entry(field_value, field_type):
563565

564566

565567
def read_gbq(query, project_id=None, index_col=None, col_order=None,
566-
reauth=False, verbose=True, private_key=None):
568+
reauth=False, verbose=True, private_key=None, dialect='legacy'):
567569
"""Load data from Google BigQuery.
568570
569571
THIS IS AN EXPERIMENTAL LIBRARY
@@ -602,6 +604,17 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
602604
or string contents. This is useful for remote server
603605
authentication (e.g. a Jupyter/IPython notebook on a remote host)
604606
607+
.. versionadded:: 0.18.1
608+
609+
dialect : {'legacy', 'standard'}, default 'legacy'
610+
'legacy' : Use BigQuery's legacy SQL dialect.
611+
'standard' : Use BigQuery's standard SQL (beta), which is
612+
compliant with the SQL 2011 standard. For more information
613+
see `BigQuery SQL Reference
614+
<https://cloud.google.com/bigquery/sql-reference/>`__.
615+
616+
.. versionadded:: 0.19.0
617+
605618
Returns
606619
-------
607620
df: DataFrame
@@ -612,8 +625,12 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
612625
if not project_id:
613626
raise TypeError("Missing required parameter: project_id")
614627

628+
if dialect not in ('legacy', 'standard'):
629+
raise ValueError("'{0}' is not valid for dialect".format(dialect))
630+
615631
connector = GbqConnector(project_id, reauth=reauth, verbose=verbose,
616-
private_key=private_key)
632+
private_key=private_key,
633+
dialect=dialect)
617634
schema, pages = connector.run_query(query)
618635
dataframe_list = []
619636
while len(pages) > 0:

pandas/io/tests/test_gbq.py

+44
Original file line number | Diff line number | Diff line change
@@ -557,6 +557,50 @@ def test_zero_rows(self):
557557
expected_result = DataFrame(page_array, columns=['title', 'id'])
558558
self.assert_frame_equal(df, expected_result)
559559

560+
def test_legacy_sql(self):
561+
legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10"
562+
563+
# Test that a legacy sql statement fails when
564+
# setting dialect='standard'
565+
with tm.assertRaises(gbq.GenericGBQException):
566+
gbq.read_gbq(legacy_sql, project_id=PROJECT_ID,
567+
dialect='standard')
568+
569+
# Test that a legacy sql statement succeeds when
570+
# setting dialect='legacy'
571+
df = gbq.read_gbq(legacy_sql, project_id=PROJECT_ID,
572+
dialect='legacy')
573+
self.assertEqual(len(df.drop_duplicates()), 10)
574+
575+
def test_standard_sql(self):
576+
standard_sql = "SELECT DISTINCT id FROM " \
577+
"`publicdata.samples.wikipedia` LIMIT 10"
578+
579+
# Test that a standard sql statement fails when using
580+
# the legacy SQL dialect (default value)
581+
with tm.assertRaises(gbq.GenericGBQException):
582+
gbq.read_gbq(standard_sql, project_id=PROJECT_ID)
583+
584+
# Test that a standard sql statement succeeds when
585+
# setting dialect='standard'
586+
df = gbq.read_gbq(standard_sql, project_id=PROJECT_ID,
587+
dialect='standard')
588+
self.assertEqual(len(df.drop_duplicates()), 10)
589+
590+
def test_invalid_option_for_sql_dialect(self):
591+
sql_statement = "SELECT DISTINCT id FROM " \
592+
"`publicdata.samples.wikipedia` LIMIT 10"
593+
594+
# Test that an invalid option for `dialect` raises ValueError
595+
with tm.assertRaises(ValueError):
596+
gbq.read_gbq(sql_statement, project_id=PROJECT_ID,
597+
dialect='invalid')
598+
599+
# Test that a correct option for dialect succeeds
600+
# to make sure ValueError was due to invalid dialect
601+
gbq.read_gbq(sql_statement, project_id=PROJECT_ID,
602+
dialect='standard')
603+
560604

561605
class TestToGBQIntegration(tm.TestCase):
562606
# Changes to BigQuery table schema may take up to 2 minutes as of May 2015

0 commit comments

Comments
 (0)