
Allow users to specify whether gbq should use standard SQL #13850


Closed
7 changes: 7 additions & 0 deletions doc/source/io.rst
@@ -4482,6 +4482,13 @@ destination DataFrame as well as a preferred column order as follows:

You can toggle the verbose output via the ``verbose`` flag which defaults to ``True``.

.. note::

Contributor:
oh I meant make this take a string

maybe standard (or legacy)

Contributor Author:

Done

   The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL
   or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``. For more information
   on BigQuery's standard SQL, see `BigQuery SQL Reference
   <https://cloud.google.com/bigquery/sql-reference/>`__
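As an illustrative aside (not part of this diff), a minimal sketch of calling ``read_gbq`` with the new argument; the project ID below is a placeholder:

from pandas.io import gbq

# dialect='standard' opts in to BigQuery's standard SQL (beta); note the
# backtick-quoted table name, which standard SQL uses in place of the
# legacy [dataset.table] form. 'my-project-id' is a placeholder.
df = gbq.read_gbq("SELECT id FROM `publicdata.samples.wikipedia` LIMIT 10",
                  project_id='my-project-id', dialect='standard')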

.. _io.bigquery_writer:


6 changes: 6 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
@@ -301,6 +301,12 @@ For ``MultiIndex``, values are dropped if any level is missing by default. Speci

``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)

.. _whatsnew_0190.gbq:

Google BigQuery Enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- The :func:`pandas.io.gbq.read_gbq` function has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs <io.bigquery_reader>` for more details (:issue:`13615`).

.. _whatsnew_0190.enhancements.other:

Other enhancements
20 changes: 16 additions & 4 deletions pandas/io/gbq.py
Expand Up @@ -145,13 +145,14 @@ class GbqConnector(object):
     scope = 'https://www.googleapis.com/auth/bigquery'

     def __init__(self, project_id, reauth=False, verbose=False,
-                 private_key=None):
+                 private_key=None, dialect='legacy'):
         _check_google_client_version()
         _test_google_api_imports()
         self.project_id = project_id
         self.reauth = reauth
         self.verbose = verbose
         self.private_key = private_key
+        self.dialect = dialect
         self.credentials = self.get_credentials()
         self.service = self.get_service()

@@ -334,7 +335,8 @@ def run_query(self, query):
         job_data = {
             'configuration': {
                 'query': {
-                    'query': query
+                    'query': query,
+                    'useLegacySql': self.dialect == 'legacy'
                     # 'allowLargeResults', 'createDisposition',
                     # 'preserveNulls', destinationTable, useQueryCache
                 }
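For context (and not part of this diff): the ``dialect`` string is translated into BigQuery's boolean ``useLegacySql`` job-configuration flag. A tiny sketch of that mapping, using a hypothetical helper name:

# Hypothetical helper mirroring the expression above: 'legacy' keeps
# BigQuery's default legacy SQL, while 'standard' turns the flag off so
# the query runs under standard SQL (beta).
def _uses_legacy_sql(dialect):
    return dialect == 'legacy'

assert _uses_legacy_sql('legacy') is True
assert _uses_legacy_sql('standard') is False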
@@ -563,7 +565,7 @@ def _parse_entry(field_value, field_type):


 def read_gbq(query, project_id=None, index_col=None, col_order=None,
-             reauth=False, verbose=True, private_key=None):
+             reauth=False, verbose=True, private_key=None, dialect='legacy'):
     """Load data from Google BigQuery.

     THIS IS AN EXPERIMENTAL LIBRARY
@@ -601,6 +603,12 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
         Service account private key in JSON format. Can be file path
         or string contents. This is useful for remote server
         authentication (eg. jupyter iPython notebook on remote host)
+    dialect : {'legacy', 'standard'}, default 'legacy'
+        'legacy' : Use BigQuery's legacy SQL dialect.
+        'standard' : Use BigQuery's standard SQL (beta), which is
+        compliant with the SQL 2011 standard. For more information
+        see `BigQuery SQL Reference
+        <https://cloud.google.com/bigquery/sql-reference/>`__

     Returns
     -------
@@ -612,8 +620,12 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
     if not project_id:
         raise TypeError("Missing required parameter: project_id")

+    if dialect not in ('legacy', 'standard'):
Contributor:

did you add a test that exercises this exception?

Contributor Author:

Done

+        raise ValueError("'{0}' is not valid for dialect".format(dialect))
+
     connector = GbqConnector(project_id, reauth=reauth, verbose=verbose,
-                             private_key=private_key)
+                             private_key=private_key,
+                             dialect=dialect)
     schema, pages = connector.run_query(query)
     dataframe_list = []
     while len(pages) > 0:
44 changes: 44 additions & 0 deletions pandas/io/tests/test_gbq.py
@@ -557,6 +557,50 @@ def test_zero_rows(self):
         expected_result = DataFrame(page_array, columns=['title', 'id'])
         self.assert_frame_equal(df, expected_result)

+    def test_legacy_sql(self):
+        legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10"
+
+        # Test that a legacy sql statement fails when
+        # setting dialect='standard'
+        with tm.assertRaises(gbq.GenericGBQException):
+            gbq.read_gbq(legacy_sql, project_id=PROJECT_ID,
+                         dialect='standard')
+
+        # Test that a legacy sql statement succeeds when
+        # setting dialect='legacy'
+        df = gbq.read_gbq(legacy_sql, project_id=PROJECT_ID,
+                          dialect='legacy')
+        self.assertEqual(len(df.drop_duplicates()), 10)
+
+    def test_standard_sql(self):
+        standard_sql = "SELECT DISTINCT id FROM " \
+                       "`publicdata.samples.wikipedia` LIMIT 10"
+
+        # Test that a standard sql statement fails when using
+        # the legacy SQL dialect (default value)
+        with tm.assertRaises(gbq.GenericGBQException):
+            gbq.read_gbq(standard_sql, project_id=PROJECT_ID)
+
+        # Test that a standard sql statement succeeds when
+        # setting dialect='standard'
+        df = gbq.read_gbq(standard_sql, project_id=PROJECT_ID,
+                          dialect='standard')
+        self.assertEqual(len(df.drop_duplicates()), 10)
+
+    def test_invalid_option_for_sql_dialect(self):
+        sql_statement = "SELECT DISTINCT id FROM " \
+                        "`publicdata.samples.wikipedia` LIMIT 10"
+
+        # Test that an invalid option for `dialect` raises ValueError
+        with tm.assertRaises(ValueError):
+            gbq.read_gbq(sql_statement, project_id=PROJECT_ID,
+                         dialect='invalid')
+
+        # Test that a correct option for dialect succeeds
+        # to make sure ValueError was due to invalid dialect
+        gbq.read_gbq(sql_statement, project_id=PROJECT_ID,
+                     dialect='standard')
+

 class TestToGBQIntegration(tm.TestCase):
     # Changes to BigQuery table schema may take up to 2 minutes as of May 2015