Skip to content

ENH: Add parameter to download BigQuery results with the BigQuery Storage API #26104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ I/O
- Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`)
- Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`)
- Fixed bug in loading objects from S3 that contain ``#`` characters in the URL (:issue:`25945`)
- Adds ``use_bqstorage_api`` parameter to :func:`read_gbq` to speed up downloads of large data frames. This feature requires version 0.10.0 of the ``pandas-gbq`` library as well as the ``google-cloud-bigquery-storage`` and ``fastavro`` libraries. (:issue:`26104`)

Plotting
^^^^^^^^
Expand Down
44 changes: 31 additions & 13 deletions pandas/io/gbq.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
""" Google BigQuery support """

import warnings


def _try_import():
# since pandas is a dependency of pandas-gbq
Expand All @@ -26,7 +24,7 @@ def _try_import():
def read_gbq(query, project_id=None, index_col=None, col_order=None,
reauth=False, auth_local_webserver=False, dialect=None,
location=None, configuration=None, credentials=None,
private_key=None, verbose=None):
use_bqstorage_api=None, private_key=None, verbose=None):
"""
Load data from Google BigQuery.

Expand Down Expand Up @@ -103,6 +101,21 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
*New in version 0.8.0 of pandas-gbq*.

.. versionadded:: 0.24.0
use_bqstorage_api : bool, default False
Use the `BigQuery Storage API
<https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
download query results quickly, but at an increased cost. To use this
API, first `enable it in the Cloud Console
<https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
You must also have the `bigquery.readsessions.create
<https://cloud.google.com/bigquery/docs/access-control#roles>`__
permission on the project you are billing queries to.

This feature requires version 0.10.0 or later of the ``pandas-gbq``
package. It also requires the ``google-cloud-bigquery-storage`` and
``fastavro`` packages.

.. versionadded:: 0.25.0
private_key : str, deprecated
Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
parameter and
Expand Down Expand Up @@ -131,22 +144,27 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
"""
pandas_gbq = _try_import()

if dialect is None:
dialect = "legacy"
warnings.warn(
'The default value for dialect is changing to "standard" in a '
'future version of pandas-gbq. Pass in dialect="legacy" to '
"disable this warning.",
FutureWarning,
stacklevel=2,
)
kwargs = {}

# START: new kwargs. Don't populate unless explicitly set.
if use_bqstorage_api is not None:
kwargs["use_bqstorage_api"] = use_bqstorage_api
# END: new kwargs

# START: deprecated kwargs. Don't populate unless explicitly set.
if verbose is not None:
kwargs["verbose"] = verbose

if private_key is not None:
kwargs["private_key"] = private_key
# END: deprecated kwargs

return pandas_gbq.read_gbq(
query, project_id=project_id, index_col=index_col,
col_order=col_order, reauth=reauth,
auth_local_webserver=auth_local_webserver, dialect=dialect,
location=location, configuration=configuration,
credentials=credentials, verbose=verbose, private_key=private_key)
credentials=credentials, **kwargs)


def to_gbq(dataframe, destination_table, project_id=None, chunksize=None,
Expand Down
58 changes: 50 additions & 8 deletions pandas/tests/io/test_gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

import pandas as pd
from pandas import DataFrame
import pandas.util.testing as tm

api_exceptions = pytest.importorskip("google.api_core.exceptions")
bigquery = pytest.importorskip("google.cloud.bigquery")
Expand Down Expand Up @@ -90,16 +89,59 @@ def make_mixed_dataframe_v2(test_size):
index=range(test_size))


def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
# Default dialect is changing to standard SQL. See:
# https://github.com/pydata/pandas-gbq/issues/195
def test_read_gbq_with_deprecated_kwargs(monkeypatch):
    """Deprecated ``verbose``/``private_key`` kwargs, when explicitly set,
    are forwarded unchanged to ``pandas_gbq.read_gbq``.
    """
    captured_kwargs = {}

    def mock_read_gbq(sql, **kwargs):
        # Record exactly what pandas forwards to pandas-gbq.
        captured_kwargs.update(kwargs)
        return DataFrame([[1.0]])

    monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq)
    private_key = object()
    pd.read_gbq("SELECT 1", verbose=True, private_key=private_key)

    assert captured_kwargs["verbose"]
    # Identity check: the same object must be passed through, not a copy.
    assert captured_kwargs["private_key"] is private_key


def test_read_gbq_without_deprecated_kwargs(monkeypatch):
    """Deprecated kwargs must NOT be forwarded when the caller omits them."""
    forwarded = {}

    def fake_read_gbq(sql, **kwargs):
        # Capture the kwargs that reach pandas-gbq.
        forwarded.update(kwargs)
        return DataFrame([[1.0]])

    monkeypatch.setattr("pandas_gbq.read_gbq", fake_read_gbq)
    pd.read_gbq("SELECT 1")

    # Neither deprecated keyword should appear in the forwarded kwargs.
    assert not {"verbose", "private_key"} & forwarded.keys()


def test_read_gbq_with_new_kwargs(monkeypatch):
    """``use_bqstorage_api``, when explicitly set, reaches pandas-gbq."""
    forwarded = {}

    def fake_read_gbq(sql, **kwargs):
        # Capture the kwargs that reach pandas-gbq.
        forwarded.update(kwargs)
        return DataFrame([[1.0]])

    monkeypatch.setattr("pandas_gbq.read_gbq", fake_read_gbq)
    pd.read_gbq("SELECT 1", use_bqstorage_api=True)

    assert forwarded.get("use_bqstorage_api")


def test_read_gbq_without_new_kwargs(monkeypatch):
    """``use_bqstorage_api`` must NOT be forwarded unless the caller sets it."""
    forwarded = {}

    def fake_read_gbq(sql, **kwargs):
        # Capture the kwargs that reach pandas-gbq.
        forwarded.update(kwargs)
        return DataFrame([[1.0]])

    monkeypatch.setattr("pandas_gbq.read_gbq", fake_read_gbq)
    pd.read_gbq("SELECT 1")

    assert "use_bqstorage_api" not in forwarded


@pytest.mark.single
Expand Down