Skip to content

Commit 7706741

Browse files
tswast authored and jreback committed
ENH: Add parameter to download BigQuery results with the BigQuery Storage API (#26104)
Adds new `use_bqstorage_api` parameter to `read_gbq`. This can speed up downloads of large data frames.
1 parent c4bf97a commit 7706741

File tree

3 files changed

+82
-21
lines changed

3 files changed

+82
-21
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -367,6 +367,7 @@ I/O
367367
- Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`)
368368
- Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`)
369369
- Fixed bug in loading objects from S3 that contain ``#`` characters in the URL (:issue:`25945`)
370+
- Added ``use_bqstorage_api`` parameter to :func:`read_gbq` to speed up downloads of large data frames. This feature requires version 0.10.0 or later of the ``pandas-gbq`` library as well as the ``google-cloud-bigquery-storage`` and ``fastavro`` libraries. (:issue:`26104`)
370371

371372
Plotting
372373
^^^^^^^^

pandas/io/gbq.py

+31-13
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,5 @@
11
""" Google BigQuery support """
22

3-
import warnings
4-
53

64
def _try_import():
75
# since pandas is a dependency of pandas-gbq
@@ -26,7 +24,7 @@ def _try_import():
2624
def read_gbq(query, project_id=None, index_col=None, col_order=None,
2725
reauth=False, auth_local_webserver=False, dialect=None,
2826
location=None, configuration=None, credentials=None,
29-
private_key=None, verbose=None):
27+
use_bqstorage_api=None, private_key=None, verbose=None):
3028
"""
3129
Load data from Google BigQuery.
3230
@@ -103,6 +101,21 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
103101
*New in version 0.8.0 of pandas-gbq*.
104102
105103
.. versionadded:: 0.24.0
104+
use_bqstorage_api : bool, default False
105+
Use the `BigQuery Storage API
106+
<https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
107+
download query results quickly, but at an increased cost. To use this
108+
API, first `enable it in the Cloud Console
109+
<https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
110+
You must also have the `bigquery.readsessions.create
111+
<https://cloud.google.com/bigquery/docs/access-control#roles>`__
112+
permission on the project you are billing queries to.
113+
114+
This feature requires version 0.10.0 or later of the ``pandas-gbq``
115+
package. It also requires the ``google-cloud-bigquery-storage`` and
116+
``fastavro`` packages.
117+
118+
.. versionadded:: 0.25.0
106119
private_key : str, deprecated
107120
Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``
108121
parameter and
@@ -131,22 +144,27 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
131144
"""
132145
pandas_gbq = _try_import()
133146

134-
if dialect is None:
135-
dialect = "legacy"
136-
warnings.warn(
137-
'The default value for dialect is changing to "standard" in a '
138-
'future version of pandas-gbq. Pass in dialect="legacy" to '
139-
"disable this warning.",
140-
FutureWarning,
141-
stacklevel=2,
142-
)
147+
kwargs = {}
148+
149+
# START: new kwargs. Don't populate unless explicitly set.
150+
if use_bqstorage_api is not None:
151+
kwargs["use_bqstorage_api"] = use_bqstorage_api
152+
# END: new kwargs
153+
154+
# START: deprecated kwargs. Don't populate unless explicitly set.
155+
if verbose is not None:
156+
kwargs["verbose"] = verbose
157+
158+
if private_key is not None:
159+
kwargs["private_key"] = private_key
160+
# END: deprecated kwargs
143161

144162
return pandas_gbq.read_gbq(
145163
query, project_id=project_id, index_col=index_col,
146164
col_order=col_order, reauth=reauth,
147165
auth_local_webserver=auth_local_webserver, dialect=dialect,
148166
location=location, configuration=configuration,
149-
credentials=credentials, verbose=verbose, private_key=private_key)
167+
credentials=credentials, **kwargs)
150168

151169

152170
def to_gbq(dataframe, destination_table, project_id=None, chunksize=None,

pandas/tests/io/test_gbq.py

+50-8
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
88

99
import pandas as pd
1010
from pandas import DataFrame
11-
import pandas.util.testing as tm
1211

1312
api_exceptions = pytest.importorskip("google.api_core.exceptions")
1413
bigquery = pytest.importorskip("google.cloud.bigquery")
@@ -90,16 +89,59 @@ def make_mixed_dataframe_v2(test_size):
9089
index=range(test_size))
9190

9291

93-
def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
94-
# Default dialect is changing to standard SQL. See:
95-
# https://github.com/pydata/pandas-gbq/issues/195
92+
def test_read_gbq_with_deprecated_kwargs(monkeypatch):
93+
captured_kwargs = {}
9694

97-
def mock_read_gbq(*args, **kwargs):
95+
def mock_read_gbq(sql, **kwargs):
96+
captured_kwargs.update(kwargs)
9897
return DataFrame([[1.0]])
9998

100-
monkeypatch.setattr(pandas_gbq, 'read_gbq', mock_read_gbq)
101-
with tm.assert_produces_warning(FutureWarning):
102-
pd.read_gbq("SELECT 1")
99+
monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq)
100+
private_key = object()
101+
pd.read_gbq("SELECT 1", verbose=True, private_key=private_key)
102+
103+
assert captured_kwargs["verbose"]
104+
assert captured_kwargs["private_key"] is private_key
105+
106+
107+
def test_read_gbq_without_deprecated_kwargs(monkeypatch):
108+
captured_kwargs = {}
109+
110+
def mock_read_gbq(sql, **kwargs):
111+
captured_kwargs.update(kwargs)
112+
return DataFrame([[1.0]])
113+
114+
monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq)
115+
pd.read_gbq("SELECT 1")
116+
117+
assert "verbose" not in captured_kwargs
118+
assert "private_key" not in captured_kwargs
119+
120+
121+
def test_read_gbq_with_new_kwargs(monkeypatch):
122+
captured_kwargs = {}
123+
124+
def mock_read_gbq(sql, **kwargs):
125+
captured_kwargs.update(kwargs)
126+
return DataFrame([[1.0]])
127+
128+
monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq)
129+
pd.read_gbq("SELECT 1", use_bqstorage_api=True)
130+
131+
assert captured_kwargs["use_bqstorage_api"]
132+
133+
134+
def test_read_gbq_without_new_kwargs(monkeypatch):
135+
captured_kwargs = {}
136+
137+
def mock_read_gbq(sql, **kwargs):
138+
captured_kwargs.update(kwargs)
139+
return DataFrame([[1.0]])
140+
141+
monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq)
142+
pd.read_gbq("SELECT 1")
143+
144+
assert "use_bqstorage_api" not in captured_kwargs
103145

104146

105147
@pytest.mark.single

0 commit comments

Comments
 (0)