Skip to content

ENH: improve bigquery connector to optionally allow use of gcloud credentials #8590

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3643,9 +3643,18 @@ You will then be authenticated to the specified BigQuery account
via Google's Oauth2 mechanism. In general, this is as simple as following the
prompts in a browser window which will be opened for you. Should the browser not
be available, or fail to launch, a code will be provided to complete the process
manually. Additional information on the authentication mechanism can be found
manually. Additional information on this authentication mechanism can be found
`here <https://developers.google.com/accounts/docs/OAuth2#clientside/>`__

Alternatively, you can use a headless authentication mechanism via the Google Cloud SDK. More
information on installing the SDK and authenticating is available `here <https://cloud.google.com/sdk/gcloud/>`__

Once you have set up your authentication credentials, you can use this approach by passing the ``gcloud_credentials`` parameter. It accepts either the boolean ``True`` (in which case the SDK's default credentials path is used) or a string file path to the credentials file:

.. code-block:: python

   data_frame = pd.read_gbq('SELECT * FROM test_dataset.test_table', project_id=projectid, gcloud_credentials=True)

You can define which column from BigQuery to use as an index in the
destination DataFrame as well as a preferred column order as follows:

Expand Down
12 changes: 6 additions & 6 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2573,13 +2573,13 @@ def is_hashable(arg):
>>> is_hashable(a)
False
"""
# don't consider anything not collections.Hashable, so as not to broaden
# the definition of hashable beyond that. For example, old-style classes
# are not collections.Hashable but they won't fail hash().
if not isinstance(arg, collections.Hashable):
return False
# unfortunately, we can't use isinstance(arg, collections.Hashable), which
# can be faster than calling hash, because numpy scalars on Python 3 fail
# this test

# reconsider this decision once this numpy bug is fixed:
# https://github.com/numpy/numpy/issues/5562

# narrow the definition of hashable if hash(arg) fails in practice
try:
hash(arg)
except TypeError:
Expand Down
62 changes: 49 additions & 13 deletions pandas/io/gbq.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from datetime import datetime
import os
import json
import logging
import sys
from time import sleep
import uuid
import traceback

import numpy as np

Expand Down Expand Up @@ -33,20 +35,22 @@
try:
from apiclient.discovery import build
from apiclient.http import MediaFileUpload
from apiclient.errors import HttpError
from apiclient.errors import HttpError

from oauth2client.client import OAuth2WebServerFlow
from oauth2client.client import AccessTokenRefreshError
from oauth2client.client import flow_from_clientsecrets
from oauth2client.client import Credentials
from oauth2client.file import Storage
from oauth2client.tools import run
_GOOGLE_API_CLIENT_INSTALLED=True
_GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution('google-api-python-client').version

if LooseVersion(_GOOGLE_API_CLIENT_VERSION) >= '1.2.0':
if LooseVersion(_GOOGLE_API_CLIENT_VERSION) >= '1.2':
_GOOGLE_API_CLIENT_VALID_VERSION = True

except ImportError:
traceback.format_exc()
_GOOGLE_API_CLIENT_INSTALLED = False


Expand All @@ -72,6 +76,13 @@
logger = logging.getLogger('pandas.io.gbq')
logger.setLevel(logging.ERROR)

class MissingOauthCredentials(PandasError, IOError):
    """
    Raised when a Google BigQuery authentication credentials
    file was required (``gcloud_credentials`` was supplied)
    but could not be found at the given path.
    """
    pass

class InvalidPageToken(PandasError, IOError):
"""
Raised when Google BigQuery fails to return,
Expand Down Expand Up @@ -119,20 +130,32 @@ class InvalidColumnOrder(PandasError, IOError):
pass

class GbqConnector:
def __init__(self, project_id, reauth=False):
self.project_id = project_id
self.reauth = reauth
self.credentials = self.get_credentials()
self.service = self.get_service(self.credentials)
def __init__(self, project_id, reauth=False, gcloud_credentials=None):
    # project_id : Google BigQuery project to run queries against.
    # reauth : force the OAuth2 flow to re-run even when stored
    #     credentials exist (useful with multiple accounts).
    # gcloud_credentials : dual-typed. True loads credentials from the
    #     gcloud SDK's default path; a str is an explicit path to a
    #     credentials file; None (default) uses the interactive
    #     OAuth2WebServerFlow.
    self.project_id = project_id
    self.reauth = reauth
    self.gcloud_credentials = gcloud_credentials
    # Authenticate eagerly and build the BigQuery service handle so a
    # connector instance is ready to run queries as soon as constructed.
    self.credentials = self.get_credentials()
    self.service = self.get_service(self.credentials)

def get_credentials(self):
flow = OAuth2WebServerFlow(client_id='495642085510-k0tmvj2m941jhre2nbqka17vqpjfddtd.apps.googleusercontent.com',
client_secret='kOc9wMptUtxkcIFbtZCcrEAc',
scope='https://www.googleapis.com/auth/bigquery',
redirect_uri='urn:ietf:wg:oauth:2.0:oob')

storage = Storage('bigquery_credentials.dat')
credentials = storage.get()
if self.gcloud_credentials is not None:
gcfp = self.gcloud_credentials # a bit of mangling since this is dual-typed, str or bool
if self.gcloud_credentials == True:
gcfp = '~/.config/gcloud/credentials'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is not safe on windows. I don't recall exactly what you need to do, but I think you have to use the HOME env variable, then use os.join to construct paths.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't the expanduser on line 150 take care of that? From https://docs.python.org/2/library/os.path.html#os.path.expanduser:

"On Unix and Windows, return the argument with an initial component of ~ or ~user replaced by that user‘s home directory."

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, yeah prob does. ok

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ichuang Can we have those imports moved to the top to conform with PEP8?

credfn = os.path.expanduser(gcfp)
if not os.path.exists(credfn):
raise MissingOauthCredentials("Required google cloud authentication credentials file {0} missing.".format(credfn))
gcloud_cred = json.loads(open(credfn).read())['data'][0]['credential']
credentials = Credentials.new_from_json(json.dumps(gcloud_cred))
return credentials
else:
storage = Storage('bigquery_credentials.dat')
credentials = storage.get()

if credentials is None or credentials.invalid or self.reauth:
credentials = run(flow, storage)
Expand Down Expand Up @@ -328,7 +351,8 @@ def _test_imports():
if not _HTTPLIB2_INSTALLED:
raise ImportError("pandas requires httplib2 for Google BigQuery support")

def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=False):
def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=False,
gcloud_credentials=None):
"""Load data from Google BigQuery.

THIS IS AN EXPERIMENTAL LIBRARY
Expand All @@ -353,6 +377,12 @@ def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=Fa
reauth : boolean (default False)
Force Google BigQuery to reauthenticate the user. This is useful
if multiple accounts are used.
gcloud_credentials : boolean or str (default None)
Use oauth2 credentials from gcloud auth login. This is useful
if pandas is being run in an ipython notebook, and the user
has pre-existing authentication tokens.
Set to True to use the default path, ~/.config/gcloud/credentials.
Else provide an explicit path to file to use for credentials.

Returns
-------
Expand All @@ -366,7 +396,7 @@ def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=Fa
if not project_id:
raise TypeError("Missing required parameter: project_id")

connector = GbqConnector(project_id, reauth = reauth)
connector = GbqConnector(project_id, reauth = reauth, gcloud_credentials = gcloud_credentials)
schema, pages = connector.run_query(query)
dataframe_list = []
while len(pages) > 0:
Expand Down Expand Up @@ -401,7 +431,7 @@ def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=Fa
return final_df

def to_gbq(dataframe, destination_table, project_id=None, chunksize=10000,
verbose=True, reauth=False):
verbose=True, reauth=False, gcloud_credentials=None):
"""Write a DataFrame to a Google BigQuery table.

THIS IS AN EXPERIMENTAL LIBRARY
Expand Down Expand Up @@ -430,6 +460,12 @@ def to_gbq(dataframe, destination_table, project_id=None, chunksize=10000,
reauth : boolean (default False)
Force Google BigQuery to reauthenticate the user. This is useful
if multiple accounts are used.
gcloud_credentials : boolean or str (default None)
Use oauth2 credentials from gcloud auth login. This is useful
if pandas is being run in an ipython notebook, and the user
has pre-existing authentication tokens.
Set to True to use the default path, ~/.config/gcloud/credentials.
Else provide an explicit path to file to use for credentials.

"""
_test_imports()
Expand All @@ -440,7 +476,7 @@ def to_gbq(dataframe, destination_table, project_id=None, chunksize=10000,
if not '.' in destination_table:
raise NotFoundException("Invalid Table Name. Should be of the form 'datasetId.tableId' ")

connector = GbqConnector(project_id, reauth = reauth)
connector = GbqConnector(project_id, reauth = reauth, gcloud_credentials = gcloud_credentials)
dataset_id, table_id = destination_table.rsplit('.',1)

connector.load_data(dataframe, dataset_id, table_id, chunksize, verbose)
Expand Down
62 changes: 62 additions & 0 deletions pandas/io/tests/data/gcloud_credentials
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
{
"data": [
{
"credential": {
"_class": "OAuth2Credentials",
"_module": "oauth2client.client",
"access_token": "ya29.xXx",
"client_id": "1112223456.apps.googleusercontent.com",
"client_secret": "aBc467",
"id_token": {
"at_hash": "OlAp__aV",
"aud": "1112223456.apps.googleusercontent.com",
"azp": "1112223456.apps.googleusercontent.com",
"cid": "1112223456.apps.googleusercontent.com",
"email": "[email protected]",
"email_verified": true,
"exp": 1414558238,
"iat": 1414554338,
"id": "113403125016275849302",
"iss": "accounts.google.com",
"sub": "113403125016229475663",
"token_hash": "OlAp__aV",
"verified_email": true
},
"invalid": @INVALID@,
"refresh_token": "1/asf87bbEGsb78",
"revoke_uri": "https://accounts.google.com/o/oauth2/revoke",
"token_expiry": "2014-10-29T04:50:38Z",
"token_response": {
"access_token": "ya29.bYsadfiU8542B5",
"expires_in": 3600,
"id_token": {
"at_hash": "OlAp__aV",
"aud": "11112233456.apps.googleusercontent.com",
"azp": "11112223456.apps.googleusercontent.com",
"cid": "11112223456.apps.googleusercontent.com",
"email": "[email protected]",
"email_verified": true,
"exp": 1414558238,
"iat": 1414554338,
"id": "11340312501621345098732",
"iss": "accounts.google.com",
"sub": "1134031250162435660892",
"token_hash": "OlAp__aV",
"verified_email": true
},
"refresh_token": "1/6v6asdf6NrR92",
"token_type": "Bearer"
},
"token_uri": "https://accounts.google.com/o/oauth2/token",
"user_agent": "Cloud SDK Command Line Tool"
},
"key": {
"account": "[email protected]",
"clientId": "11112223456.apps.googleusercontent.com",
"scope": "https://www.googleapis.com/auth/appengine.admin https://www.googleapis.com/auth/bigquery https://www.googleapis.com/auth/compute https://www.googleapis.com/auth/devstorage.full_control https://www.googleapis.com/auth/userinfo.email https://www.googleapis.com/auth/ndev.cloudman https://www.googleapis.com/auth/cloud-platform https://www.googleapis.com/auth/sqlservice.admin https://www.googleapis.com/auth/prediction https://www.googleapis.com/auth/projecthosting",
"type": "google-cloud-sdk"
}
}
],
"file_version": 1
}
44 changes: 44 additions & 0 deletions pandas/io/tests/test_gbq.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import sys
import platform
from time import sleep
from tempfile import NamedTemporaryFile

import numpy as np

Expand Down Expand Up @@ -36,6 +37,27 @@ def test_requirements():
raise nose.SkipTest(import_exception)

class TestGBQConnectorIntegration(tm.TestCase):

@classmethod
def setUpClass(cls):
    """Materialize temporary gcloud credential fixtures from the bundled template."""
    def _fixture(content):
        # Keep the handle open: closing a NamedTemporaryFile deletes it,
        # and the tests need the file on disk for their whole run.
        handle = NamedTemporaryFile()
        handle.write(content.encode('UTF-8'))
        handle.flush()
        return handle

    template_path = os.path.join(tm.get_data_path(), 'gcloud_credentials')
    with open(template_path, 'r') as fin:
        template = fin.read()

    # The template carries an @INVALID@ placeholder for the credential's
    # "invalid" field; stamp it per-fixture.
    cls.creds_file_valid = _fixture(template.replace('@INVALID@', '"false"'))
    cls.creds_file_invalid = _fixture(template.replace('@INVALID@', '"true"'))
    # Well-formed JSON that lacks the expected credential structure.
    cls.non_creds_file = _fixture('{"token": "50414e444153204556455259574845524521"}')

def setUp(self):
    # Skips the test (raises nose.SkipTest) when the Google API client
    # stack is missing or too old — see test_requirements above.
    test_requirements()

Expand Down Expand Up @@ -64,6 +86,28 @@ def test_should_be_able_to_get_results_from_query(self):
schema, pages = self.sut.run_query('SELECT 1')
self.assertTrue(pages is not None)

def test_should_raise_exception_with_invalid_gcloud_creds_path(self):
    # A nonexistent credentials path must surface MissingOauthCredentials
    # rather than falling back to the interactive OAuth2 flow.
    with tm.assertRaises(gbq.MissingOauthCredentials):
        gbq.GbqConnector(PROJECT_ID, gcloud_credentials='missing_file')

def test_should_fail_with_invalid_gcloud_credentials(self):
    # The invalid fixture marks the stored token's "invalid" field; the
    # connector should load the file verbatim, not repair or replace it.
    connector = gbq.GbqConnector(
        PROJECT_ID, gcloud_credentials=self.creds_file_invalid.name)
    self.assertEqual(connector.credentials.invalid, "true")

def test_should_be_able_to_get_valid_gcloud_credentials(self):
    # Loading the valid fixture should yield credentials whose "invalid"
    # field matches what the file declares.
    connector = gbq.GbqConnector(
        PROJECT_ID, gcloud_credentials=self.creds_file_valid.name)
    self.assertEqual(connector.credentials.invalid, "false")

def test_should_fail_if_gcloud_credentials_incorrectly_formatted(self):
    # The file exists and is valid JSON but lacks the expected
    # ['data'][0]['credential'] structure, so parsing raises KeyError.
    with tm.assertRaises(KeyError):
        gbq.GbqConnector(PROJECT_ID, gcloud_credentials=self.non_creds_file.name)

@classmethod
def tearDownClass(cls):
    # Closing each NamedTemporaryFile also removes it from disk.
    for handle in (cls.creds_file_valid,
                   cls.creds_file_invalid,
                   cls.non_creds_file):
        handle.close()

class TestReadGBQUnitTests(tm.TestCase):
def setUp(self):
test_requirements()
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ def __hash__(self):
raise TypeError("Not hashable")

hashable = (
1, 'a', tuple(), (1,), HashableClass(),
1, 3.14, np.float64(3.14), 'a', tuple(), (1,), HashableClass(),
)
not_hashable = (
[], UnhashableClass1(),
Expand All @@ -434,13 +434,10 @@ def __hash__(self):
)

for i in hashable:
assert isinstance(i, collections.Hashable)
assert com.is_hashable(i)
for i in not_hashable:
assert not isinstance(i, collections.Hashable)
assert not com.is_hashable(i)
for i in abc_hashable_not_really_hashable:
assert isinstance(i, collections.Hashable)
assert not com.is_hashable(i)

# numpy.array is no longer collections.Hashable as of
Expand All @@ -455,7 +452,7 @@ class OldStyleClass():
pass
c = OldStyleClass()
assert not isinstance(c, collections.Hashable)
assert not com.is_hashable(c)
assert com.is_hashable(c)
hash(c) # this will not raise


Expand Down