diff --git a/doc/source/io.rst b/doc/source/io.rst
index f8fe6fc8a4c3a..ed43b8d731260 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -3643,9 +3643,18 @@ You will then be authenticated to the specified BigQuery account
 via Google's Oauth2 mechanism. In general, this is as simple as following the
 prompts in a browser window which will be opened for you. Should the browser not
 be available, or fail to launch, a code will be provided to complete the process
-manually. Additional information on the authentication mechanism can be found
+manually. Additional information on this authentication mechanism can be found
 `here `__

+Alternatively, you can use a headless authentication mechanism via the Google Cloud SDK. More
+information on installing the SDK and authenticating is available `here `__
+
+Once you have your authentication credentials set up, you can use this approach by passing the ``gcloud_credentials`` parameter. It accepts either a boolean ``True`` (in which case the SDK's default credentials path is used) or a string filepath to the credentials file:
+
+.. code-block:: python
+
+    data_frame = pd.read_gbq('SELECT * FROM test_dataset.test_table', project_id = projectid, gcloud_credentials = True)
+
 You can define which column from BigQuery to use as an index in the
 destination DataFrame as well as a preferred column order as follows:

diff --git a/pandas/core/common.py b/pandas/core/common.py
index 937dc421e3926..7ab88edd77d4b 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -2573,13 +2573,13 @@ def is_hashable(arg):
     >>> is_hashable(a)
     False
     """
-    # don't consider anything not collections.Hashable, so as not to broaden
-    # the definition of hashable beyond that. For example, old-style classes
-    # are not collections.Hashable but they won't fail hash().
-    if not isinstance(arg, collections.Hashable):
-        return False
+    # unfortunately, we can't use isinstance(arg, collections.Hashable), which
+    # can be faster than calling hash, because numpy scalars on Python 3 fail
+    # this test
+
+    # reconsider this decision once this numpy bug is fixed:
+    # https://github.com/numpy/numpy/issues/5562

-    # narrow the definition of hashable if hash(arg) fails in practice
     try:
         hash(arg)
     except TypeError:
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
index 572a8be5c65e8..f921c027513e2 100644
--- a/pandas/io/gbq.py
+++ b/pandas/io/gbq.py
@@ -1,9 +1,11 @@
 from datetime import datetime
+import os
 import json
 import logging
 import sys
 from time import sleep
 import uuid
+import traceback

 import numpy as np

@@ -33,20 +35,22 @@
 try:
     from apiclient.discovery import build
     from apiclient.http import MediaFileUpload
-    from apiclient.errors import HttpError
+    from apiclient.errors import HttpError
     from oauth2client.client import OAuth2WebServerFlow
     from oauth2client.client import AccessTokenRefreshError
     from oauth2client.client import flow_from_clientsecrets
+    from oauth2client.client import Credentials
     from oauth2client.file import Storage
     from oauth2client.tools import run

     _GOOGLE_API_CLIENT_INSTALLED=True
     _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution('google-api-python-client').version

-    if LooseVersion(_GOOGLE_API_CLIENT_VERSION) >= '1.2.0':
+    if LooseVersion(_GOOGLE_API_CLIENT_VERSION) >= '1.2':
         _GOOGLE_API_CLIENT_VALID_VERSION = True

 except ImportError:
+    traceback.format_exc()
     _GOOGLE_API_CLIENT_INSTALLED = False


@@ -72,6 +76,13 @@
 logger = logging.getLogger('pandas.io.gbq')
 logger.setLevel(logging.ERROR)

+class MissingOauthCredentials(PandasError, IOError):
+    """
+    Raised when the Google BigQuery authentication credentials
+    file is needed but missing.
+    """
+    pass
+
 class InvalidPageToken(PandasError, IOError):
     """
     Raised when Google BigQuery fails to return,
@@ -119,11 +130,12 @@ class InvalidColumnOrder(PandasError, IOError):
     pass

 class GbqConnector:
-    def __init__(self, project_id, reauth=False):
-        self.project_id = project_id
-        self.reauth = reauth
-        self.credentials = self.get_credentials()
-        self.service = self.get_service(self.credentials)
+    def __init__(self, project_id, reauth=False, gcloud_credentials=None):
+        self.project_id = project_id
+        self.reauth = reauth
+        self.gcloud_credentials = gcloud_credentials
+        self.credentials = self.get_credentials()
+        self.service = self.get_service(self.credentials)

     def get_credentials(self):
         flow = OAuth2WebServerFlow(client_id='495642085510-k0tmvj2m941jhre2nbqka17vqpjfddtd.apps.googleusercontent.com',
@@ -131,8 +143,19 @@ def get_credentials(self):
                                    scope='https://www.googleapis.com/auth/bigquery',
                                    redirect_uri='urn:ietf:wg:oauth:2.0:oob')

-        storage = Storage('bigquery_credentials.dat')
-        credentials = storage.get()
+        if self.gcloud_credentials is not None:
+            gcfp = self.gcloud_credentials  # dual-typed: True for the default path, or a str filepath
+            if self.gcloud_credentials == True:
+                gcfp = '~/.config/gcloud/credentials'
+            credfn = os.path.expanduser(gcfp)
+            if not os.path.exists(credfn):
+                raise MissingOauthCredentials("Required google cloud authentication credentials file {0} missing.".format(credfn))
+            gcloud_cred = json.loads(open(credfn).read())['data'][0]['credential']
+            credentials = Credentials.new_from_json(json.dumps(gcloud_cred))
+            return credentials
+        else:
+            storage = Storage('bigquery_credentials.dat')
+            credentials = storage.get()

         if credentials is None or credentials.invalid or self.reauth:
             credentials = run(flow, storage)
@@ -328,7 +351,8 @@ def _test_imports():
     if not _HTTPLIB2_INSTALLED:
         raise ImportError("pandas requires httplib2 for Google BigQuery support")

-def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=False):
+def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=False,
+             gcloud_credentials=None):
     """Load data from Google BigQuery.

     THIS IS AN EXPERIMENTAL LIBRARY
@@ -353,6 +377,12 @@ def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=Fa
     reauth : boolean (default False)
         Force Google BigQuery to reauthenticate the user. This is useful
         if multiple accounts are used.
+    gcloud_credentials : boolean or str (default None)
+        Use OAuth2 credentials from ``gcloud auth login``. This is useful
+        if pandas is being run in an IPython notebook and the user has
+        pre-existing authentication tokens.
+        Set to True to use the default path, ~/.config/gcloud/credentials,
+        or provide an explicit path to the credentials file.

     Returns
     -------
@@ -366,7 +396,7 @@ def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=Fa
     if not project_id:
         raise TypeError("Missing required parameter: project_id")

-    connector = GbqConnector(project_id, reauth = reauth)
+    connector = GbqConnector(project_id, reauth = reauth, gcloud_credentials = gcloud_credentials)
     schema, pages = connector.run_query(query)
     dataframe_list = []
     while len(pages) > 0:
@@ -401,7 +431,7 @@ def read_gbq(query, project_id = None, index_col=None, col_order=None, reauth=Fa
     return final_df

 def to_gbq(dataframe, destination_table, project_id=None, chunksize=10000,
-           verbose=True, reauth=False):
+           verbose=True, reauth=False, gcloud_credentials=None):
     """Write a DataFrame to a Google BigQuery table.

     THIS IS AN EXPERIMENTAL LIBRARY
@@ -430,6 +460,12 @@ def to_gbq(dataframe, destination_table, project_id=None, chunksize=10000,
     reauth : boolean (default False)
         Force Google BigQuery to reauthenticate the user. This is useful
         if multiple accounts are used.
+    gcloud_credentials : boolean or str (default None)
+        Use OAuth2 credentials from ``gcloud auth login``. This is useful
+        if pandas is being run in an IPython notebook and the user has
+        pre-existing authentication tokens.
+        Set to True to use the default path, ~/.config/gcloud/credentials,
+        or provide an explicit path to the credentials file.
     """

     _test_imports()
@@ -440,7 +476,7 @@ def to_gbq(dataframe, destination_table, project_id=None, chunksize=10000,
     if not '.' in destination_table:
         raise NotFoundException("Invalid Table Name. Should be of the form 'datasetId.tableId' ")

-    connector = GbqConnector(project_id, reauth = reauth)
+    connector = GbqConnector(project_id, reauth = reauth, gcloud_credentials = gcloud_credentials)
     dataset_id, table_id = destination_table.rsplit('.',1)

     connector.load_data(dataframe, dataset_id, table_id, chunksize, verbose)
diff --git a/pandas/io/tests/data/gcloud_credentials b/pandas/io/tests/data/gcloud_credentials
new file mode 100644
index 0000000000000..d0bd0381048a8
--- /dev/null
+++ b/pandas/io/tests/data/gcloud_credentials
@@ -0,0 +1,62 @@
+{
+  "data": [
+    {
+      "credential": {
+        "_class": "OAuth2Credentials",
+        "_module": "oauth2client.client",
+        "access_token": "ya29.xXx",
+        "client_id": "1112223456.apps.googleusercontent.com",
+        "client_secret": "aBc467",
+        "id_token": {
+          "at_hash": "OlAp__aV",
+          "aud": "1112223456.apps.googleusercontent.com",
+          "azp": "1112223456.apps.googleusercontent.com",
+          "cid": "1112223456.apps.googleusercontent.com",
+          "email": "luv-python-pandas@somemail.com",
+          "email_verified": true,
+          "exp": 1414558238,
+          "iat": 1414554338,
+          "id": "113403125016275849302",
+          "iss": "accounts.google.com",
+          "sub": "113403125016229475663",
+          "token_hash": "OlAp__aV",
+          "verified_email": true
+        },
+        "invalid": @INVALID@,
+        "refresh_token": "1/asf87bbEGsb78",
+        "revoke_uri": "https://accounts.google.com/o/oauth2/revoke",
+        "token_expiry": "2014-10-29T04:50:38Z",
+        "token_response": {
+          "access_token": "ya29.bYsadfiU8542B5",
+          "expires_in": 3600,
+          "id_token": {
+            "at_hash": "OlAp__aV",
+            "aud": "11112233456.apps.googleusercontent.com",
+            "azp": "11112223456.apps.googleusercontent.com",
+            "cid": "11112223456.apps.googleusercontent.com",
+            "email": "luv-python-pandas@somemail.com",
+            "email_verified": true,
+            "exp": 1414558238,
+            "iat": 1414554338,
+            "id": "11340312501621345098732",
+            "iss": "accounts.google.com",
+            "sub": "1134031250162435660892",
+            "token_hash": "OlAp__aV",
+            "verified_email": true
+          },
+          "refresh_token": "1/6v6asdf6NrR92",
+          "token_type": "Bearer"
+        },
+        "token_uri": "https://accounts.google.com/o/oauth2/token",
+        "user_agent": "Cloud SDK Command Line Tool"
+      },
+      "key": {
+        "account": "luv-python-pandas@somemail.com",
+        "clientId": "11112223456.apps.googleusercontent.com",
+        "scope": "https://www.googleapis.com/auth/appengine.admin https://www.googleapis.com/auth/bigquery https://www.googleapis.com/auth/compute https://www.googleapis.com/auth/devstorage.full_control https://www.googleapis.com/auth/userinfo.email https://www.googleapis.com/auth/ndev.cloudman https://www.googleapis.com/auth/cloud-platform https://www.googleapis.com/auth/sqlservice.admin https://www.googleapis.com/auth/prediction https://www.googleapis.com/auth/projecthosting",
+        "type": "google-cloud-sdk"
+      }
+    }
+  ],
+  "file_version": 1
+}
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
index 2f79cc8ba1826..97e4d76a2eb20 100644
--- a/pandas/io/tests/test_gbq.py
+++ b/pandas/io/tests/test_gbq.py
@@ -9,6 +9,7 @@
 import sys
 import platform
 from time import sleep
+from tempfile import NamedTemporaryFile

 import numpy as np

@@ -36,6 +37,27 @@ def test_requirements():
         raise nose.SkipTest(import_exception)

 class TestGBQConnectorIntegration(tm.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        with open(os.path.join(tm.get_data_path(), 'gcloud_credentials'), 'r') as fin:
+            creds_json = fin.read()
+
+        creds_json_invalid = creds_json.replace('@INVALID@', '"true"')
+        creds_json_valid = creds_json.replace('@INVALID@', '"false"')
+
+        cls.creds_file_valid = NamedTemporaryFile()
+        cls.creds_file_valid.write(creds_json_valid.encode('UTF-8'))
+        cls.creds_file_valid.flush()
+
+        cls.creds_file_invalid = NamedTemporaryFile()
+        cls.creds_file_invalid.write(creds_json_invalid.encode('UTF-8'))
+        cls.creds_file_invalid.flush()
+
+        cls.non_creds_file = NamedTemporaryFile()
+        cls.non_creds_file.write('{"token": "50414e444153204556455259574845524521"}'.encode('UTF-8'))
+        cls.non_creds_file.flush()
+
     def setUp(self):
         test_requirements()

@@ -64,6 +86,28 @@ def test_should_be_able_to_get_results_from_query(self):
         schema, pages = self.sut.run_query('SELECT 1')
         self.assertTrue(pages is not None)

+    def test_should_raise_exception_with_invalid_gcloud_creds_path(self):
+        with tm.assertRaises(gbq.MissingOauthCredentials):
+            gbq.GbqConnector(PROJECT_ID, gcloud_credentials='missing_file')
+
+    def test_should_fail_with_invalid_gcloud_credentials(self):
+        credentials = gbq.GbqConnector(PROJECT_ID, gcloud_credentials=self.creds_file_invalid.name).credentials
+        self.assertEqual(credentials.invalid, "true")
+
+    def test_should_be_able_to_get_valid_gcloud_credentials(self):
+        credentials = gbq.GbqConnector(PROJECT_ID, gcloud_credentials=self.creds_file_valid.name).credentials
+        self.assertEqual(credentials.invalid, "false")
+
+    def test_should_fail_if_gcloud_credentials_incorrectly_formatted(self):
+        with tm.assertRaises(KeyError):
+            gbq.GbqConnector(PROJECT_ID, gcloud_credentials=self.non_creds_file.name)
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.creds_file_valid.close()
+        cls.creds_file_invalid.close()
+        cls.non_creds_file.close()
+
 class TestReadGBQUnitTests(tm.TestCase):
     def setUp(self):
         test_requirements()
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index d8ce98350627d..d0ae7c9988c8d 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -424,7 +424,7 @@ def __hash__(self):
             raise TypeError("Not hashable")

     hashable = (
-        1, 'a', tuple(), (1,), HashableClass(),
+        1, 3.14, np.float64(3.14), 'a', tuple(), (1,), HashableClass(),
     )
     not_hashable = (
         [], UnhashableClass1(),
@@ -434,13 +434,10 @@ def __hash__(self):
     )

     for i in hashable:
-        assert isinstance(i, collections.Hashable)
         assert com.is_hashable(i)
     for i in not_hashable:
-        assert not isinstance(i, collections.Hashable)
         assert not com.is_hashable(i)
     for i in abc_hashable_not_really_hashable:
-        assert isinstance(i, collections.Hashable)
         assert not com.is_hashable(i)

     # numpy.array is no longer collections.Hashable as of
@@ -455,7 +452,7 @@ class OldStyleClass():
         pass
     c = OldStyleClass()
     assert not isinstance(c, collections.Hashable)
-    assert not com.is_hashable(c)
+    assert com.is_hashable(c)
     hash(c)  # this will not raise