From 83b66c2708af326655d676f451515d2f71c567e5 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 7 Apr 2017 22:07:02 -0400 Subject: [PATCH 01/42] Add new from_gbq() method and tweak requirements --- pandas_gbq/gbq.py | 128 ++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 + 2 files changed, 130 insertions(+) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 9473b082..b2159d02 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -6,12 +6,14 @@ import time import sys import os +import uuid import numpy as np from distutils.version import StrictVersion from pandas import compat, DataFrame, concat from pandas.compat import lzip, bytes_to_str +from google.cloud import bigquery def _check_google_client_version(): @@ -1014,6 +1016,132 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, return final_df +def from_gbq(query, project_id=None, index_col=None, col_order=None, + private_key=None, dialect='legacy', configuration = None, **kwargs): + r"""Load data from Google BigQuery using google-cloud-python + + The main method a user calls to execute a Query in Google BigQuery + and read results into a pandas DataFrame. + + The Google Cloud library is used. + Documentation is available `here + `__ + + Authentication via Google Cloud can be performed a number of ways, see: + + The easiest is to download a service account JSON keyfile and point to it using + an environment variable: + `$ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"` + + Parameters + ---------- + query : str + SQL-Like Query to return data values + project_id : str (optional) + Google BigQuery Account project ID. + index_col : str (optional) + Name of result column to use for index in results DataFrame + col_order : list(str) (optional) + List of BigQuery column names in the desired order for results + DataFrame + private_key : str (optional) + Path to service account private key in JSON format. If none is provided, + will default to the GOOGLE_APPLICATION_CREDENTIALS environment variable + or another form of authentication (see above) + dialect : {'legacy', 'standard'}, default 'legacy' + 'legacy' : Use BigQuery's legacy SQL dialect. + 'standard' : Use BigQuery's standard SQL (beta), which is + compliant with the SQL 2011 standard. For more information + see `BigQuery SQL Reference + `__ + configuration : dict (optional) + Because of current limitations (https://github.com/GoogleCloudPlatform/google-cloud-python/issues/2765) + only a certain number of configuration settings are currently implemented. You can set them with + like: `from_gbq(q,configuration={'allow_large_results':True,'use_legacy_sql':False})` + Allowable settings: + -allow_large_results + -create_disposition + -default_dataset + -destination + -flatten_results + -priority + -use_query_cache + -use_legacy_sql + -dry_run + -write_disposition + -maximum_billing_tier + -maximum_bytes_billed + + Returns + ------- + df: DataFrame + DataFrame representing results of query + + """ + + if private_key: + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = private_key + + def _wait_for_job(job): + while True: + job.reload() # Refreshes the state via a GET request. 
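+            # A failed job still reaches the DONE state; the error_result
+            # check below is what surfaces query errors to the caller, and
+            # the one-second sleep keeps the polling loop inexpensive.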
+ if job.state == 'DONE': + if job.error_result: + raise RuntimeError(job.errors) + return + time.sleep(1) + + client = bigquery.Client(project=project_id) + query_job = client.run_async_query(str(uuid.uuid4()), query) + + if dialect != 'legacy': + query_job.use_legacy_sql = False + + if configuration: + for setting, value in configuration.items(): + setattr(query_job, setting, value) + + query_job.begin() + _wait_for_job(query_job) + + query_results = query_job.results() + + rows, total_rows, page_token = query_results.fetch_data() + columns=[field.name for field in query_results.schema] + data = rows + + final_df = DataFrame(data=data,columns=columns) + + # Reindex the DataFrame on the provided column + if index_col is not None: + if index_col in final_df.columns: + final_df.set_index(index_col, inplace=True) + else: + raise InvalidIndexColumn( + 'Index column "{0}" does not exist in DataFrame.' + .format(index_col) + ) + + # Change the order of columns in the DataFrame based on provided list + if col_order is not None: + if sorted(col_order) == sorted(final_df.columns): + final_df = final_df[col_order] + else: + raise InvalidColumnOrder( + 'Column order does not match this DataFrame.' + ) + + # cast BOOLEAN and INTEGER columns from object to bool/int + # if they dont have any nulls + type_map = {'BOOLEAN': bool, 'INTEGER': int} + for field in query_results.schema: + if field.field_type in type_map and \ + final_df[field.name].notnull().all(): + final_df[field.name] = \ + final_df[field.name].astype(type_map[field.field_type]) + + return final_df + def to_gbq(dataframe, destination_table, project_id, chunksize=10000, verbose=True, reauth=False, if_exists='fail', private_key=None, diff --git a/requirements.txt b/requirements.txt index c72b5a5a..d9022078 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,5 @@ google-api-python-client google-auth google-auth-httplib2 google-auth-oauthlib +google-cloud +uuid From 438ff590704446d07348c8c6735383bebdcf5813 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Sat, 8 Apr 2017 00:53:49 -0400 Subject: [PATCH 02/42] Remove unecessary type conversion Tweaking docstring --- pandas_gbq/gbq.py | 56 +++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index b2159d02..c90303fc 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -1017,7 +1017,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, return final_df def from_gbq(query, project_id=None, index_col=None, col_order=None, - private_key=None, dialect='legacy', configuration = None, **kwargs): + private_key=None, dialect='legacy', configuration=None, **kwargs): r"""Load data from Google BigQuery using google-cloud-python The main method a user calls to execute a Query in Google BigQuery @@ -1056,21 +1056,14 @@ def from_gbq(query, project_id=None, index_col=None, col_order=None, `__ configuration : dict (optional) Because of current limitations (https://github.com/GoogleCloudPlatform/google-cloud-python/issues/2765) - only a certain number of configuration settings are currently implemented. 
You can set them with - like: `from_gbq(q,configuration={'allow_large_results':True,'use_legacy_sql':False})` - Allowable settings: - -allow_large_results - -create_disposition - -default_dataset - -destination - -flatten_results - -priority - -use_query_cache - -use_legacy_sql - -dry_run - -write_disposition - -maximum_billing_tier - -maximum_bytes_billed + only some configuration settings are currently implemented. You can pass them + along like in the following: + `from_gbq(q,configuration={'allow_large_results':True,'maximum_billing_tier':2})` + + Example allowable settings: + allow_large_results, create_disposition, default_dataset, destination + flatten_results, priority, use_query_cache, use_legacy_sql, dry_run, + write_disposition, udf_resources, maximum_billing_tier, maximum_bytes_billed Returns ------- @@ -1107,23 +1100,13 @@ def _wait_for_job(job): query_results = query_job.results() rows, total_rows, page_token = query_results.fetch_data() - columns=[field.name for field in query_results.schema] + columns = [field.name for field in query_results.schema] data = rows final_df = DataFrame(data=data,columns=columns) - # Reindex the DataFrame on the provided column - if index_col is not None: - if index_col in final_df.columns: - final_df.set_index(index_col, inplace=True) - else: - raise InvalidIndexColumn( - 'Index column "{0}" does not exist in DataFrame.' - .format(index_col) - ) - # Change the order of columns in the DataFrame based on provided list - if col_order is not None: + if col_order: if sorted(col_order) == sorted(final_df.columns): final_df = final_df[col_order] else: @@ -1131,14 +1114,15 @@ def _wait_for_job(job): 'Column order does not match this DataFrame.' ) - # cast BOOLEAN and INTEGER columns from object to bool/int - # if they dont have any nulls - type_map = {'BOOLEAN': bool, 'INTEGER': int} - for field in query_results.schema: - if field.field_type in type_map and \ - final_df[field.name].notnull().all(): - final_df[field.name] = \ - final_df[field.name].astype(type_map[field.field_type]) + # Reindex the DataFrame on the provided column + if index_col: + if index_col in final_df.columns: + final_df.set_index(index_col, inplace=True) + else: + raise InvalidIndexColumn( + 'Index column "{0}" does not exist in DataFrame.' 
+ .format(index_col) + ) return final_df From 03a9548e53765c0cb00c687cc04b9ab2ce4b8ec0 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Sun, 9 Apr 2017 09:59:08 -0400 Subject: [PATCH 03/42] Remove old read_gbq, rename from_gbq to read_gbq, and add verbose query job info Remove locale import Remove leftover >>>>>HEAD --- pandas_gbq/gbq.py | 58 +++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index c90303fc..9b3b55e9 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -455,28 +455,6 @@ def _print(self, msg, end='\n'): sys.stdout.write(msg + end) sys.stdout.flush() - def _start_timer(self): - self.start = time.time() - - def get_elapsed_seconds(self): - return round(time.time() - self.start, 2) - - def print_elapsed_seconds(self, prefix='Elapsed', postfix='s.', - overlong=7): - sec = self.get_elapsed_seconds() - if sec > overlong: - self._print('{} {} {}'.format(prefix, sec, postfix)) - - # http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size - @staticmethod - def sizeof_fmt(num, suffix='B'): - fmt = "%3.1f %s%s" - for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: - if abs(num) < 1024.0: - return fmt % (num, unit, suffix) - num /= 1024.0 - return fmt % (num, 'Y', suffix) - def get_service(self): import httplib2 from google_auth_httplib2 import AuthorizedHttp @@ -833,7 +811,6 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema): table.create(table_id, table_schema) sleep(delay) - def _get_credentials_file(): return os.environ.get( 'PANDAS_GBQ_CREDENTIALS_FILE') @@ -1016,7 +993,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, return final_df -def from_gbq(query, project_id=None, index_col=None, col_order=None, +def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose=True, private_key=None, dialect='legacy', configuration=None, **kwargs): r"""Load data from Google BigQuery using google-cloud-python @@ -1044,6 +1021,8 @@ def from_gbq(query, project_id=None, index_col=None, col_order=None, col_order : list(str) (optional) List of BigQuery column names in the desired order for results DataFrame + verbose : boolean (default True) + Verbose output private_key : str (optional) Path to service account private key in JSON format. 
If none is provided, will default to the GOOGLE_APPLICATION_CREDENTIALS environment variable @@ -1072,6 +1051,15 @@ def from_gbq(query, project_id=None, index_col=None, col_order=None, """ + # http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size + def sizeof_fmt(num, suffix='B'): + fmt = "%3.1f %s%s" + for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: + if abs(num) < 1024.0: + return fmt % (num, unit, suffix) + num /= 1024.0 + return fmt % (num, 'Y', suffix) + if private_key: os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = private_key @@ -1095,11 +1083,31 @@ def _wait_for_job(job): setattr(query_job, setting, value) query_job.begin() - _wait_for_job(query_job) + if verbose: + print("Query running...") + _wait_for_job(query_job) + if verbose: + print("Query done.") + if query_job._properties["statistics"]["query"].get("cacheHit", False): + print("Cache hit.") + elif "statistics" in query_job._properties and "query" in query_job._properties["statistics"]: + bytes_billed = int(query_job._properties["statistics"]["query"].get("totalBytesProcessed", 0)) + bytes_processed = int(query_job._properties["statistics"]["query"].get("totalBytesBilled", 0)) + print("Total bytes billed (processed): %s (%s)" % (sizeof_fmt(bytes_billed),sizeof_fmt(bytes_processed))) query_results = query_job.results() + if verbose: + print("\nRetrieving results...") + rows, total_rows, page_token = query_results.fetch_data() + + if verbose: + print("Got %s rows.") % total_rows + print("\nTotal time taken %s s" % (datetime.utcnow()-query_job.created.replace(tzinfo=None)).seconds) + print("Finished at %s." % datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + + columns = [field.name for field in query_results.schema] data = rows From ae7c0e98ad16ef28e03919879fc870c1a2ca2ad6 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Tue, 3 Oct 2017 18:50:35 -0400 Subject: [PATCH 04/42] Handle new iterator return type from fetch_data --- pandas_gbq/gbq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 9b3b55e9..35fe5ce1 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -1099,8 +1099,8 @@ def _wait_for_job(job): if verbose: print("\nRetrieving results...") - - rows, total_rows, page_token = query_results.fetch_data() + rows = list(query_results.fetch_data()) + total_rows = len(rows) if verbose: print("Got %s rows.") % total_rows From d2d01bce737ebb7cec42078a9339af13b81f13da Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Wed, 4 Oct 2017 13:34:39 -0400 Subject: [PATCH 05/42] Remove old read_gbq --- pandas_gbq/gbq.py | 142 +--------------------------------------------- 1 file changed, 1 insertion(+), 141 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 35fe5ce1..96d7125b 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -851,147 +851,7 @@ def _parse_entry(field_value, field_type): elif field_type == 'BOOLEAN': return field_value == 'true' return field_value - - -def read_gbq(query, project_id=None, index_col=None, col_order=None, - reauth=False, verbose=True, private_key=None, - auth_local_webserver=False, dialect='legacy', **kwargs): - r"""Load data from Google BigQuery. - - The main method a user calls to execute a Query in Google BigQuery - and read results into a pandas DataFrame. - - Google BigQuery API Client Library v2 for Python is used. - Documentation is available `here - `__ - - Authentication to the Google BigQuery service is via OAuth 2.0. 
- - - If "private_key" is not provided: - - By default "application default credentials" are used. - - If default application credentials are not found or are restrictive, - user account credentials are used. In this case, you will be asked to - grant permissions for product name 'pandas GBQ'. - - - If "private_key" is provided: - - Service account credentials will be used to authenticate. - - Parameters - ---------- - query : str - SQL-Like Query to return data values - project_id : str - Google BigQuery Account project ID. - index_col : str (optional) - Name of result column to use for index in results DataFrame - col_order : list(str) (optional) - List of BigQuery column names in the desired order for results - DataFrame - reauth : boolean (default False) - Force Google BigQuery to reauthenticate the user. This is useful - if multiple accounts are used. - verbose : boolean (default True) - Verbose output - private_key : str (optional) - Service account private key in JSON format. Can be file path - or string contents. This is useful for remote server - authentication (eg. jupyter iPython notebook on remote host) - auth_local_webserver : boolean, default False - Use the [local webserver flow] instead of the [console flow] when - getting user credentials. A file named bigquery_credentials.dat will - be created in current dir. You can also set PANDAS_GBQ_CREDENTIALS_FILE - environment variable so as to define a specific path to store this - credential (eg. /etc/keys/bigquery.dat). - - .. [local webserver flow] - http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server - .. [console flow] - http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console - .. versionadded:: 0.2.0 - - dialect : {'legacy', 'standard'}, default 'legacy' - 'legacy' : Use BigQuery's legacy SQL dialect. - 'standard' : Use BigQuery's standard SQL (beta), which is - compliant with the SQL 2011 standard. For more information - see `BigQuery SQL Reference - `__ - - **kwargs : Arbitrary keyword arguments - configuration (dict): query config parameters for job processing. - For example: - - configuration = {'query': {'useQueryCache': False}} - - For more information see `BigQuery SQL Reference - `__ - - Returns - ------- - df: DataFrame - DataFrame representing results of query - - """ - - _test_google_api_imports() - - if not project_id: - raise TypeError("Missing required parameter: project_id") - - if dialect not in ('legacy', 'standard'): - raise ValueError("'{0}' is not valid for dialect".format(dialect)) - - connector = GbqConnector( - project_id, reauth=reauth, verbose=verbose, private_key=private_key, - dialect=dialect, auth_local_webserver=auth_local_webserver) - schema, pages = connector.run_query(query, **kwargs) - dataframe_list = [] - while len(pages) > 0: - page = pages.pop() - dataframe_list.append(_parse_data(schema, page)) - - if len(dataframe_list) > 0: - final_df = concat(dataframe_list, ignore_index=True) - else: - final_df = _parse_data(schema, []) - - # Reindex the DataFrame on the provided column - if index_col is not None: - if index_col in final_df.columns: - final_df.set_index(index_col, inplace=True) - else: - raise InvalidIndexColumn( - 'Index column "{0}" does not exist in DataFrame.' 
- .format(index_col) - ) - - # Change the order of columns in the DataFrame based on provided list - if col_order is not None: - if sorted(col_order) == sorted(final_df.columns): - final_df = final_df[col_order] - else: - raise InvalidColumnOrder( - 'Column order does not match this DataFrame.' - ) - - # cast BOOLEAN and INTEGER columns from object to bool/int - # if they dont have any nulls - type_map = {'BOOLEAN': bool, 'INTEGER': int} - for field in schema['fields']: - if field['type'] in type_map and \ - final_df[field['name']].notnull().all(): - final_df[field['name']] = \ - final_df[field['name']].astype(type_map[field['type']]) - - connector.print_elapsed_seconds( - 'Total time taken', - datetime.now().strftime('s.\nFinished at %Y-%m-%d %H:%M:%S.'), - 0 - ) - - return final_df + def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose=True, private_key=None, dialect='legacy', configuration=None, **kwargs): From 97410dd96da30b9432a690ac949798349b83ff48 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Wed, 4 Oct 2017 13:53:03 -0400 Subject: [PATCH 06/42] Change requirement to google-cloud-bigquery and pin to 0.25.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d9022078..9d2969bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,5 @@ google-api-python-client google-auth google-auth-httplib2 google-auth-oauthlib -google-cloud +google-cloud-bigquery==0.25.0 uuid From a0847f8e17d7af74396520196558d2dc6247da3a Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Wed, 4 Oct 2017 15:52:10 -0400 Subject: [PATCH 07/42] Pass any private key to client for auth --- pandas_gbq/gbq.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 96d7125b..e7fd788d 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -920,9 +920,6 @@ def sizeof_fmt(num, suffix='B'): num /= 1024.0 return fmt % (num, 'Y', suffix) - if private_key: - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = private_key - def _wait_for_job(job): while True: job.reload() # Refreshes the state via a GET request. 
@@ -931,8 +928,10 @@ def _wait_for_job(job): raise RuntimeError(job.errors) return time.sleep(1) - - client = bigquery.Client(project=project_id) + if private_key: + client = bigquery.Client(project=project_id).from_service_account_json(private_key) + else: + client = bigquery.Client(project=project_id) query_job = client.run_async_query(str(uuid.uuid4()), query) if dialect != 'legacy': From 12bacd8aba0fdc914d4f01dd16a903a5d22cbd4e Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Thu, 5 Oct 2017 13:53:50 -0400 Subject: [PATCH 08/42] Pin google-cloud-bigquery to 0.25.0 in setup.py --- pandas_gbq/gbq.py | 6 ++++-- setup.py | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index e7fd788d..32c4b244 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -866,8 +866,10 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose=Tru Authentication via Google Cloud can be performed a number of ways, see: - The easiest is to download a service account JSON keyfile and point to it using - an environment variable: + The easiest is to download a service account JSON keyfile or generate user credentials via + `gcloud auth application-default login` + + and point to it using an environment variable: `$ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"` Parameters diff --git a/setup.py b/setup.py index df3cd85d..48c924bc 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ def readme(): 'google-auth>=1.0.0', 'google-auth-httplib2>=0.0.1', 'google-auth-oauthlib>=0.0.1', + 'google-cloud-bigquery==0.25.0', ] From f50bcca0fa7c042925de2e9b169bfc7da8b004d4 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Thu, 5 Oct 2017 15:02:37 -0400 Subject: [PATCH 09/42] Catch renaming of results to result in 0.26.0 --- pandas_gbq/gbq.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 32c4b244..cac565d9 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -956,7 +956,10 @@ def _wait_for_job(job): bytes_billed = int(query_job._properties["statistics"]["query"].get("totalBytesProcessed", 0)) bytes_processed = int(query_job._properties["statistics"]["query"].get("totalBytesBilled", 0)) print("Total bytes billed (processed): %s (%s)" % (sizeof_fmt(bytes_billed),sizeof_fmt(bytes_processed))) - query_results = query_job.results() + try: + query_results = query_job.results() + except: + query_results = query_job.result() if verbose: print("\nRetrieving results...") From cd17b8d6c84a43df93074dc0183d88bbd7fb650e Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Thu, 5 Oct 2017 15:12:25 -0400 Subject: [PATCH 10/42] Pin requirements to include 0.26.0 --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9d2969bc..731224d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,5 @@ google-api-python-client google-auth google-auth-httplib2 google-auth-oauthlib -google-cloud-bigquery==0.25.0 +google-cloud-bigquery>=0.25.0,<=0.26.0 uuid diff --git a/setup.py b/setup.py index 48c924bc..b6cdf68d 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ def readme(): 'google-auth>=1.0.0', 'google-auth-httplib2>=0.0.1', 'google-auth-oauthlib>=0.0.1', - 'google-cloud-bigquery==0.25.0', + 'google-cloud-bigquery>=0.25.0,<=0.26.0', ] From 48ac79563a319c0d598c3f1fc485245be3791924 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Thu, 5 Oct 2017 16:51:24 -0400 Subject: [PATCH 11/42] Remove legacy 
read_gbq code and add initial user auth flow --- pandas_gbq/gbq.py | 196 +++++----------------------------------------- 1 file changed, 21 insertions(+), 175 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index cac565d9..7dbea89d 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -507,138 +507,6 @@ def process_insert_errors(self, insert_errors): raise StreamingInsertError - def run_query(self, query, **kwargs): - try: - from googleapiclient.errors import HttpError - except: - from apiclient.errors import HttpError - from google.auth.exceptions import RefreshError - - job_collection = self.service.jobs() - - job_config = { - 'query': { - 'query': query, - 'useLegacySql': self.dialect == 'legacy' - # 'allowLargeResults', 'createDisposition', - # 'preserveNulls', destinationTable, useQueryCache - } - } - config = kwargs.get('configuration') - if config is not None: - if len(config) != 1: - raise ValueError("Only one job type must be specified, but " - "given {}".format(','.join(config.keys()))) - if 'query' in config: - if 'query' in config['query'] and query is not None: - raise ValueError("Query statement can't be specified " - "inside config while it is specified " - "as parameter") - - job_config['query'].update(config['query']) - else: - raise ValueError("Only 'query' job type is supported") - - job_data = { - 'configuration': job_config - } - - self._start_timer() - try: - self._print('Requesting query... ', end="") - query_reply = job_collection.insert( - projectId=self.project_id, body=job_data).execute() - self._print('ok.') - except (RefreshError, ValueError): - if self.private_key: - raise AccessDenied( - "The service account credentials are not valid") - else: - raise AccessDenied( - "The credentials have been revoked or expired, " - "please re-run the application to re-authorize") - except HttpError as ex: - self.process_http_error(ex) - - job_reference = query_reply['jobReference'] - job_id = job_reference['jobId'] - self._print('Job ID: %s\nQuery running...' % job_id) - - while not query_reply.get('jobComplete', False): - self.print_elapsed_seconds(' Elapsed', 's. Waiting...') - - timeout_ms = job_config['query'].get('timeoutMs') - if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000: - raise QueryTimeout('Query timeout: {} ms'.format(timeout_ms)) - - try: - query_reply = job_collection.getQueryResults( - projectId=job_reference['projectId'], - jobId=job_id).execute() - except HttpError as ex: - self.process_http_error(ex) - - if self.verbose: - if query_reply['cacheHit']: - self._print('Query done.\nCache hit.\n') - else: - bytes_processed = int(query_reply.get( - 'totalBytesProcessed', '0')) - self._print('Query done.\nProcessed: {}'.format( - self.sizeof_fmt(bytes_processed))) - self._print('Standard price: ${:,.2f} USD\n'.format( - bytes_processed * self.query_price_for_TB)) - - self._print('Retrieving results...') - - total_rows = int(query_reply['totalRows']) - result_pages = list() - seen_page_tokens = list() - current_row = 0 - # Only read schema on first page - schema = query_reply['schema'] - - # Loop through each page of data - while 'rows' in query_reply and current_row < total_rows: - page = query_reply['rows'] - result_pages.append(page) - current_row += len(page) - - self.print_elapsed_seconds( - ' Got page: {}; {}% done. 
Elapsed'.format( - len(result_pages), - round(100.0 * current_row / total_rows))) - - if current_row == total_rows: - break - - page_token = query_reply.get('pageToken', None) - - if not page_token and current_row < total_rows: - raise InvalidPageToken("Required pageToken was missing. " - "Received {0} of {1} rows" - .format(current_row, total_rows)) - - elif page_token in seen_page_tokens: - raise InvalidPageToken("A duplicate pageToken was returned") - - seen_page_tokens.append(page_token) - - try: - query_reply = job_collection.getQueryResults( - projectId=job_reference['projectId'], - jobId=job_id, - pageToken=page_token).execute() - except HttpError as ex: - self.process_http_error(ex) - - if current_row < total_rows: - raise InvalidPageToken() - - # print basic query stats - self._print('Got {} rows.\n'.format(total_rows)) - - return schema, result_pages def load_data(self, dataframe, dataset_id, table_id, chunksize): try: @@ -815,46 +683,9 @@ def _get_credentials_file(): return os.environ.get( 'PANDAS_GBQ_CREDENTIALS_FILE') - -def _parse_data(schema, rows): - # see: - # http://pandas.pydata.org/pandas-docs/dev/missing_data.html - # #missing-data-casting-rules-and-indexing - dtype_map = {'FLOAT': np.dtype(float), - 'TIMESTAMP': 'M8[ns]'} - - fields = schema['fields'] - col_types = [field['type'] for field in fields] - col_names = [str(field['name']) for field in fields] - col_dtypes = [dtype_map.get(field['type'], object) for field in fields] - page_array = np.zeros((len(rows),), dtype=lzip(col_names, col_dtypes)) - for row_num, raw_row in enumerate(rows): - entries = raw_row.get('f', []) - for col_num, field_type in enumerate(col_types): - field_value = _parse_entry(entries[col_num].get('v', ''), - field_type) - page_array[row_num][col_num] = field_value - - return DataFrame(page_array, columns=col_names) - - -def _parse_entry(field_value, field_type): - if field_value is None or field_value == 'null': - return None - if field_type == 'INTEGER': - return int(field_value) - elif field_type == 'FLOAT': - return float(field_value) - elif field_type == 'TIMESTAMP': - timestamp = datetime.utcfromtimestamp(float(field_value)) - return np.datetime64(timestamp) - elif field_type == 'BOOLEAN': - return field_value == 'true' - return field_value - - def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose=True, - private_key=None, dialect='legacy', configuration=None, **kwargs): + private_key=None, auth_local_webserver=False, dialect='legacy', + configuration=None, **kwargs): r"""Load data from Google BigQuery using google-cloud-python The main method a user calls to execute a Query in Google BigQuery @@ -862,16 +693,23 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose=Tru The Google Cloud library is used. Documentation is available `here - `__ + ` Authentication via Google Cloud can be performed a number of ways, see: - The easiest is to download a service account JSON keyfile or generate user credentials via - `gcloud auth application-default login` + + The easiest is to generate user credentials via `gcloud auth application-default login` and point to it using an environment variable: `$ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"` + You can also download a service account private key JSON file and pass the path to the file + to the private_key paramater. + + As a final alternative, you can also set auth_local_webserver to True, which will trigger + a pop-up through which a user can auth with their Google account. 
This will generate a user + credentials file, which is saved locally and can be re-used in the future. + Parameters ---------- query : str @@ -889,6 +727,12 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose=Tru Path to service account private key in JSON format. If none is provided, will default to the GOOGLE_APPLICATION_CREDENTIALS environment variable or another form of authentication (see above) + auth_local_webserver : boolean, default False + Use the [local webserver flow] instead of the [console flow] when + getting user credentials. A file named bigquery_credentials.dat will + be created in ~/.config/pandas_gbq/. You can also set PANDAS_GBQ_CREDENTIALS_FILE + environment variable so as to define a specific path to store this + credential (eg. /etc/keys/bigquery.dat). dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is @@ -900,7 +744,6 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose=Tru only some configuration settings are currently implemented. You can pass them along like in the following: `from_gbq(q,configuration={'allow_large_results':True,'maximum_billing_tier':2})` - Example allowable settings: allow_large_results, create_disposition, default_dataset, destination flatten_results, priority, use_query_cache, use_legacy_sql, dry_run, @@ -914,6 +757,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose=Tru """ # http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size + def sizeof_fmt(num, suffix='B'): fmt = "%3.1f %s%s" for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: @@ -932,6 +776,8 @@ def _wait_for_job(job): time.sleep(1) if private_key: client = bigquery.Client(project=project_id).from_service_account_json(private_key) + elif auth_local_webserver: + GbqConnector(project_id=project_id,auth_local_webserver=True).get_user_account_credentials() else: client = bigquery.Client(project=project_id) query_job = client.run_async_query(str(uuid.uuid4()), query) From c4a2c36f41bf91ed00558a26e979d05e4faf7685 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Thu, 5 Oct 2017 17:07:11 -0400 Subject: [PATCH 12/42] Use GbqConnector for credentials checking --- pandas_gbq/gbq.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 7dbea89d..6acb77f3 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -683,7 +683,7 @@ def _get_credentials_file(): return os.environ.get( 'PANDAS_GBQ_CREDENTIALS_FILE') -def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose=True, +def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, auth_local_webserver=False, dialect='legacy', configuration=None, **kwargs): r"""Load data from Google BigQuery using google-cloud-python @@ -721,6 +721,9 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose=Tru col_order : list(str) (optional) List of BigQuery column names in the desired order for results DataFrame + reauth : boolean (default False) + Force Google BigQuery to reauthenticate the user. This is useful + if multiple accounts are used. 
verbose : boolean (default True) Verbose output private_key : str (optional) @@ -774,12 +777,12 @@ def _wait_for_job(job): raise RuntimeError(job.errors) return time.sleep(1) - if private_key: - client = bigquery.Client(project=project_id).from_service_account_json(private_key) - elif auth_local_webserver: - GbqConnector(project_id=project_id,auth_local_webserver=True).get_user_account_credentials() - else: - client = bigquery.Client(project=project_id) + + credentials = GbqConnector(project_id=project_id, + reauth=reauth, + auth_local_webserver=auth_local_webserver, + private_key=private_key).credentials + client = bigquery.Client(project=project_id, credentials=credentials) query_job = client.run_async_query(str(uuid.uuid4()), query) if dialect != 'legacy': From 04373ebd0aa8dc802b9148b0048e200d0f4867dd Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Thu, 5 Oct 2017 19:39:39 -0400 Subject: [PATCH 13/42] Dealing with tests part 1 --- pandas_gbq/gbq.py | 39 +++++++++++++++++----------- pandas_gbq/tests/test_gbq.py | 50 ++++++++++++++++++++---------------- 2 files changed, 52 insertions(+), 37 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 6acb77f3..2ecc5e93 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -683,9 +683,19 @@ def _get_credentials_file(): return os.environ.get( 'PANDAS_GBQ_CREDENTIALS_FILE') + +def sizeof_fmt(num, suffix='B'): + fmt = "%3.1f %s%s" + for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: + if abs(num) < 1024.0: + return fmt % (num, unit, suffix) + num /= 1024.0 + return fmt % (num, 'Y', suffix) + + def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, - private_key=None, auth_local_webserver=False, dialect='legacy', - configuration=None, **kwargs): + private_key=None, auth_local_webserver=False, dialect='legacy', credentials=None, + get_schema=False, configuration=None, **kwargs): r"""Load data from Google BigQuery using google-cloud-python The main method a user calls to execute a Query in Google BigQuery @@ -741,7 +751,11 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=Fals 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. For more information see `BigQuery SQL Reference - `__ + ` + credentials: credentials object (default None) + If generating credentials on your own, pass in. Otherwise, will attempt to generate automatically + get_schema: boolean, default False + Set to True if you only want to return the schema, otherwise by default will return dataframe configuration : dict (optional) Because of current limitations (https://github.com/GoogleCloudPlatform/google-cloud-python/issues/2765) only some configuration settings are currently implemented. You can pass them @@ -761,14 +775,6 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=Fals # http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size - def sizeof_fmt(num, suffix='B'): - fmt = "%3.1f %s%s" - for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: - if abs(num) < 1024.0: - return fmt % (num, unit, suffix) - num /= 1024.0 - return fmt % (num, 'Y', suffix) - def _wait_for_job(job): while True: job.reload() # Refreshes the state via a GET request. 
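A sketch of the credential-reuse pattern enabled by the new credentials and get_schema parameters, mirroring the updated tests later in this patch (project id and keyfile path are placeholders):

    from pandas_gbq import gbq

    # Build credentials once via GbqConnector, then reuse them across calls.
    connector = gbq.GbqConnector(project_id="my-project",
                                 private_key="/path/to/keyfile.json")
    creds = connector.credentials

    schema = gbq.read_gbq("SELECT 1", project_id="my-project",
                          credentials=creds, get_schema=True)
    df = gbq.read_gbq("SELECT 1", project_id="my-project",
                      credentials=creds)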
@@ -778,10 +784,11 @@ def _wait_for_job(job): return time.sleep(1) - credentials = GbqConnector(project_id=project_id, - reauth=reauth, - auth_local_webserver=auth_local_webserver, - private_key=private_key).credentials + if credentials is None: + credentials = GbqConnector(project_id=project_id, + reauth=reauth, + auth_local_webserver=auth_local_webserver, + private_key=private_key).credentials client = bigquery.Client(project=project_id, credentials=credentials) query_job = client.run_async_query(str(uuid.uuid4()), query) @@ -820,6 +827,8 @@ def _wait_for_job(job): print("\nTotal time taken %s s" % (datetime.utcnow()-query_job.created.replace(tzinfo=None)).seconds) print("Finished at %s." % datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + if get_schema: + return query_results.schema columns = [field.name for field in query_results.schema] data = rows diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 62b72dbc..e718bfda 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -198,12 +198,14 @@ def test_should_be_able_to_get_a_bigquery_service(self): assert bigquery_service is not None def test_should_be_able_to_get_schema_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') + credentials = self.sut.credentials + schema = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials,get_schema=True) assert schema is not None def test_should_be_able_to_get_results_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') - assert pages is not None + credentials = self.sut.credentials + results = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials) + assert results is not None def test_get_application_default_credentials_does_not_throw_error(self): if _check_if_can_get_correct_default_credentials(): @@ -261,12 +263,14 @@ def test_should_be_able_to_get_a_bigquery_service(self): assert bigquery_service is not None def test_should_be_able_to_get_schema_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') + credentials = self.sut.credentials + schema = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials,get_schema=True) assert schema is not None def test_should_be_able_to_get_results_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') - assert pages is not None + credentials = self.sut.credentials + results = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials) + assert results is not None class TestGBQConnectorIntegrationWithServiceAccountKeyContents(object): @@ -292,12 +296,14 @@ def test_should_be_able_to_get_a_bigquery_service(self): assert bigquery_service is not None def test_should_be_able_to_get_schema_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') + credentials = self.sut.credentials + schema = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials,get_schema=True) assert schema is not None def test_should_be_able_to_get_results_from_query(self): - schema, pages = self.sut.run_query('SELECT 1') - assert pages is not None + credentials = self.sut.credentials + results = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials) + assert results is not None class GBQUnitTests(object): @@ -906,19 +912,19 @@ def test_timeout_configuration(self): configuration=config) def test_query_response_bytes(self): - assert self.gbq_connector.sizeof_fmt(999) == "999.0 B" - assert self.gbq_connector.sizeof_fmt(1024) == "1.0 KB" - assert 
self.gbq_connector.sizeof_fmt(1099) == "1.1 KB" - assert self.gbq_connector.sizeof_fmt(1044480) == "1020.0 KB" - assert self.gbq_connector.sizeof_fmt(1048576) == "1.0 MB" - assert self.gbq_connector.sizeof_fmt(1048576000) == "1000.0 MB" - assert self.gbq_connector.sizeof_fmt(1073741824) == "1.0 GB" - assert self.gbq_connector.sizeof_fmt(1.099512E12) == "1.0 TB" - assert self.gbq_connector.sizeof_fmt(1.125900E15) == "1.0 PB" - assert self.gbq_connector.sizeof_fmt(1.152922E18) == "1.0 EB" - assert self.gbq_connector.sizeof_fmt(1.180592E21) == "1.0 ZB" - assert self.gbq_connector.sizeof_fmt(1.208926E24) == "1.0 YB" - assert self.gbq_connector.sizeof_fmt(1.208926E28) == "10000.0 YB" + assert gbq.sizeof_fmt(999) == "999.0 B" + assert gbq.sizeof_fmt(1024) == "1.0 KB" + assert gbq.sizeof_fmt(1099) == "1.1 KB" + assert gbq.sizeof_fmt(1044480) == "1020.0 KB" + assert gbq.sizeof_fmt(1048576) == "1.0 MB" + assert gbq.sizeof_fmt(1048576000) == "1000.0 MB" + assert gbq.sizeof_fmt(1073741824) == "1.0 GB" + assert gbq.sizeof_fmt(1.099512E12) == "1.0 TB" + assert gbq.sizeof_fmt(1.125900E15) == "1.0 PB" + assert gbq.sizeof_fmt(1.152922E18) == "1.0 EB" + assert gbq.sizeof_fmt(1.180592E21) == "1.0 ZB" + assert gbq.sizeof_fmt(1.208926E24) == "1.0 YB" + assert gbq.sizeof_fmt(1.208926E28) == "10000.0 YB" class TestToGBQIntegrationWithServiceAccountKeyPath(object): From dcf014ab1c85804fcdb9c7b94d3f3c2837f428aa Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Thu, 5 Oct 2017 21:41:03 -0400 Subject: [PATCH 14/42] Fix formatting to make linter happy plus remove obj conversion in test Remove obj conversion in test Tweak formatting Update ci requirements Fix more formatting to make linter happy --- ci/requirements-2.7-0.19.2.pip | 1 + ci/requirements-3.5-0.18.1.pip | 1 + ci/requirements-3.6-0.20.1.conda | 1 + ci/requirements-3.6-MASTER.pip | 1 + pandas_gbq/gbq.py | 107 ++++++++++++++++++------------- pandas_gbq/tests/test_gbq.py | 58 +++++++---------- 6 files changed, 88 insertions(+), 81 deletions(-) diff --git a/ci/requirements-2.7-0.19.2.pip b/ci/requirements-2.7-0.19.2.pip index 852dc153..22b763cf 100644 --- a/ci/requirements-2.7-0.19.2.pip +++ b/ci/requirements-2.7-0.19.2.pip @@ -5,3 +5,4 @@ google-auth-oauthlib PyCrypto python-gflags mock +google-cloud-bigquery>=0.25.0,<=0.26.0 diff --git a/ci/requirements-3.5-0.18.1.pip b/ci/requirements-3.5-0.18.1.pip index 6fb8a03d..19ca6227 100644 --- a/ci/requirements-3.5-0.18.1.pip +++ b/ci/requirements-3.5-0.18.1.pip @@ -3,3 +3,4 @@ google-auth==1.0.0 google-auth-httplib2==0.0.1 google-auth-oauthlib==0.0.1 mock +google-cloud-bigquery>=0.25.0,<=0.26.0 diff --git a/ci/requirements-3.6-0.20.1.conda b/ci/requirements-3.6-0.20.1.conda index a1608720..7ca942e4 100644 --- a/ci/requirements-3.6-0.20.1.conda +++ b/ci/requirements-3.6-0.20.1.conda @@ -3,3 +3,4 @@ google-auth google-auth-httplib2 google-auth-oauthlib mock +google-cloud-bigquery>=0.25.0,<=0.26.0 diff --git a/ci/requirements-3.6-MASTER.pip b/ci/requirements-3.6-MASTER.pip index a1608720..7ca942e4 100644 --- a/ci/requirements-3.6-MASTER.pip +++ b/ci/requirements-3.6-MASTER.pip @@ -3,3 +3,4 @@ google-auth google-auth-httplib2 google-auth-oauthlib mock +google-cloud-bigquery>=0.25.0,<=0.26.0 diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 2ecc5e93..71a9c2f0 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -6,13 +6,10 @@ import time import sys import os -import uuid - -import numpy as np from distutils.version import StrictVersion -from pandas import compat, DataFrame, concat -from pandas.compat import 
lzip, bytes_to_str +from pandas import compat, DataFrame +from pandas.compat import bytes_to_str from google.cloud import bigquery @@ -507,7 +504,6 @@ def process_insert_errors(self, insert_errors): raise StreamingInsertError - def load_data(self, dataframe, dataset_id, table_id, chunksize): try: from googleapiclient.errors import HttpError @@ -679,6 +675,7 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema): table.create(table_id, table_schema) sleep(delay) + def _get_credentials_file(): return os.environ.get( 'PANDAS_GBQ_CREDENTIALS_FILE') @@ -691,11 +688,12 @@ def sizeof_fmt(num, suffix='B'): return fmt % (num, unit, suffix) num /= 1024.0 return fmt % (num, 'Y', suffix) - -def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, - private_key=None, auth_local_webserver=False, dialect='legacy', credentials=None, - get_schema=False, configuration=None, **kwargs): + +def read_gbq(query, project_id=None, index_col=None, col_order=None, + reauth=False, verbose=True, private_key=None, auth_local_webserver=False, + dialect='legacy', credentials=None, get_schema=False, query_parameters=(), + configuration=None, **kwargs): r"""Load data from Google BigQuery using google-cloud-python The main method a user calls to execute a Query in Google BigQuery @@ -706,19 +704,22 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=Fals ` Authentication via Google Cloud can be performed a number of ways, see: - + - The easiest is to generate user credentials via `gcloud auth application-default login` - - and point to it using an environment variable: + The easiest is to generate user credentials via + `gcloud auth application-default login` and point to it using an + environment variable: `$ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"` - You can also download a service account private key JSON file and pass the path to the file - to the private_key paramater. + You can also download a service account private key JSON file and pass the + path to the file to the private_key paramater. - As a final alternative, you can also set auth_local_webserver to True, which will trigger - a pop-up through which a user can auth with their Google account. This will generate a user - credentials file, which is saved locally and can be re-used in the future. + As a final alternative, you can also set auth_local_webserver to True, + which will trigger a pop-up through which a user can auth with their Google + account. This will generate a user credentials file, which is saved locally + and can be re-used in the future. Parameters ---------- @@ -737,15 +738,15 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=Fals verbose : boolean (default True) Verbose output private_key : str (optional) - Path to service account private key in JSON format. If none is provided, - will default to the GOOGLE_APPLICATION_CREDENTIALS environment variable - or another form of authentication (see above) + Path to service account private key in JSON format. If none is + provided, will default to the GOOGLE_APPLICATION_CREDENTIALS + environment variable or another form of authentication (see above) auth_local_webserver : boolean, default False Use the [local webserver flow] instead of the [console flow] when getting user credentials. A file named bigquery_credentials.dat will - be created in ~/.config/pandas_gbq/. 
You can also set PANDAS_GBQ_CREDENTIALS_FILE - environment variable so as to define a specific path to store this - credential (eg. /etc/keys/bigquery.dat). + be created in ~/.config/pandas_gbq/. You can also set + PANDAS_GBQ_CREDENTIALS_FILE environment variable so as to define a + specific path to store this credential (eg. /etc/keys/bigquery.dat). dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is @@ -753,18 +754,26 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=Fals see `BigQuery SQL Reference ` credentials: credentials object (default None) - If generating credentials on your own, pass in. Otherwise, will attempt to generate automatically + If generating credentials on your own, pass in. Otherwise, will attempt + to generate automatically get_schema: boolean, default False - Set to True if you only want to return the schema, otherwise by default will return dataframe + Set to True if you only want to return the schema, otherwise by default + will return dataframe + query_parameters: dict (optional) configuration : dict (optional) - Because of current limitations (https://github.com/GoogleCloudPlatform/google-cloud-python/issues/2765) - only some configuration settings are currently implemented. You can pass them - along like in the following: - `from_gbq(q,configuration={'allow_large_results':True,'maximum_billing_tier':2})` - Example allowable settings: - allow_large_results, create_disposition, default_dataset, destination - flatten_results, priority, use_query_cache, use_legacy_sql, dry_run, - write_disposition, udf_resources, maximum_billing_tier, maximum_bytes_billed + Because of current limitations only some configuration settings are + currently implemented. You can pass them along like in the following: + `read_gbq(q,configuration={'allow_large_results':True, + 'maximum_billing_tier':2})` + Example allowable settings: + allow_large_results, create_disposition, default_dataset, + destination, flatten_results, priority, use_query_cache, + use_legacy_sql, dry_run, write_disposition, udf_resources, + maximum_billing_tier, maximum_bytes_billed + Returns ------- @@ -773,8 +782,6 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=Fals """ - # http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size - def _wait_for_job(job): while True: job.reload() # Refreshes the state via a GET request. 
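Because each configuration key is applied onto the QueryJob with setattr(), the keys must match job attribute names exactly. A minimal sketch using two of the settings listed above (the project id is a placeholder):

    from pandas_gbq import gbq

    df = gbq.read_gbq("SELECT 1 AS x", project_id="my-project",
                      dialect="standard",
                      configuration={"use_query_cache": False,
                                     "maximum_billing_tier": 2})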
@@ -788,9 +795,12 @@ def _wait_for_job(job): credentials = GbqConnector(project_id=project_id, reauth=reauth, auth_local_webserver=auth_local_webserver, - private_key=private_key).credentials + private_key=private_key).credentials + client = bigquery.Client(project=project_id, credentials=credentials) - query_job = client.run_async_query(str(uuid.uuid4()), query) + query_job = client.run_async_query(str(uuid.uuid4()), + query, + query_parameters=query_parameters) if dialect != 'legacy': query_job.use_legacy_sql = False @@ -808,10 +818,14 @@ def _wait_for_job(job): print("Query done.") if query_job._properties["statistics"]["query"].get("cacheHit", False): print("Cache hit.") - elif "statistics" in query_job._properties and "query" in query_job._properties["statistics"]: - bytes_billed = int(query_job._properties["statistics"]["query"].get("totalBytesProcessed", 0)) - bytes_processed = int(query_job._properties["statistics"]["query"].get("totalBytesBilled", 0)) - print("Total bytes billed (processed): %s (%s)" % (sizeof_fmt(bytes_billed),sizeof_fmt(bytes_processed))) + elif ("statistics" in query_job._properties and + "query" in query_job._properties["statistics"]): + bytes_billed = int(query_job._properties["statistics"]["query"] + .get("totalBytesProcessed", 0)) + bytes_processed = int(query_job._properties["statistics"]["query"] + .get("totalBytesBilled", 0)) + print("Total bytes billed (processed): %s (%s)" % + (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) try: query_results = query_job.results() except: @@ -824,16 +838,17 @@ def _wait_for_job(job): if verbose: print("Got %s rows.") % total_rows - print("\nTotal time taken %s s" % (datetime.utcnow()-query_job.created.replace(tzinfo=None)).seconds) + print("\nTotal time taken %ss" % (datetime.utcnow() - + query_job.created.replace(tzinfo=None)).seconds) print("Finished at %s." 
% datetime.now().strftime('%Y-%m-%d %H:%M:%S')) - + if get_schema: return query_results.schema columns = [field.name for field in query_results.schema] data = rows - final_df = DataFrame(data=data,columns=columns) + final_df = DataFrame(data=data, columns=columns) # Change the order of columns in the DataFrame based on provided list if col_order: diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index e718bfda..859b72b6 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -199,12 +199,17 @@ def test_should_be_able_to_get_a_bigquery_service(self): def test_should_be_able_to_get_schema_from_query(self): credentials = self.sut.credentials - schema = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials,get_schema=True) + schema = gbq.read_gbq('SELECT 1', + project_id=_get_project_id(), + credentials=credentials, + get_schema=True) assert schema is not None def test_should_be_able_to_get_results_from_query(self): credentials = self.sut.credentials - results = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials) + results = gbq.read_gbq('SELECT 1', + project_id=_get_project_id(), + credentials=credentials) assert results is not None def test_get_application_default_credentials_does_not_throw_error(self): @@ -264,12 +269,15 @@ def test_should_be_able_to_get_a_bigquery_service(self): def test_should_be_able_to_get_schema_from_query(self): credentials = self.sut.credentials - schema = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials,get_schema=True) + schema = gbq.read_gbq('SELECT 1', project_id=_get_project_id(), + credentials=credentials, + get_schema=True) assert schema is not None def test_should_be_able_to_get_results_from_query(self): credentials = self.sut.credentials - results = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials) + results = gbq.read_gbq('SELECT 1', project_id=_get_project_id(), + credentials=credentials) assert results is not None @@ -297,12 +305,17 @@ def test_should_be_able_to_get_a_bigquery_service(self): def test_should_be_able_to_get_schema_from_query(self): credentials = self.sut.credentials - schema = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials,get_schema=True) + schema = gbq.read_gbq('SELECT 1', + project_id=_get_project_id(), + credentials=credentials, + get_schema=True) assert schema is not None def test_should_be_able_to_get_results_from_query(self): credentials = self.sut.credentials - results = gbq.read_gbq('SELECT 1',project_id=_get_project_id(),credentials=credentials) + results = gbq.read_gbq('SELECT 1', + project_id=_get_project_id(), + credentials=credentials) assert results is not None @@ -522,7 +535,7 @@ def test_should_properly_handle_nullable_integers(self): df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal( - df, DataFrame({'nullable_integer': [1, None]}).astype(object)) + df, DataFrame({'nullable_integer': [1, None]})) def test_should_properly_handle_valid_longs(self): query = 'SELECT 1 << 62 AS valid_long' @@ -538,7 +551,7 @@ def test_should_properly_handle_nullable_longs(self): df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal( - df, DataFrame({'nullable_long': [1 << 62, None]}).astype(object)) + df, DataFrame({'nullable_long': [1 << 62, None]})) def test_should_properly_handle_null_integers(self): query = 'SELECT INTEGER(NULL) AS 
null_integer' @@ -634,7 +647,7 @@ def test_should_properly_handle_nullable_booleans(self): df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal( - df, DataFrame({'nullable_boolean': [True, None]}).astype(object)) + df, DataFrame({'nullable_boolean': [True, None]})) def test_unicode_string_conversion_and_normalization(self): correct_test_datatype = DataFrame( @@ -797,32 +810,7 @@ def test_invalid_option_for_sql_dialect(self): def test_query_with_parameters(self): sql_statement = "SELECT @param1 + @param2 AS valid_result" - config = { - 'query': { - "useLegacySql": False, - "parameterMode": "named", - "queryParameters": [ - { - "name": "param1", - "parameterType": { - "type": "INTEGER" - }, - "parameterValue": { - "value": 1 - } - }, - { - "name": "param2", - "parameterType": { - "type": "INTEGER" - }, - "parameterValue": { - "value": 2 - } - } - ] - } - } + config = {"use_legacy_sql": False} # Test that a query that relies on parameters fails # when parameters are not supplied via configuration with pytest.raises(ValueError): From 72253aabf55dcc64f307f1a32b979278e90fb421 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 6 Oct 2017 00:55:37 -0400 Subject: [PATCH 15/42] Rewrite query_parameters test with documentation --- pandas_gbq/gbq.py | 26 ++++++++++++++++++-------- pandas_gbq/tests/test_gbq.py | 14 ++++++++++---- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 71a9c2f0..a9f7d034 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -691,9 +691,10 @@ def sizeof_fmt(num, suffix='B'): def read_gbq(query, project_id=None, index_col=None, col_order=None, - reauth=False, verbose=True, private_key=None, auth_local_webserver=False, - dialect='legacy', credentials=None, get_schema=False, query_parameters=(), - configuration=None, **kwargs): + reauth=False, verbose=True, private_key=None, + auth_local_webserver=False, dialect='legacy', credentials=None, + get_schema=False, query_parameters=(), configuration=None, + **kwargs): r"""Load data from Google BigQuery using google-cloud-python The main method a user calls to execute a Query in Google BigQuery @@ -741,7 +742,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, Path to service account private key in JSON format. If none is provided, will default to the GOOGLE_APPLICATION_CREDENTIALS environment variable or another form of authentication (see above) - auth_local_webserver : boolean, default False + auth_local_webserver : boolean, default False (optional) Use the [local webserver flow] instead of the [console flow] when getting user credentials. A file named bigquery_credentials.dat will be created in ~/.config/pandas_gbq/. You can also set @@ -756,11 +757,16 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, credentials: credentials object (default None) If generating credentials on your own, pass in. 
Otherwise, will attempt to generate automatically - get_schema: boolean, default False + get_schema: boolean, default False (optional) Set to True if you only want to return the schema, otherwise by default will return dataframe - query_parameters: dict (optional) + query_parameters: tuple (optional) Can only be used in Standard SQL + example: gbq.read_gbq("SELECT @param1 + @param2", + query_parameters = (bigquery.ScalarQueryParameter( + 'param1', 'INT64', 1), + bigquery.ScalarQueryParameter( + 'param2', 'INT64', 2))) + configuration : dict (optional) Because of current limitations only some configuration settings are @@ -802,8 +808,12 @@ def _wait_for_job(job): query, query_parameters=query_parameters) - if dialect != 'legacy': + if dialect == 'legacy': + query_job.use_legacy_sql = True + elif dialect == 'standard': query_job.use_legacy_sql = False + else: + raise ValueError("'{0}' is not valid for dialect".format(dialect)) if configuration: for setting, value in configuration.items(): diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 859b72b6..00ff4769 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -17,6 +17,7 @@ from pandas_gbq import gbq import pandas.util.testing as tm from pandas.compat.numpy import np_datetime64_compat +from google.cloud import bigquery TABLE_ID = 'new_test' @@ -813,15 +814,20 @@ def test_query_with_parameters(self): config = {"use_legacy_sql": False} # Test that a query that relies on parameters fails # when parameters are not supplied via configuration - with pytest.raises(ValueError): + with pytest.raises(RuntimeError): gbq.read_gbq(sql_statement, project_id=_get_project_id(), private_key=_get_private_key_path()) # Test that the query is successful because we have supplied - # the correct query parameters via the 'config' option + # the correct query parameters via the 'config' and query_parameters + # option df = gbq.read_gbq(sql_statement, project_id=_get_project_id(), - private_key=_get_private_key_path(), - configuration=config) + configuration = config, + query_parameters = (bigquery.ScalarQueryParameter( + 'param1', 'INT64', 1), + bigquery.ScalarQueryParameter( + 'param2', 'INT64', 2)), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'valid_result': [3]})) def test_query_inside_configuration(self): From 751f39b142e3a0b366dab00f2a65baf09e8a751d Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 6 Oct 2017 07:03:25 -0400 Subject: [PATCH 16/42] Rewrite sync and async job flow, handle timeouts, add BadRequest error to tests --- pandas_gbq/gbq.py | 131 +++++++++++++++++++++++------------ pandas_gbq/tests/test_gbq.py | 12 ++-- 2 files changed, 92 insertions(+), 51 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index a9f7d034..6f8c2597 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -694,7 +694,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, auth_local_webserver=False, dialect='legacy', credentials=None, get_schema=False, query_parameters=(), configuration=None, - **kwargs): + timeout_ms=None, **kwargs): r"""Load data from Google BigQuery using google-cloud-python The main method a user calls to execute a Query in Google BigQuery @@ -780,6 +780,11 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, maximum_billing_tier, maximum_bytes_billed + timeout_ms: int (optional) If set or found in config, triggers a sync query + that times out with no results if it 
can't be completed in the time + desired + Returns ------- @@ -788,6 +793,9 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, """ + if dialect not in ('legacy', 'standard'): + raise ValueError("'{0}' is not valid for dialect".format(dialect)) + def _wait_for_job(job): while True: job.reload() # Refreshes the state via a GET request. @@ -802,55 +810,88 @@ def _wait_for_job(job): reauth=reauth, auth_local_webserver=auth_local_webserver, private_key=private_key).credentials - client = bigquery.Client(project=project_id, credentials=credentials) - query_job = client.run_async_query(str(uuid.uuid4()), - query, - query_parameters=query_parameters) - - if dialect == 'legacy': - query_job.use_legacy_sql = True - elif dialect == 'standard': - query_job.use_legacy_sql = False - else: - raise ValueError("'{0}' is not valid for dialect".format(dialect)) - - if configuration: - for setting, value in configuration.items(): - setattr(query_job, setting, value) - - query_job.begin() - - if verbose: - print("Query running...") - _wait_for_job(query_job) - if verbose: - print("Query done.") - if query_job._properties["statistics"]["query"].get("cacheHit", False): - print("Cache hit.") - elif ("statistics" in query_job._properties and - "query" in query_job._properties["statistics"]): - bytes_billed = int(query_job._properties["statistics"]["query"] - .get("totalBytesProcessed", 0)) - bytes_processed = int(query_job._properties["statistics"]["query"] - .get("totalBytesBilled", 0)) - print("Total bytes billed (processed): %s (%s)" % - (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) - try: - query_results = query_job.results() - except: - query_results = query_job.result() - if verbose: - print("\nRetrieving results...") - rows = list(query_results.fetch_data()) - total_rows = len(rows) + def _set_common_query_settings(query_job): + if dialect == 'legacy': + query_job.use_legacy_sql = True + elif dialect == 'standard': + query_job.use_legacy_sql = False + + if configuration: + for setting, value in configuration.items(): + setattr(query_job, setting, value) + return query_job + + def sync_query(): + query_job = client.run_sync_query(query, + query_parameters=query_parameters) + query_job = _set_common_query_settings(query_job) + if verbose: + print("Query running...") + if timeout_ms: + query_job.timeout_ms = timeout_ms + query_job.run() + if not query_job._properties.get("jobComplete", False): + raise QueryTimeout("Sync query timed out") + if verbose: + print("Query done.") + if query_job._properties.get("cacheHit", False): + print("Cache hit.") + else: + bytes_billed = int(query_job._properties + .get("totalBytesProcessed", 0)) + bytes_processed = int(query_job._properties + .get("totalBytesBilled", 0)) + print("Total bytes billed (processed): %s (%s)" % + (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) + print("\nRetrieving results...") + return query_job, None + + def async_query(): + query_job = client.run_async_query(str(uuid.uuid4()), + query, + query_parameters=query_parameters) + query_job = _set_common_query_settings(query_job) + query_job.begin() + try: + query_results = query_job.results().fetch_data() + except: + query_results = query_job.result().fetch_data() + if verbose: + print("Query running...") + _wait_for_job(query_job) + if verbose: + print("Query done.") + if query_job._properties["statistics"]["query"].get("cacheHit", False): + print("Cache hit.") + elif ("statistics" in query_job._properties and + "query" in query_job._properties["statistics"]): 
+ bytes_billed = int(query_job._properties["statistics"]["query"] + .get("totalBytesProcessed", 0)) + bytes_processed = int(query_job._properties["statistics"]["query"] + .get("totalBytesBilled", 0)) + print("Total bytes billed (processed): %s (%s)" % + (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) + print("\nRetrieving results...") + return query_results, query_job + + if (configuration and "timeout_ms" in configuration) or timeout_ms: + query_results, query_job = sync_query() + rows = list(query_results.rows) + total_rows = len(rows) + else: + query_results, query_job = async_query() + rows = list(query_results) + total_rows = len(rows) if verbose: print("Got %s rows.") % total_rows - print("\nTotal time taken %ss" % (datetime.utcnow() - - query_job.created.replace(tzinfo=None)).seconds) - print("Finished at %s." % datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + if query_job: + print("\nTotal time taken %ss" % (datetime.utcnow() - + query_job.created.replace(tzinfo=None)).seconds) + print("Finished at %s." % datetime.now() + .strftime('%Y-%m-%d %H:%M:%S')) if get_schema: return query_results.schema diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 00ff4769..778974ea 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -19,6 +19,8 @@ from pandas.compat.numpy import np_datetime64_compat from google.cloud import bigquery +from google.cloud.exceptions import BadRequest + TABLE_ID = 'new_test' @@ -765,7 +767,7 @@ def test_legacy_sql(self): # Test that a legacy sql statement fails when # setting dialect='standard' - with pytest.raises(gbq.GenericGBQException): + with pytest.raises((RuntimeError,BadRequest)): gbq.read_gbq(legacy_sql, project_id=_get_project_id(), dialect='standard', private_key=_get_private_key_path()) @@ -783,7 +785,7 @@ def test_standard_sql(self): # Test that a standard sql statement fails when using # the legacy SQL dialect (default value) - with pytest.raises(gbq.GenericGBQException): + with pytest.raises((RuntimeError,BadRequest)): gbq.read_gbq(standard_sql, project_id=_get_project_id(), private_key=_get_private_key_path()) @@ -814,7 +816,7 @@ def test_query_with_parameters(self): config = {"use_legacy_sql": False} # Test that a query that relies on parameters fails # when parameters are not supplied via configuration - with pytest.raises(RuntimeError): + with pytest.raises((RuntimeError,BadRequest)): gbq.read_gbq(sql_statement, project_id=_get_project_id(), private_key=_get_private_key_path()) @@ -895,9 +897,7 @@ def test_configuration_raises_value_error_with_multiple_config(self): def test_timeout_configuration(self): sql_statement = 'SELECT 1' config = { - 'query': { - "timeoutMs": 1 - } + "timeout_ms": 1 } # Test that QueryTimeout error raises with pytest.raises(gbq.QueryTimeout): From d83fd2293dca1a9c011474ef2c7859da3a9a2eb0 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 6 Oct 2017 07:24:46 -0400 Subject: [PATCH 17/42] Remove two configuration tests which are no longer relevant --- pandas_gbq/gbq.py | 5 ++++- pandas_gbq/tests/test_gbq.py | 39 ------------------------------------ 2 files changed, 4 insertions(+), 40 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 6f8c2597..dd5b7cf9 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -794,7 +794,10 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, """ if dialect not in ('legacy', 'standard'): - raise ValueError("'{0}' is not valid for dialect".format(dialect)) + raise ValueError("'{0}' is not valid 
for dialect".format(dialect)) + if configuration and any(key in configuration for key in + ["query", "copy", "load", "extract"]): + raise ValueError("New API handles configuration settings differently") def _wait_for_job(job): while True: diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 778974ea..7f6d3291 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -832,27 +832,6 @@ def test_query_with_parameters(self): private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'valid_result': [3]})) - def test_query_inside_configuration(self): - query_no_use = 'SELECT "PI_WRONG" AS valid_string' - query = 'SELECT "PI" AS valid_string' - config = { - 'query': { - "query": query, - "useQueryCache": False, - } - } - # Test that it can't pass query both - # inside config and as parameter - with pytest.raises(ValueError): - gbq.read_gbq(query_no_use, project_id=_get_project_id(), - private_key=_get_private_key_path(), - configuration=config) - - df = gbq.read_gbq(None, project_id=_get_project_id(), - private_key=_get_private_key_path(), - configuration=config) - tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) - def test_configuration_without_query(self): sql_statement = 'SELECT 1' config = { @@ -876,24 +855,6 @@ def test_configuration_without_query(self): private_key=_get_private_key_path(), configuration=config) - def test_configuration_raises_value_error_with_multiple_config(self): - sql_statement = 'SELECT 1' - config = { - 'query': { - "query": sql_statement, - "useQueryCache": False, - }, - 'load': { - "query": sql_statement, - "useQueryCache": False, - } - } - # Test that only ValueError is raised with multiple configurations - with pytest.raises(ValueError): - gbq.read_gbq(sql_statement, project_id=_get_project_id(), - private_key=_get_private_key_path(), - configuration=config) - def test_timeout_configuration(self): sql_statement = 'SELECT 1' config = { From e597a762be88f7cca89aa0d9c334125b2e94871f Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 6 Oct 2017 11:24:37 -0400 Subject: [PATCH 18/42] Rewrite tests part 2 and can specify return type --- pandas_gbq/gbq.py | 55 ++++++++++++++++++++++++------------ pandas_gbq/tests/test_gbq.py | 44 ++++++++++++++++------------- 2 files changed, 61 insertions(+), 38 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index dd5b7cf9..2ad48b45 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -8,7 +8,7 @@ import os from distutils.version import StrictVersion -from pandas import compat, DataFrame +from pandas import compat, DataFrame, to_datetime, to_numeric from pandas.compat import bytes_to_str from google.cloud import bigquery @@ -693,7 +693,7 @@ def sizeof_fmt(num, suffix='B'): def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, auth_local_webserver=False, dialect='legacy', credentials=None, - get_schema=False, query_parameters=(), configuration=None, + return_type='df', query_parameters=(), configuration=None, timeout_ms=None, **kwargs): r"""Load data from Google BigQuery using google-cloud-python @@ -757,9 +757,13 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, credentials: credentials object (default None) If generating credentials on your own, pass in. 
Otherwise, will attempt to generate automatically - get_schema: boolean, default False (optional) - Set to True if you only want to return the schema, otherwise by default - will return dataframe + return_type: {'schema','list','df'}, default 'df' + schema returns an array of SchemaField objects, which you can access + `from pprint import pprint + [pprint(vars(field)) for field in schema]` + list returns a list of lists of the rows of the results; column names + are not included + df returns a dataframe by default query_parameters: tuple (optional) Can only be used in Standard SQL example: gbq.read_gbq("SELECT @param1 + @param2", query_parameters = (bigquery.ScalarQueryParameter( @@ -866,13 +870,16 @@ def async_query(): _wait_for_job(query_job) if verbose: print("Query done.") - if query_job._properties["statistics"]["query"].get("cacheHit", False): + if query_job._properties["statistics"]["query"].get("cacheHit", + False): print("Cache hit.") elif ("statistics" in query_job._properties and "query" in query_job._properties["statistics"]): - bytes_billed = int(query_job._properties["statistics"]["query"] + bytes_billed = int(query_job + ._properties["statistics"]["query"] .get("totalBytesProcessed", 0)) - bytes_processed = int(query_job._properties["statistics"]["query"] + bytes_processed = int(query_job + ._properties["statistics"]["query"] .get("totalBytesBilled", 0)) print("Total bytes billed (processed): %s (%s)" % (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) @@ -896,22 +903,25 @@ def async_query(): print("Finished at %s." % datetime.now() .strftime('%Y-%m-%d %H:%M:%S')) - if get_schema: + if return_type=='schema': return query_results.schema + elif return_type=='list': + return rows columns = [field.name for field in query_results.schema] data = rows final_df = DataFrame(data=data, columns=columns) - # Change the order of columns in the DataFrame based on provided list - if col_order: - if sorted(col_order) == sorted(final_df.columns): - final_df = final_df[col_order] - else: - raise InvalidColumnOrder( - 'Column order does not match this DataFrame.' - ) + # Manual field type conversion. Inserted to handle tests + # with only null rows, otherwise type conversion works automatically + for field in query_results.schema: + if field.field_type=='TIMESTAMP': + if final_df[field.name].isnull().values.all(): + final_df[field.name] = to_datetime(final_df[field.name]) + if field.field_type=='FLOAT': + if final_df[field.name].isnull().values.all(): + final_df[field.name] = to_numeric(final_df[field.name]) # Reindex the DataFrame on the provided column if index_col: @@ -921,7 +931,16 @@ def async_query(): raise InvalidIndexColumn( 'Index column "{0}" does not exist in DataFrame.' .format(index_col) - ) + ) + + # Change the order of columns in the DataFrame based on provided list + if col_order: + if sorted(col_order) == sorted(final_df.columns): + final_df = final_df[col_order] + else: + raise InvalidColumnOrder( + 'Column order does not match this DataFrame.' 
+ ) return final_df diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 7f6d3291..1b11e3be 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -13,13 +13,13 @@ from pandas import compat from pandas.compat import u, range -from pandas import NaT, DataFrame +from pandas import NaT, DataFrame, to_datetime from pandas_gbq import gbq import pandas.util.testing as tm from pandas.compat.numpy import np_datetime64_compat from google.cloud import bigquery -from google.cloud.exceptions import BadRequest +from google.cloud.exceptions import BadRequest, NotFound TABLE_ID = 'new_test' @@ -205,7 +205,7 @@ def test_should_be_able_to_get_schema_from_query(self): schema = gbq.read_gbq('SELECT 1', project_id=_get_project_id(), credentials=credentials, - get_schema=True) + return_type='schema') assert schema is not None def test_should_be_able_to_get_results_from_query(self): @@ -274,7 +274,7 @@ def test_should_be_able_to_get_schema_from_query(self): credentials = self.sut.credentials schema = gbq.read_gbq('SELECT 1', project_id=_get_project_id(), credentials=credentials, - get_schema=True) + return_type='schema') assert schema is not None def test_should_be_able_to_get_results_from_query(self): @@ -608,16 +608,19 @@ def test_should_properly_handle_timestamp_unix_epoch(self): query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") AS unix_epoch' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame( - {'unix_epoch': [np.datetime64('1970-01-01T00:00:00.000000Z')]})) + expected = DataFrame({'unix_epoch': + [np.datetime64('1970-01-01T00:00:00.000000Z')]}) + tm.assert_frame_equal(df, to_datetime(expected.unix_epoch).dt + .tz_localize('UTC').to_frame()) def test_should_properly_handle_arbitrary_timestamp(self): query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - tm.assert_frame_equal(df, DataFrame({ - 'valid_timestamp': [np.datetime64('2004-09-15T05:00:00.000000Z')] - })) + expected = DataFrame({'valid_timestamp': + [np.datetime64('2004-09-15T05:00:00.000000Z')]}) + tm.assert_frame_equal(df, to_datetime(expected.valid_timestamp).dt + .tz_localize('UTC').to_frame()) def test_should_properly_handle_null_timestamp(self): query = 'SELECT TIMESTAMP(NULL) AS null_timestamp' @@ -711,7 +714,7 @@ def test_column_order_plus_index(self): def test_read_gbq_raises_invalid_index_column(self): query = "SELECT 'a' AS string_1, 'b' AS string_2, 'c' AS string_3" - col_order = ['string_3', 'string_2'] + col_order = ['string_3', 'string_2', 'string_1'] # Column string_bbb does not exist. 
Should raise InvalidIndexColumn with pytest.raises(gbq.InvalidIndexColumn): @@ -720,18 +723,18 @@ def test_read_gbq_raises_invalid_index_column(self): private_key=_get_private_key_path()) def test_malformed_query(self): - with pytest.raises(gbq.GenericGBQException): + with pytest.raises(BadRequest): gbq.read_gbq("SELCET * FORM [publicdata:samples.shakespeare]", project_id=_get_project_id(), private_key=_get_private_key_path()) def test_bad_project_id(self): - with pytest.raises(gbq.GenericGBQException): + with pytest.raises(NotFound): gbq.read_gbq("SELECT 1", project_id='001', private_key=_get_private_key_path()) def test_bad_table_name(self): - with pytest.raises(gbq.GenericGBQException): + with pytest.raises(NotFound): gbq.read_gbq("SELECT * FROM [publicdata:samples.nope]", project_id=_get_project_id(), private_key=_get_private_key_path()) @@ -760,14 +763,15 @@ def test_zero_rows(self): ('is_bot', np.dtype(bool)), ('ts', 'M8[ns]')]) expected_result = DataFrame( page_array, columns=['title', 'id', 'is_bot', 'ts']) - tm.assert_frame_equal(df, expected_result) + tm.assert_frame_equal(expected_result.astype(object), + df.reset_index(drop=True).astype(object)) def test_legacy_sql(self): legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10" # Test that a legacy sql statement fails when # setting dialect='standard' - with pytest.raises((RuntimeError,BadRequest)): + with pytest.raises((RuntimeError, BadRequest)): gbq.read_gbq(legacy_sql, project_id=_get_project_id(), dialect='standard', private_key=_get_private_key_path()) @@ -785,7 +789,7 @@ def test_standard_sql(self): # Test that a standard sql statement fails when using # the legacy SQL dialect (default value) - with pytest.raises((RuntimeError,BadRequest)): + with pytest.raises((RuntimeError, BadRequest)): gbq.read_gbq(standard_sql, project_id=_get_project_id(), private_key=_get_private_key_path()) @@ -816,7 +820,7 @@ def test_query_with_parameters(self): config = {"use_legacy_sql": False} # Test that a query that relies on parameters fails # when parameters are not supplied via configuration - with pytest.raises((RuntimeError,BadRequest)): + with pytest.raises((RuntimeError, BadRequest)): gbq.read_gbq(sql_statement, project_id=_get_project_id(), private_key=_get_private_key_path()) @@ -824,10 +828,10 @@ def test_query_with_parameters(self): # the correct query parameters via the 'config' and query_parameters # option df = gbq.read_gbq(sql_statement, project_id=_get_project_id(), - configuration = config, - query_parameters = (bigquery.ScalarQueryParameter( + configuration=config, + query_parameters=(bigquery.ScalarQueryParameter( 'param1', 'INT64', 1), - bigquery.ScalarQueryParameter( + bigquery.ScalarQueryParameter( 'param2', 'INT64', 2)), private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'valid_result': [3]})) From 347bca21bf1544a92adff648b102180922870a4c Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 6 Oct 2017 11:40:58 -0400 Subject: [PATCH 19/42] Pin bigquery to 0.26.0 and more linting Fix pinning in conda ci requirements Give up trying to pin in conda Clean up the read_gbq doc a little --- ci/requirements-2.7-0.19.2.pip | 2 +- ci/requirements-3.5-0.18.1.pip | 2 +- ci/requirements-3.6-0.20.1.conda | 2 +- ci/requirements-3.6-MASTER.pip | 2 +- pandas_gbq/gbq.py | 29 ++++++++++++++--------------- pandas_gbq/tests/test_gbq.py | 20 +++++++++----------- requirements.txt | 2 +- setup.py | 2 +- 8 files changed, 29 insertions(+), 32 deletions(-) diff --git a/ci/requirements-2.7-0.19.2.pip 
b/ci/requirements-2.7-0.19.2.pip index 22b763cf..4a90785a 100644 --- a/ci/requirements-2.7-0.19.2.pip +++ b/ci/requirements-2.7-0.19.2.pip @@ -5,4 +5,4 @@ google-auth-oauthlib PyCrypto python-gflags mock -google-cloud-bigquery>=0.25.0,<=0.26.0 +google-cloud-bigquery==0.26.0 diff --git a/ci/requirements-3.5-0.18.1.pip b/ci/requirements-3.5-0.18.1.pip index 19ca6227..9c94dbbb 100644 --- a/ci/requirements-3.5-0.18.1.pip +++ b/ci/requirements-3.5-0.18.1.pip @@ -3,4 +3,4 @@ google-auth==1.0.0 google-auth-httplib2==0.0.1 google-auth-oauthlib==0.0.1 mock -google-cloud-bigquery>=0.25.0,<=0.26.0 +google-cloud-bigquery==0.26.0 diff --git a/ci/requirements-3.6-0.20.1.conda b/ci/requirements-3.6-0.20.1.conda index 7ca942e4..3ee89b92 100644 --- a/ci/requirements-3.6-0.20.1.conda +++ b/ci/requirements-3.6-0.20.1.conda @@ -3,4 +3,4 @@ google-auth google-auth-httplib2 google-auth-oauthlib mock -google-cloud-bigquery>=0.25.0,<=0.26.0 +google-cloud-bigquery diff --git a/ci/requirements-3.6-MASTER.pip b/ci/requirements-3.6-MASTER.pip index 7ca942e4..96ab0aff 100644 --- a/ci/requirements-3.6-MASTER.pip +++ b/ci/requirements-3.6-MASTER.pip @@ -3,4 +3,4 @@ google-auth google-auth-httplib2 google-auth-oauthlib mock -google-cloud-bigquery>=0.25.0,<=0.26.0 +google-cloud-bigquery==0.26.0 diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 2ad48b45..bcf2b610 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -708,7 +708,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, - The easiest is to generate user credentials via + One method is to generate user credentials via `gcloud auth application-default login` and point to it using an environment variable: @@ -717,10 +717,11 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, You can also download a service account private key JSON file and pass the path to the file to the private_key paramater. - As a final alternative, you can also set auth_local_webserver to True, - which will trigger a pop-up through which a user can auth with their Google - account. This will generate a user credentials file, which is saved locally - and can be re-used in the future. + If default credentials are not located and a private key is not passed, + an auth flow will begin where a user can auth via a link or via a pop-up + through which a user can auth with their Google account. This will + generate a user credentials file, which is saved locally and can be re-used + in the future. Parameters ---------- @@ -799,7 +800,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, if dialect not in ('legacy', 'standard'): raise ValueError("'{0}' is not valid for dialect".format(dialect)) - if configuration and any(key in configuration for key in + if configuration and any(key in configuration for key in ["query", "copy", "load", "extract"]): raise ValueError("New API handles configuration settings differently") @@ -870,7 +871,7 @@ def async_query(): _wait_for_job(query_job) if verbose: print("Query done.") - if query_job._properties["statistics"]["query"].get("cacheHit", + if query_job._properties["statistics"]["query"].get("cacheHit", False): print("Cache hit.") elif ("statistics" in query_job._properties and @@ -903,9 +904,9 @@ def async_query(): print("Finished at %s." 
% datetime.now() .strftime('%Y-%m-%d %H:%M:%S')) - if return_type=='schema': + if return_type == 'schema': return query_results.schema - elif return_type=='list': + elif return_type == 'list': return rows columns = [field.name for field in query_results.schema] @@ -916,10 +917,10 @@ def async_query(): # Manual field type conversion. Inserted to handle tests # with only null rows, otherwise type conversion works automatically for field in query_results.schema: - if field.field_type=='TIMESTAMP': + if field.field_type == 'TIMESTAMP': if final_df[field.name].isnull().values.all(): final_df[field.name] = to_datetime(final_df[field.name]) - if field.field_type=='FLOAT': + if field.field_type == 'FLOAT': if final_df[field.name].isnull().values.all(): final_df[field.name] = to_numeric(final_df[field.name]) @@ -930,8 +931,7 @@ def async_query(): else: raise InvalidIndexColumn( 'Index column "{0}" does not exist in DataFrame.' - .format(index_col) - ) + .format(index_col)) # Change the order of columns in the DataFrame based on provided list if col_order: @@ -939,8 +939,7 @@ def async_query(): final_df = final_df[col_order] else: raise InvalidColumnOrder( - 'Column order does not match this DataFrame.' - ) + 'Column order does not match this DataFrame.') return final_df diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 1b11e3be..f75ef49c 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -608,19 +608,19 @@ def test_should_properly_handle_timestamp_unix_epoch(self): query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") AS unix_epoch' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - expected = DataFrame({'unix_epoch': - [np.datetime64('1970-01-01T00:00:00.000000Z')]}) + expected = DataFrame({'unix_epoch': + [np.datetime64('1970-01-01T00:00:00.000000Z')]}) tm.assert_frame_equal(df, to_datetime(expected.unix_epoch).dt - .tz_localize('UTC').to_frame()) + .tz_localize('UTC').to_frame()) def test_should_properly_handle_arbitrary_timestamp(self): query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - expected = DataFrame({'valid_timestamp': - [np.datetime64('2004-09-15T05:00:00.000000Z')]}) + expected = DataFrame({'valid_timestamp': + [np.datetime64('2004-09-15T05:00:00.000000Z')]}) tm.assert_frame_equal(df, to_datetime(expected.valid_timestamp).dt - .tz_localize('UTC').to_frame()) + .tz_localize('UTC').to_frame()) def test_should_properly_handle_null_timestamp(self): query = 'SELECT TIMESTAMP(NULL) AS null_timestamp' @@ -830,9 +830,9 @@ def test_query_with_parameters(self): df = gbq.read_gbq(sql_statement, project_id=_get_project_id(), configuration=config, query_parameters=(bigquery.ScalarQueryParameter( - 'param1', 'INT64', 1), + 'param1', 'INT64', 1), bigquery.ScalarQueryParameter( - 'param2', 'INT64', 2)), + 'param2', 'INT64', 2)), private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'valid_result': [3]})) @@ -861,9 +861,7 @@ def test_configuration_without_query(self): def test_timeout_configuration(self): sql_statement = 'SELECT 1' - config = { - "timeout_ms": 1 - } + config = {"timeout_ms": 1} # Test that QueryTimeout error raises with pytest.raises(gbq.QueryTimeout): gbq.read_gbq(sql_statement, project_id=_get_project_id(), diff --git a/requirements.txt b/requirements.txt index 731224d2..f7f2cd1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,5 @@ 
google-api-python-client google-auth google-auth-httplib2 google-auth-oauthlib -google-cloud-bigquery>=0.25.0,<=0.26.0 +google-cloud-bigquery==0.26.0 uuid diff --git a/setup.py b/setup.py index b6cdf68d..ba3e5e3e 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ def readme(): 'google-auth>=1.0.0', 'google-auth-httplib2>=0.0.1', 'google-auth-oauthlib>=0.0.1', - 'google-cloud-bigquery>=0.25.0,<=0.26.0', + 'google-cloud-bigquery==0.26.0', ] From fc6134ab7b4a5fcc3b5df2bf4f1fbe3ecffad28d Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Sat, 7 Oct 2017 16:29:25 -0400 Subject: [PATCH 20/42] Remove uuid from requirements --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f7f2cd1a..f49120c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,3 @@ google-auth google-auth-httplib2 google-auth-oauthlib google-cloud-bigquery==0.26.0 -uuid From 61db19190cdfe690298288cf90e0d62a5b73e95d Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Mon, 9 Oct 2017 09:58:14 -0400 Subject: [PATCH 21/42] Clean up/revert some of the documentation --- pandas_gbq/gbq.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index bcf2b610..66fb21a0 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -727,7 +727,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, ---------- query : str SQL-Like Query to return data values - project_id : str (optional) + project_id : str Google BigQuery Account project ID. index_col : str (optional) Name of result column to use for index in results DataFrame @@ -740,22 +740,27 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, verbose : boolean (default True) Verbose output private_key : str (optional) - Path to service account private key in JSON format. If none is - provided, will default to the GOOGLE_APPLICATION_CREDENTIALS - environment variable or another form of authentication (see above) - auth_local_webserver : boolean, default False (optional) + Service account private key in JSON format. Can be file path + or string contents. This is useful for remote server + authentication (eg. jupyter iPython notebook on remote host) + auth_local_webserver : boolean, default False Use the [local webserver flow] instead of the [console flow] when getting user credentials. A file named bigquery_credentials.dat will - be created in ~/.config/pandas_gbq/. You can also set - PANDAS_GBQ_CREDENTIALS_FILE environment variable so as to define a - specific path to store this credential (eg. /etc/keys/bigquery.dat). + be created in current dir. You can also set PANDAS_GBQ_CREDENTIALS_FILE + environment variable so as to define a specific path to store this + credential (eg. /etc/keys/bigquery.dat). + .. [local webserver flow] + http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server + .. [console flow] + http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console + .. versionadded:: 0.2.0 dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. 
For more information see `BigQuery SQL Reference ` - credentials: credentials object (default None) + credentials: credentials object, default None (optional) If generating credentials on your own, pass in. Otherwise, will attempt to generate automatically return_type: {'schema','list','df'}, default 'df' From 28c0ae72a03aa526a78acfb7635a4d7680f2af98 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 27 Oct 2017 13:24:21 -0400 Subject: [PATCH 22/42] Unpin in testing requirements and expand config error message --- ci/requirements-2.7-0.19.2.pip | 2 +- ci/requirements-3.6-MASTER.pip | 2 +- pandas_gbq/gbq.py | 15 ++++++++++++--- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/ci/requirements-2.7-0.19.2.pip b/ci/requirements-2.7-0.19.2.pip index 4a90785a..a31accd7 100644 --- a/ci/requirements-2.7-0.19.2.pip +++ b/ci/requirements-2.7-0.19.2.pip @@ -5,4 +5,4 @@ google-auth-oauthlib PyCrypto python-gflags mock -google-cloud-bigquery==0.26.0 +google-cloud-bigquery diff --git a/ci/requirements-3.6-MASTER.pip b/ci/requirements-3.6-MASTER.pip index 96ab0aff..3ee89b92 100644 --- a/ci/requirements-3.6-MASTER.pip +++ b/ci/requirements-3.6-MASTER.pip @@ -3,4 +3,4 @@ google-auth google-auth-httplib2 google-auth-oauthlib mock -google-cloud-bigquery==0.26.0 +google-cloud-bigquery diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 66fb21a0..65070bb2 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -12,7 +12,6 @@ from pandas.compat import bytes_to_str from google.cloud import bigquery - def _check_google_client_version(): try: @@ -789,7 +788,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, use_legacy_sql, dry_run, write_disposition, udf_resources, maximum_billing_tier, maximum_bytes_billed + google/cloud/bigquery/job.html?highlight=QueryJobConfig> timeout_ms: int (optional) If set or found in config, triggers a sync query that times out with no results if it can't be completed in the time desired @@ -802,12 +801,22 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, DataFrame representing results of query """ + _test_google_api_imports() + + if not project_id: + raise TypeError("Missing required parameter: project_id") if dialect not in ('legacy', 'standard'): raise ValueError("'{0}' is not valid for dialect".format(dialect)) if configuration and any(key in configuration for key in ["query", "copy", "load", "extract"]): - raise ValueError("New API handles configuration settings differently") + raise ValueError("The Google Cloud BigQuery API handles configuration " + "settings differently. There are now a discrete set of query " + "settings one can set by passing in a dictionary, e.g.: " + "`configuration={'maximum_billing_tier':2}`. 
See " + "http://google-cloud-python.readthedocs.io/en/latest/_modules/" + "google/cloud/bigquery/job.html?highlight=QueryJobConfig " + "for allowable paramaters.") def _wait_for_job(job): while True: From cd75cda5d427f19652012e29bf6e30b66c87febb Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 27 Oct 2017 13:26:56 -0400 Subject: [PATCH 23/42] Include 0.27 in setup.py install_requires --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ba3e5e3e..327c983d 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ def readme(): 'google-auth>=1.0.0', 'google-auth-httplib2>=0.0.1', 'google-auth-oauthlib>=0.0.1', - 'google-cloud-bigquery==0.26.0', + 'google-cloud-bigquery>=0.26.0,<0.28.0', ] From 62641775cc43c7bc786b303fd8a057da342ea625 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 27 Oct 2017 15:16:28 -0400 Subject: [PATCH 24/42] Initial run_query function for modularizing read_gbq --- pandas_gbq/gbq.py | 197 +++++++++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 88 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 65070bb2..a5b0c892 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -12,6 +12,7 @@ from pandas.compat import bytes_to_str from google.cloud import bigquery + def _check_google_client_version(): try: @@ -689,6 +690,95 @@ def sizeof_fmt(num, suffix='B'): return fmt % (num, 'Y', suffix) +def run_query(query, client, dialect, query_parameters, configuration, verbose, + async=True): + def _wait_for_job(job): + while True: + job.reload() # Refreshes the state via a GET request. + if job.state == 'DONE': + if job.error_result: + raise RuntimeError(job.errors) + return + time.sleep(1) + + def _set_common_query_settings(query_job): + if dialect == 'legacy': + query_job.use_legacy_sql = True + elif dialect == 'standard': + query_job.use_legacy_sql = False + + if configuration: + for setting, value in configuration.items(): + setattr(query_job, setting, value) + return query_job + + def sync_query(): + query_job = client.run_sync_query(query, + query_parameters=query_parameters) + query_job = _set_common_query_settings(query_job) + if verbose: + print("Query running...") + query_job.run() + if not query_job._properties.get("jobComplete", False): + raise QueryTimeout("Sync query timed out") + if verbose: + print("Query done.") + if query_job._properties.get("cacheHit", False): + print("Cache hit.") + else: + bytes_billed = int(query_job._properties + .get("totalBytesProcessed", 0)) + bytes_processed = int(query_job._properties + .get("totalBytesBilled", 0)) + print("Total bytes billed (processed): %s (%s)" % + (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) + print("\nRetrieving results...") + return query_job, None + + def async_query(): + query_job = client.run_async_query(str(uuid.uuid4()), + query, + query_parameters=query_parameters) + query_job = _set_common_query_settings(query_job) + query_job.begin() + try: + query_results = query_job.results().fetch_data() + except: + query_results = query_job.result().fetch_data() + if verbose: + print("Query running...") + _wait_for_job(query_job) + if verbose: + print("Query done.") + if query_job._properties["statistics"]["query"].get("cacheHit", + False): + print("Cache hit.") + elif ("statistics" in query_job._properties and + "query" in query_job._properties["statistics"]): + bytes_billed = int(query_job + ._properties["statistics"]["query"] + .get("totalBytesProcessed", 0)) + bytes_processed = int(query_job + 
._properties["statistics"]["query"] + .get("totalBytesBilled", 0)) + print("Total bytes billed (processed): %s (%s)" % + (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) + print("\nRetrieving results...") + return query_results, query_job + + if async: + query_results, query_job = async_query() + rows = list(query_results) + else: + query_results, query_job = sync_query() + rows = list(query_results.rows) + + columns = [field.name for field in query_results.schema] + schema = query_results.schema + + return query_results, query_job, rows, schema, columns + + def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, auth_local_webserver=False, dialect='legacy', credentials=None, @@ -818,15 +908,6 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, "google/cloud/bigquery/job.html?highlight=QueryJobConfig " "for allowable paramaters.") - def _wait_for_job(job): - while True: - job.reload() # Refreshes the state via a GET request. - if job.state == 'DONE': - if job.error_result: - raise RuntimeError(job.errors) - return - time.sleep(1) - if credentials is None: credentials = GbqConnector(project_id=project_id, reauth=reauth, @@ -834,84 +915,27 @@ def _wait_for_job(job): private_key=private_key).credentials client = bigquery.Client(project=project_id, credentials=credentials) - def _set_common_query_settings(query_job): - if dialect == 'legacy': - query_job.use_legacy_sql = True - elif dialect == 'standard': - query_job.use_legacy_sql = False - - if configuration: - for setting, value in configuration.items(): - setattr(query_job, setting, value) - return query_job - - def sync_query(): - query_job = client.run_sync_query(query, - query_parameters=query_parameters) - query_job = _set_common_query_settings(query_job) - if verbose: - print("Query running...") - if timeout_ms: - query_job.timeout_ms = timeout_ms - query_job.run() - if not query_job._properties.get("jobComplete", False): - raise QueryTimeout("Sync query timed out") - if verbose: - print("Query done.") - if query_job._properties.get("cacheHit", False): - print("Cache hit.") - else: - bytes_billed = int(query_job._properties - .get("totalBytesProcessed", 0)) - bytes_processed = int(query_job._properties - .get("totalBytesBilled", 0)) - print("Total bytes billed (processed): %s (%s)" % - (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) - print("\nRetrieving results...") - return query_job, None - - def async_query(): - query_job = client.run_async_query(str(uuid.uuid4()), - query, - query_parameters=query_parameters) - query_job = _set_common_query_settings(query_job) - query_job.begin() - try: - query_results = query_job.results().fetch_data() - except: - query_results = query_job.result().fetch_data() - if verbose: - print("Query running...") - _wait_for_job(query_job) - if verbose: - print("Query done.") - if query_job._properties["statistics"]["query"].get("cacheHit", - False): - print("Cache hit.") - elif ("statistics" in query_job._properties and - "query" in query_job._properties["statistics"]): - bytes_billed = int(query_job - ._properties["statistics"]["query"] - .get("totalBytesProcessed", 0)) - bytes_processed = int(query_job - ._properties["statistics"]["query"] - .get("totalBytesBilled", 0)) - print("Total bytes billed (processed): %s (%s)" % - (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) - print("\nRetrieving results...") - return query_results, query_job - - if (configuration and "timeout_ms" in configuration) or 
timeout_ms: - query_results, query_job = sync_query() - rows = list(query_results.rows) - total_rows = len(rows) + + if timeout_ms: + configuration['timeout_ms'] = timeout_ms + if (configuration and "timeout_ms" in configuration): + query_results, query_job, rows, schame, columns = run_query(query, + client, + dialect, + query_parameters, + configuration, + verbose, + async=False) else: - query_results, query_job = async_query() - rows = list(query_results) - total_rows = len(rows) + query_results, query_job, rows, schema, columns = run_query(query, + client, + dialect, + query_parameters, + configuration, + verbose) if verbose: - print("Got %s rows.") % total_rows + print("Got %s rows.") % len(rows) if query_job: print("\nTotal time taken %ss" % (datetime.utcnow() - query_job.created.replace(tzinfo=None)).seconds) @@ -923,10 +947,7 @@ def async_query(): elif return_type == 'list': return rows - columns = [field.name for field in query_results.schema] - data = rows - - final_df = DataFrame(data=data, columns=columns) + final_df = DataFrame(data=rows, columns=columns) # Manual field type conversion. Inserted to handle tests # with only null rows, otherwise type conversion works automatically From 208e39f4e4e8d28d3946da0b96057d778594ae2d Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 27 Oct 2017 16:28:05 -0400 Subject: [PATCH 25/42] Move schema handling and creating the df to separate functions, update check schema test --- pandas_gbq/gbq.py | 133 +++++++++++++++++------------------ pandas_gbq/tests/test_gbq.py | 14 ++-- 2 files changed, 71 insertions(+), 76 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index a5b0c892..e6d854fc 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -690,8 +690,8 @@ def sizeof_fmt(num, suffix='B'): return fmt % (num, 'Y', suffix) -def run_query(query, client, dialect, query_parameters, configuration, verbose, - async=True): +def run_query(query, client, dialect='legacy', query_parameters=(), + configuration=None, verbose=True, async=True): def _wait_for_job(job): while True: job.reload() # Refreshes the state via a GET request. @@ -766,6 +766,15 @@ def async_query(): print("\nRetrieving results...") return query_results, query_job + def get_columns_schema(query_results): + schema = [{"name":f.name, + "field_type":f.field_type, + "mode":f.mode, + "fields":f.fields, + "description":f.description} for f in query_results.schema] + columns = [field["name"] for field in schema] + return columns, schema + if async: query_results, query_job = async_query() rows = list(query_results) @@ -773,16 +782,23 @@ def async_query(): query_results, query_job = sync_query() rows = list(query_results.rows) - columns = [field.name for field in query_results.schema] - schema = query_results.schema + columns, schema = get_columns_schema(query_results) - return query_results, query_job, rows, schema, columns + if verbose: + print("Got %s rows.") % len(rows) + if query_job: + print("\nTotal time taken %ss" % (datetime.utcnow() - + query_job.created.replace(tzinfo=None)).seconds) + print("Finished at %s." 
% datetime.now() + .strftime('%Y-%m-%d %H:%M:%S')) + + return rows, columns, schema def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, auth_local_webserver=False, dialect='legacy', credentials=None, - return_type='df', query_parameters=(), configuration=None, + query_parameters=(), configuration=None, timeout_ms=None, **kwargs): r"""Load data from Google BigQuery using google-cloud-python @@ -852,13 +868,6 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, credentials: credentials object, default None (optional) If generating credentials on your own, pass in. Otherwise, will attempt to generate automatically - return_type: {'schema','list','df'}, default 'df' - schema returns an array of SchemaField objects, which you can access - `from pprint import pprint - [pprint(vars(field)) for field in schema]` - list returns a list of lists of the rows of the results; column names - are not included - df returns a dataframe by default query_parameters: tuple (optional) Can only be used in Standard SQL example: gbq.read_gbq("SELECT @param1 + @param2", query_parameters = (bigquery.ScalarQueryParameter( @@ -891,6 +900,39 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, DataFrame representing results of query """ + + def _create_df(rows, columns, schema, index_col, col_order): + df = DataFrame(data=rows, columns=columns) + + # Manual field type conversion. Inserted to handle tests + # with only null rows, otherwise type conversion works automatically + for field in schema: + if field["field_type"] == 'TIMESTAMP': + if df[field["name"]].isnull().values.all(): + df[field["name"]] = to_datetime(df[field["name"]]) + if field["field_type"] == 'FLOAT': + if df[field["name"]].isnull().values.all(): + df[field["name"]] = to_numeric(df[field["name"]]) + + # Reindex the DataFrame on the provided column + if index_col: + if index_col in df.columns: + df.set_index(index_col, inplace=True) + else: + raise InvalidIndexColumn( + 'Index column "{0}" does not exist in DataFrame.' + .format(index_col)) + + # Change the order of columns in the DataFrame based on provided list + if col_order: + if sorted(col_order) == sorted(df.columns): + df = df[col_order] + else: + raise InvalidColumnOrder( + 'Column order does not match this DataFrame.') + + return df + _test_google_api_imports() if not project_id: @@ -914,69 +956,22 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, auth_local_webserver=auth_local_webserver, private_key=private_key).credentials client = bigquery.Client(project=project_id, credentials=credentials) - if timeout_ms: configuration['timeout_ms'] = timeout_ms if (configuration and "timeout_ms" in configuration): - query_results, query_job, rows, schame, columns = run_query(query, - client, - dialect, - query_parameters, - configuration, - verbose, - async=False) + rows, columns, schema = run_query(query, client, dialect, + query_parameters, configuration, + verbose, + async=False) else: - query_results, query_job, rows, schema, columns = run_query(query, - client, - dialect, - query_parameters, - configuration, - verbose) - - if verbose: - print("Got %s rows.") % len(rows) - if query_job: - print("\nTotal time taken %ss" % (datetime.utcnow() - - query_job.created.replace(tzinfo=None)).seconds) - print("Finished at %s." 
% datetime.now() - .strftime('%Y-%m-%d %H:%M:%S')) + rows, columns, schema = run_query(query, client, dialect, + query_parameters, configuration, + verbose) - if return_type == 'schema': - return query_results.schema - elif return_type == 'list': - return rows - - final_df = DataFrame(data=rows, columns=columns) - - # Manual field type conversion. Inserted to handle tests - # with only null rows, otherwise type conversion works automatically - for field in query_results.schema: - if field.field_type == 'TIMESTAMP': - if final_df[field.name].isnull().values.all(): - final_df[field.name] = to_datetime(final_df[field.name]) - if field.field_type == 'FLOAT': - if final_df[field.name].isnull().values.all(): - final_df[field.name] = to_numeric(final_df[field.name]) - - # Reindex the DataFrame on the provided column - if index_col: - if index_col in final_df.columns: - final_df.set_index(index_col, inplace=True) - else: - raise InvalidIndexColumn( - 'Index column "{0}" does not exist in DataFrame.' - .format(index_col)) - - # Change the order of columns in the DataFrame based on provided list - if col_order: - if sorted(col_order) == sorted(final_df.columns): - final_df = final_df[col_order] - else: - raise InvalidColumnOrder( - 'Column order does not match this DataFrame.') + df = _create_df(rows, columns, schema, index_col, col_order) - return final_df + return df def to_gbq(dataframe, destination_table, project_id, chunksize=10000, diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index f75ef49c..9402f614 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -188,6 +188,8 @@ def setup_method(self, method): self.sut = gbq.GbqConnector( _get_project_id(), auth_local_webserver=True) + self.client = bigquery.Client(project=_get_project_id(), + credentials=self.sut.credentials) def test_should_be_able_to_make_a_connector(self): assert self.sut is not None, 'Could not create a GbqConnector' @@ -202,10 +204,7 @@ def test_should_be_able_to_get_a_bigquery_service(self): def test_should_be_able_to_get_schema_from_query(self): credentials = self.sut.credentials - schema = gbq.read_gbq('SELECT 1', - project_id=_get_project_id(), - credentials=credentials, - return_type='schema') + schema = gbq.run_query('SELECT 1', client=self.client) assert schema is not None def test_should_be_able_to_get_results_from_query(self): @@ -258,6 +257,9 @@ def setup_method(self, method): self.sut = gbq.GbqConnector(_get_project_id(), private_key=_get_private_key_path()) + credentials = self.sut.get_credentials() + self.client = bigquery.Client(project=_get_project_id(), + credentials=self.sut.credentials) def test_should_be_able_to_make_a_connector(self): assert self.sut is not None @@ -272,9 +274,7 @@ def test_should_be_able_to_get_a_bigquery_service(self): def test_should_be_able_to_get_schema_from_query(self): credentials = self.sut.credentials - schema = gbq.read_gbq('SELECT 1', project_id=_get_project_id(), - credentials=credentials, - return_type='schema') + schema = gbq.run_query('SELECT 1', client=self.client) assert schema is not None def test_should_be_able_to_get_results_from_query(self): From 2cd32c7f4c8965ea329f190da978f4cc23201eb7 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 27 Oct 2017 16:48:31 -0400 Subject: [PATCH 26/42] Linting More linting --- pandas_gbq/gbq.py | 39 ++++++++++++++++++------------------ pandas_gbq/tests/test_gbq.py | 6 +----- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 
e6d854fc..6ffe1789 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -507,7 +507,7 @@ def process_insert_errors(self, insert_errors): def load_data(self, dataframe, dataset_id, table_id, chunksize): try: from googleapiclient.errors import HttpError - except: + except ImportError: from apiclient.errors import HttpError job_id = uuid.uuid4().hex @@ -581,7 +581,7 @@ def schema(self, dataset_id, table_id): try: from googleapiclient.errors import HttpError - except: + except ImportError: from apiclient.errors import HttpError try: @@ -690,7 +690,7 @@ def sizeof_fmt(num, suffix='B'): return fmt % (num, 'Y', suffix) -def run_query(query, client, dialect='legacy', query_parameters=(), +def run_query(query, client, dialect='legacy', query_parameters=(), configuration=None, verbose=True, async=True): def _wait_for_job(job): while True: @@ -743,7 +743,7 @@ def async_query(): query_job.begin() try: query_results = query_job.results().fetch_data() - except: + except AttributeError: query_results = query_job.result().fetch_data() if verbose: print("Query running...") @@ -767,11 +767,11 @@ def async_query(): return query_results, query_job def get_columns_schema(query_results): - schema = [{"name":f.name, - "field_type":f.field_type, - "mode":f.mode, - "fields":f.fields, - "description":f.description} for f in query_results.schema] + schema = [{"name": f.name, + "field_type": f.field_type, + "mode": f.mode, + "fields": f.fields, + "description": f.description} for f in query_results.schema] columns = [field["name"] for field in schema] return columns, schema @@ -943,12 +943,13 @@ def _create_df(rows, columns, schema, index_col, col_order): if configuration and any(key in configuration for key in ["query", "copy", "load", "extract"]): raise ValueError("The Google Cloud BigQuery API handles configuration " - "settings differently. There are now a discrete set of query " - "settings one can set by passing in a dictionary, e.g.: " - "`configuration={'maximum_billing_tier':2}`. See " - "http://google-cloud-python.readthedocs.io/en/latest/_modules/" - "google/cloud/bigquery/job.html?highlight=QueryJobConfig " - "for allowable paramaters.") + "settings differently. There are now a discrete set " + "of query settings one can set by passing in a " + "dictionary, e.g.: `configuration=" + "{'maximum_billing_tier':2}`. 
See http://google-cloud" + "-python.readthedocs.io/en/latest/_modules/google/" + "cloud/bigquery/job.html?highlight=QueryJobConfig " + "for allowable paramaters.") if credentials is None: credentials = GbqConnector(project_id=project_id, @@ -956,11 +957,11 @@ def _create_df(rows, columns, schema, index_col, col_order): auth_local_webserver=auth_local_webserver, private_key=private_key).credentials client = bigquery.Client(project=project_id, credentials=credentials) - + if timeout_ms: configuration['timeout_ms'] = timeout_ms if (configuration and "timeout_ms" in configuration): - rows, columns, schema = run_query(query, client, dialect, + rows, columns, schema = run_query(query, client, dialect, query_parameters, configuration, verbose, async=False) @@ -1121,7 +1122,7 @@ def __init__(self, project_id, dataset_id, reauth=False, verbose=False, private_key=None): try: from googleapiclient.errors import HttpError - except: + except ImportError: from apiclient.errors import HttpError self.http_error = HttpError self.dataset_id = dataset_id @@ -1220,7 +1221,7 @@ def __init__(self, project_id, reauth=False, verbose=False, private_key=None): try: from googleapiclient.errors import HttpError - except: + except ImportError: from apiclient.errors import HttpError self.http_error = HttpError super(_Dataset, self).__init__(project_id, reauth, verbose, diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 9402f614..4af30608 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -203,7 +203,6 @@ def test_should_be_able_to_get_a_bigquery_service(self): assert bigquery_service is not None def test_should_be_able_to_get_schema_from_query(self): - credentials = self.sut.credentials schema = gbq.run_query('SELECT 1', client=self.client) assert schema is not None @@ -257,7 +256,6 @@ def setup_method(self, method): self.sut = gbq.GbqConnector(_get_project_id(), private_key=_get_private_key_path()) - credentials = self.sut.get_credentials() self.client = bigquery.Client(project=_get_project_id(), credentials=self.sut.credentials) @@ -273,14 +271,12 @@ def test_should_be_able_to_get_a_bigquery_service(self): assert bigquery_service is not None def test_should_be_able_to_get_schema_from_query(self): - credentials = self.sut.credentials schema = gbq.run_query('SELECT 1', client=self.client) assert schema is not None def test_should_be_able_to_get_results_from_query(self): - credentials = self.sut.credentials results = gbq.read_gbq('SELECT 1', project_id=_get_project_id(), - credentials=credentials) + credentials=self.sut.credentials) assert results is not None From 446819729e551c5b030e80bbe5e131d00134ebfd Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Fri, 27 Oct 2017 17:47:35 -0400 Subject: [PATCH 27/42] Update schema test --- pandas_gbq/tests/test_gbq.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 4af30608..f94d7b3d 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -203,7 +203,8 @@ def test_should_be_able_to_get_a_bigquery_service(self): assert bigquery_service is not None def test_should_be_able_to_get_schema_from_query(self): - schema = gbq.run_query('SELECT 1', client=self.client) + result = gbq.run_query('SELECT 1', client=self.client) + rows, columns, schema = result assert schema is not None def test_should_be_able_to_get_results_from_query(self): @@ -271,7 +272,8 @@ def test_should_be_able_to_get_a_bigquery_service(self): 
assert bigquery_service is not None def test_should_be_able_to_get_schema_from_query(self): - schema = gbq.run_query('SELECT 1', client=self.client) + result = gbq.run_query('SELECT 1', client=self.client) + rows, columns, schema = result assert schema is not None def test_should_be_able_to_get_results_from_query(self): @@ -290,6 +292,8 @@ def setup_method(self, method): self.sut = gbq.GbqConnector(_get_project_id(), private_key=_get_private_key_contents()) + self.client = bigquery.Client(project=_get_project_id(), + credentials=self.sut.credentials) def test_should_be_able_to_make_a_connector(self): assert self.sut is not None @@ -303,18 +307,14 @@ def test_should_be_able_to_get_a_bigquery_service(self): assert bigquery_service is not None def test_should_be_able_to_get_schema_from_query(self): - credentials = self.sut.credentials - schema = gbq.read_gbq('SELECT 1', - project_id=_get_project_id(), - credentials=credentials, - get_schema=True) + result = gbq.run_query('SELECT 1', client=self.client) + rows, columns, schema = result assert schema is not None def test_should_be_able_to_get_results_from_query(self): - credentials = self.sut.credentials results = gbq.read_gbq('SELECT 1', project_id=_get_project_id(), - credentials=credentials) + credentials=self.sut.credentials) assert results is not None From 6a6fc2dd1b136468b53647bdbd7bc829c14b564d Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Sat, 28 Oct 2017 01:12:03 -0400 Subject: [PATCH 28/42] Fix read_gbq docstring for Sphinx Whitespace --- pandas_gbq/gbq.py | 56 ++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 6ffe1789..073a58f1 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -807,17 +807,14 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, The Google Cloud library is used. Documentation is available `here - ` + `__ - Authentication via Google Cloud can be performed a number of ways, see: - + Authentication via Google Cloud can be performed a number of ways. One method is to generate user credentials via - `gcloud auth application-default login` and point to it using an + ``gcloud auth application-default login`` and point to it using an environment variable: - `$ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"` + ``$ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"`` You can also download a service account private key JSON file and pass the path to the file to the private_key paramater. @@ -854,45 +851,60 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, be created in current dir. You can also set PANDAS_GBQ_CREDENTIALS_FILE environment variable so as to define a specific path to store this credential (eg. /etc/keys/bigquery.dat). + .. [local webserver flow] http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server .. [console flow] http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console .. versionadded:: 0.2.0 + dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. 'standard' : Use BigQuery's standard SQL (beta), which is compliant with the SQL 2011 standard. For more information see `BigQuery SQL Reference - ` + `__ credentials: credentials object, default None (optional) If generating credentials on your own, pass in. 
Otherwise, will attempt to generate automatically + + .. versionadded:: 0.3.0 + query_parameters: tuple (optional) Can only be used in Standard SQL - example: gbq.read_gbq("SELECT @param1 + @param2", - query_parameters = (bigquery.ScalarQueryParameter( - 'param1', 'INT64', 1), - bigquery.ScalarQueryParameter( - 'param2', 'INT64', 2))) - + example. `More info + `__:: + + gbq.read_gbq("SELECT @param1 + @param2", + query_parameters = (bigquery.ScalarQueryParameter( + 'param1', 'INT64', 1), + bigquery.ScalarQueryParameter( + 'param2', 'INT64', 2))) + + .. versionadded:: 0.3.0 + configuration : dict (optional) - Because of current limitations only some configuration settings are - currently implemented. You can pass them along like in the following: + Due to the [current implementation in Google Cloud Python] only some + configuration settings are able to be set. You can pass them along like + in the following: `read_gbq(q,configuration={'allow_large_results':True, 'maximum_billing_tier':2})` - Example allowable settings: + [Example allowable settings]: allow_large_results, create_disposition, default_dataset, destination, flatten_results, priority, use_query_cache, use_legacy_sql, dry_run, write_disposition, udf_resources, maximum_billing_tier, maximum_bytes_billed - + + .. [current implementation in Google Cloud Python] + https://github.com/GoogleCloudPlatform/google-cloud-python/issues/2765 + .. [Example allowable settings] + http://google-cloud-python.readthedocs.io/en/latest/_modules/google/cloud/bigquery/job.html?highlight=QueryJobConfig + .. versionadded:: 0.3.0 + timeout_ms: int (optional) If set or found in config, triggers a sync query that times out with no results if it can't be completed in the time desired - + + .. versionadded:: 0.3.0 Returns ------- From 9ab0de39d5e3c11ff3f864469f6aedc70818e4cd Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Mon, 30 Oct 2017 18:24:54 -0400 Subject: [PATCH 29/42] Remove timeout_ms paramater from read_gbq, it should be set in configuration --- pandas_gbq/gbq.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 073a58f1..68f4095e 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -798,8 +798,7 @@ def get_columns_schema(query_results): def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, auth_local_webserver=False, dialect='legacy', credentials=None, - query_parameters=(), configuration=None, - timeout_ms=None, **kwargs): + query_parameters=(), configuration=None, **kwargs): r"""Load data from Google BigQuery using google-cloud-python The main method a user calls to execute a Query in Google BigQuery @@ -900,12 +899,6 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, http://google-cloud-python.readthedocs.io/en/latest/_modules/google/cloud/bigquery/job.html?highlight=QueryJobConfig .. versionadded:: 0.3.0 - timeout_ms: int (optional) If set or found in config, triggers a sync query - that times out with no results if it can't be completed in the time - desired - - .. 
versionadded:: 0.3.0 - Returns ------- df: DataFrame @@ -970,8 +963,6 @@ def _create_df(rows, columns, schema, index_col, col_order): private_key=private_key).credentials client = bigquery.Client(project=project_id, credentials=credentials) - if timeout_ms: - configuration['timeout_ms'] = timeout_ms if (configuration and "timeout_ms" in configuration): rows, columns, schema = run_query(query, client, dialect, query_parameters, configuration, From e203ba4665c323c66982757d0d582cbc1c8b199d Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Mon, 30 Oct 2017 18:57:34 -0400 Subject: [PATCH 30/42] Add more documentation --- pandas_gbq/gbq.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 68f4095e..8c3c6b1c 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -692,6 +692,29 @@ def sizeof_fmt(num, suffix='B'): def run_query(query, client, dialect='legacy', query_parameters=(), configuration=None, verbose=True, async=True): + """Execute a query job + + Parameters + ---------- + query, dialect, query_paramaters, configuration, verbose : see read_gbq() + client : bigQuery Client object + Client with the specified project_id and credentials used to run the + query + async: bool + Whether a synchronous or asynchronous query should be run. To be + deprecated in future versions; synchronous queries are used as a + workaround to implement timeouts, and will be removed in a + future update once Google Cloud Python resolves the issue. + + Returns + ------- + Tuple + rows : list of lists + columns: list of strings + schema: dictionary + Has the following keys: name, field_type, mode, fields, description + """ + def _wait_for_job(job): while True: job.reload() # Refreshes the state via a GET request. @@ -775,6 +798,7 @@ def get_columns_schema(query_results): columns = [field["name"] for field in schema] return columns, schema + # sync_query code to be removed in future if async: query_results, query_job = async_query() rows = list(query_results) @@ -870,8 +894,9 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, .. versionadded:: 0.3.0 query_parameters: tuple (optional) Can only be used in Standard SQL - example. `More info - `__:: + `More info + `__ + Example:: gbq.read_gbq("SELECT @param1 + @param2", query_parameters = (bigquery.ScalarQueryParameter( @@ -963,6 +988,9 @@ def _create_df(rows, columns, schema, index_col, col_order): private_key=private_key).credentials client = bigquery.Client(project=project_id, credentials=credentials) + # Temporary workaround in order to perform timeouts on queries. + # Once Google Cloud Python resolves, differentiation between sync and async + # code will be removed. 
if (configuration and "timeout_ms" in configuration): rows, columns, schema = run_query(query, client, dialect, query_parameters, configuration, From 92f39434daab0da8276a64d0293a29d67e4a2ce6 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Wed, 15 Nov 2017 14:07:57 -0500 Subject: [PATCH 31/42] Moved read_query to GbqConnector, update credentials and client generation to GbqConnector, and remove wait_for_job --- pandas_gbq/gbq.py | 262 ++++++++++++++++------------------- pandas_gbq/tests/test_gbq.py | 22 +-- 2 files changed, 129 insertions(+), 155 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 8c3c6b1c..ed339d75 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -10,7 +10,6 @@ from distutils.version import StrictVersion from pandas import compat, DataFrame, to_datetime, to_numeric from pandas.compat import bytes_to_str -from google.cloud import bigquery def _check_google_client_version(): @@ -201,6 +200,7 @@ class GbqConnector(object): def __init__(self, project_id, reauth=False, verbose=False, private_key=None, auth_local_webserver=False, dialect='legacy'): + from google.cloud import bigquery self.project_id = project_id self.reauth = reauth self.verbose = verbose @@ -210,6 +210,8 @@ def __init__(self, project_id, reauth=False, verbose=False, self.credentials_path = _get_credentials_file() self.credentials = self.get_credentials() self.service = self.get_service() + self.client = bigquery.Client(project=project_id, + credentials=self.credentials) # BQ Queries costs $5 per TB. First 1 TB per month is free # see here for more: https://cloud.google.com/bigquery/pricing @@ -504,6 +506,119 @@ def process_insert_errors(self, insert_errors): raise StreamingInsertError + def run_query(self, query, dialect='legacy', query_parameters=(), + configuration=None, verbose=True, async=True): + """Execute a query job + + Parameters + ---------- + query, dialect, query_paramaters, configuration, verbose : see read_gbq() + async: bool + Whether a synchronous or asynchronous query should be run. To be + deprecated in future versions; synchronous queries are used as a + workaround to implement timeouts, and will be removed in a + future update once Google Cloud Python resolves the issue. 
+ + Returns + ------- + Tuple + rows : list of lists + columns: list of strings + schema: dictionary + Has the following keys: name, field_type, mode, fields, description + """ + + def _set_common_query_settings(query_job): + if dialect == 'legacy': + query_job.use_legacy_sql = True + elif dialect == 'standard': + query_job.use_legacy_sql = False + + if configuration: + for setting, value in configuration.items(): + setattr(query_job, setting, value) + return query_job + + def sync_query(): + query_job = self.client.run_sync_query(query, + query_parameters=query_parameters) + query_job = _set_common_query_settings(query_job) + if verbose: + print("Query running...") + query_job.run() + if not query_job._properties.get("jobComplete", False): + raise QueryTimeout("Sync query timed out") + if verbose: + print("Query done.") + if query_job._properties.get("cacheHit", False): + print("Cache hit.") + else: + bytes_billed = int(query_job._properties + .get("totalBytesProcessed", 0)) + bytes_processed = int(query_job._properties + .get("totalBytesBilled", 0)) + print("Total bytes billed (processed): %s (%s)" % + (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) + print("\nRetrieving results...") + return query_job, None + + def async_query(): + query_job = self.client.run_async_query(str(uuid.uuid4()), + query, + query_parameters=query_parameters) + query_job = _set_common_query_settings(query_job) + query_job.begin() + try: + query_results = query_job.results().fetch_data() + except AttributeError: + query_results = query_job.result().fetch_data() + if verbose: + print("Query done.") + if query_job._properties["statistics"]["query"].get("cacheHit", + False): + print("Cache hit.") + elif ("statistics" in query_job._properties and + "query" in query_job._properties["statistics"]): + bytes_billed = int(query_job + ._properties["statistics"]["query"] + .get("totalBytesProcessed", 0)) + bytes_processed = int(query_job + ._properties["statistics"]["query"] + .get("totalBytesBilled", 0)) + print("Total bytes billed (processed): %s (%s)" % + (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) + print("\nRetrieving results...") + return query_results, query_job + + def get_columns_schema(query_results): + schema = [{"name": f.name, + "field_type": f.field_type, + "mode": f.mode, + "fields": f.fields, + "description": f.description} for f in query_results.schema] + columns = [field["name"] for field in schema] + return columns, schema + + # sync_query code to be removed in future + if async: + query_results, query_job = async_query() + rows = list(query_results) + else: + query_results, query_job = sync_query() + rows = list(query_results.rows) + + columns, schema = get_columns_schema(query_results) + + if verbose: + print("Got %s rows.") % len(rows) + if query_job: + print("\nTotal time taken %ss" % (datetime.utcnow() - + query_job.created.replace(tzinfo=None)).seconds) + print("Finished at %s." 
% datetime.now() + .strftime('%Y-%m-%d %H:%M:%S')) + + return rows, columns, schema + def load_data(self, dataframe, dataset_id, table_id, chunksize): try: from googleapiclient.errors import HttpError @@ -690,138 +805,9 @@ def sizeof_fmt(num, suffix='B'): return fmt % (num, 'Y', suffix) -def run_query(query, client, dialect='legacy', query_parameters=(), - configuration=None, verbose=True, async=True): - """Execute a query job - - Parameters - ---------- - query, dialect, query_paramaters, configuration, verbose : see read_gbq() - client : bigQuery Client object - Client with the specified project_id and credentials used to run the - query - async: bool - Whether a synchronous or asynchronous query should be run. To be - deprecated in future versions; synchronous queries are used as a - workaround to implement timeouts, and will be removed in a - future update once Google Cloud Python resolves the issue. - - Returns - ------- - Tuple - rows : list of lists - columns: list of strings - schema: dictionary - Has the following keys: name, field_type, mode, fields, description - """ - - def _wait_for_job(job): - while True: - job.reload() # Refreshes the state via a GET request. - if job.state == 'DONE': - if job.error_result: - raise RuntimeError(job.errors) - return - time.sleep(1) - - def _set_common_query_settings(query_job): - if dialect == 'legacy': - query_job.use_legacy_sql = True - elif dialect == 'standard': - query_job.use_legacy_sql = False - - if configuration: - for setting, value in configuration.items(): - setattr(query_job, setting, value) - return query_job - - def sync_query(): - query_job = client.run_sync_query(query, - query_parameters=query_parameters) - query_job = _set_common_query_settings(query_job) - if verbose: - print("Query running...") - query_job.run() - if not query_job._properties.get("jobComplete", False): - raise QueryTimeout("Sync query timed out") - if verbose: - print("Query done.") - if query_job._properties.get("cacheHit", False): - print("Cache hit.") - else: - bytes_billed = int(query_job._properties - .get("totalBytesProcessed", 0)) - bytes_processed = int(query_job._properties - .get("totalBytesBilled", 0)) - print("Total bytes billed (processed): %s (%s)" % - (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) - print("\nRetrieving results...") - return query_job, None - - def async_query(): - query_job = client.run_async_query(str(uuid.uuid4()), - query, - query_parameters=query_parameters) - query_job = _set_common_query_settings(query_job) - query_job.begin() - try: - query_results = query_job.results().fetch_data() - except AttributeError: - query_results = query_job.result().fetch_data() - if verbose: - print("Query running...") - _wait_for_job(query_job) - if verbose: - print("Query done.") - if query_job._properties["statistics"]["query"].get("cacheHit", - False): - print("Cache hit.") - elif ("statistics" in query_job._properties and - "query" in query_job._properties["statistics"]): - bytes_billed = int(query_job - ._properties["statistics"]["query"] - .get("totalBytesProcessed", 0)) - bytes_processed = int(query_job - ._properties["statistics"]["query"] - .get("totalBytesBilled", 0)) - print("Total bytes billed (processed): %s (%s)" % - (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) - print("\nRetrieving results...") - return query_results, query_job - - def get_columns_schema(query_results): - schema = [{"name": f.name, - "field_type": f.field_type, - "mode": f.mode, - "fields": f.fields, - "description": f.description} 
for f in query_results.schema] - columns = [field["name"] for field in schema] - return columns, schema - - # sync_query code to be removed in future - if async: - query_results, query_job = async_query() - rows = list(query_results) - else: - query_results, query_job = sync_query() - rows = list(query_results.rows) - - columns, schema = get_columns_schema(query_results) - - if verbose: - print("Got %s rows.") % len(rows) - if query_job: - print("\nTotal time taken %ss" % (datetime.utcnow() - - query_job.created.replace(tzinfo=None)).seconds) - print("Finished at %s." % datetime.now() - .strftime('%Y-%m-%d %H:%M:%S')) - - return rows, columns, schema - - def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, - auth_local_webserver=False, dialect='legacy', credentials=None, + auth_local_webserver=False, dialect='legacy', query_parameters=(), configuration=None, **kwargs): r"""Load data from Google BigQuery using google-cloud-python @@ -981,23 +967,21 @@ def _create_df(rows, columns, schema, index_col, col_order): "cloud/bigquery/job.html?highlight=QueryJobConfig " "for allowable paramaters.") - if credentials is None: - credentials = GbqConnector(project_id=project_id, - reauth=reauth, - auth_local_webserver=auth_local_webserver, - private_key=private_key).credentials - client = bigquery.Client(project=project_id, credentials=credentials) + connector = GbqConnector(project_id=project_id, + reauth=reauth, + auth_local_webserver=auth_local_webserver, + private_key=private_key) # Temporary workaround in order to perform timeouts on queries. # Once Google Cloud Python resolves, differentiation between sync and async # code will be removed. if (configuration and "timeout_ms" in configuration): - rows, columns, schema = run_query(query, client, dialect, + rows, columns, schema = connector.run_query(query, dialect, query_parameters, configuration, verbose, async=False) else: - rows, columns, schema = run_query(query, client, dialect, + rows, columns, schema = connector.run_query(query, dialect, query_parameters, configuration, verbose) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index f94d7b3d..e265313f 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -188,8 +188,6 @@ def setup_method(self, method): self.sut = gbq.GbqConnector( _get_project_id(), auth_local_webserver=True) - self.client = bigquery.Client(project=_get_project_id(), - credentials=self.sut.credentials) def test_should_be_able_to_make_a_connector(self): assert self.sut is not None, 'Could not create a GbqConnector' @@ -203,15 +201,13 @@ def test_should_be_able_to_get_a_bigquery_service(self): assert bigquery_service is not None def test_should_be_able_to_get_schema_from_query(self): - result = gbq.run_query('SELECT 1', client=self.client) + result = self.sut.run_query('SELECT 1') rows, columns, schema = result assert schema is not None def test_should_be_able_to_get_results_from_query(self): - credentials = self.sut.credentials results = gbq.read_gbq('SELECT 1', - project_id=_get_project_id(), - credentials=credentials) + project_id=_get_project_id()) assert results is not None def test_get_application_default_credentials_does_not_throw_error(self): @@ -257,8 +253,6 @@ def setup_method(self, method): self.sut = gbq.GbqConnector(_get_project_id(), private_key=_get_private_key_path()) - self.client = bigquery.Client(project=_get_project_id(), - credentials=self.sut.credentials) def 
test_should_be_able_to_make_a_connector(self): assert self.sut is not None @@ -272,13 +266,12 @@ def test_should_be_able_to_get_a_bigquery_service(self): assert bigquery_service is not None def test_should_be_able_to_get_schema_from_query(self): - result = gbq.run_query('SELECT 1', client=self.client) + result = self.sut.run_query('SELECT 1') rows, columns, schema = result assert schema is not None def test_should_be_able_to_get_results_from_query(self): - results = gbq.read_gbq('SELECT 1', project_id=_get_project_id(), - credentials=self.sut.credentials) + results = gbq.read_gbq('SELECT 1', project_id=_get_project_id()) assert results is not None @@ -292,8 +285,6 @@ def setup_method(self, method): self.sut = gbq.GbqConnector(_get_project_id(), private_key=_get_private_key_contents()) - self.client = bigquery.Client(project=_get_project_id(), - credentials=self.sut.credentials) def test_should_be_able_to_make_a_connector(self): assert self.sut is not None @@ -307,14 +298,13 @@ def test_should_be_able_to_get_a_bigquery_service(self): assert bigquery_service is not None def test_should_be_able_to_get_schema_from_query(self): - result = gbq.run_query('SELECT 1', client=self.client) + result = self.sut.run_query('SELECT 1') rows, columns, schema = result assert schema is not None def test_should_be_able_to_get_results_from_query(self): results = gbq.read_gbq('SELECT 1', - project_id=_get_project_id(), - credentials=self.sut.credentials) + project_id=_get_project_id()) assert results is not None From 3cc153a9786a0161cd758efef89ca8ce07289cb4 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Wed, 15 Nov 2017 14:16:41 -0500 Subject: [PATCH 32/42] Move sizeof_fmt back into GbqConnector Linting More linting --- pandas_gbq/gbq.py | 61 +++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index ed339d75..6d6b670f 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -3,7 +3,6 @@ import json from time import sleep import uuid -import time import sys import os @@ -210,7 +209,7 @@ def __init__(self, project_id, reauth=False, verbose=False, self.credentials_path = _get_credentials_file() self.credentials = self.get_credentials() self.service = self.get_service() - self.client = bigquery.Client(project=project_id, + self.client = bigquery.Client(project=project_id, credentials=self.credentials) # BQ Queries costs $5 per TB. First 1 TB per month is free @@ -454,6 +453,16 @@ def _print(self, msg, end='\n'): sys.stdout.write(msg + end) sys.stdout.flush() + # http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size + @staticmethod + def sizeof_fmt(num, suffix='B'): + fmt = "%3.1f %s%s" + for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: + if abs(num) < 1024.0: + return fmt % (num, unit, suffix) + num /= 1024.0 + return fmt % (num, 'Y', suffix) + def get_service(self): import httplib2 from google_auth_httplib2 import AuthorizedHttp @@ -512,7 +521,7 @@ def run_query(self, query, dialect='legacy', query_parameters=(), Parameters ---------- - query, dialect, query_paramaters, configuration, verbose : see read_gbq() + query, dialect, query_paramaters, configuration, verbose: see read_gbq async: bool Whether a synchronous or asynchronous query should be run. 
To be deprecated in future versions; synchronous queries are used as a @@ -525,7 +534,7 @@ def run_query(self, query, dialect='legacy', query_parameters=(), rows : list of lists columns: list of strings schema: dictionary - Has the following keys: name, field_type, mode, fields, description + Has keys: name, field_type, mode, fields, description """ def _set_common_query_settings(query_job): @@ -541,7 +550,7 @@ def _set_common_query_settings(query_job): def sync_query(): query_job = self.client.run_sync_query(query, - query_parameters=query_parameters) + query_parameters=query_parameters) query_job = _set_common_query_settings(query_job) if verbose: print("Query running...") @@ -558,15 +567,18 @@ def sync_query(): bytes_processed = int(query_job._properties .get("totalBytesBilled", 0)) print("Total bytes billed (processed): %s (%s)" % - (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) + (self.sizeof_fmt(bytes_billed), + self.sizeof_fmt(bytes_processed))) print("\nRetrieving results...") return query_job, None def async_query(): query_job = self.client.run_async_query(str(uuid.uuid4()), - query, - query_parameters=query_parameters) + query, + query_parameters=query_parameters) query_job = _set_common_query_settings(query_job) + if verbose: + print("Query running...") query_job.begin() try: query_results = query_job.results().fetch_data() @@ -586,7 +598,8 @@ def async_query(): ._properties["statistics"]["query"] .get("totalBytesBilled", 0)) print("Total bytes billed (processed): %s (%s)" % - (sizeof_fmt(bytes_billed), sizeof_fmt(bytes_processed))) + (self.sizeof_fmt(bytes_billed), + self.sizeof_fmt(bytes_processed))) print("\nRetrieving results...") return query_results, query_job @@ -595,7 +608,8 @@ def get_columns_schema(query_results): "field_type": f.field_type, "mode": f.mode, "fields": f.fields, - "description": f.description} for f in query_results.schema] + "description": f.description} + for f in query_results.schema] columns = [field["name"] for field in schema] return columns, schema @@ -796,15 +810,6 @@ def _get_credentials_file(): 'PANDAS_GBQ_CREDENTIALS_FILE') -def sizeof_fmt(num, suffix='B'): - fmt = "%3.1f %s%s" - for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: - if abs(num) < 1024.0: - return fmt % (num, unit, suffix) - num /= 1024.0 - return fmt % (num, 'Y', suffix) - - def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, auth_local_webserver=False, dialect='legacy', @@ -976,14 +981,18 @@ def _create_df(rows, columns, schema, index_col, col_order): # Once Google Cloud Python resolves, differentiation between sync and async # code will be removed. 
if (configuration and "timeout_ms" in configuration): - rows, columns, schema = connector.run_query(query, dialect, - query_parameters, configuration, - verbose, - async=False) + rows, columns, schema = connector.run_query(query, + dialect, + query_parameters, + configuration, + verbose, + async=False) else: - rows, columns, schema = connector.run_query(query, dialect, - query_parameters, configuration, - verbose) + rows, columns, schema = connector.run_query(query, + dialect, + query_parameters, + configuration, + verbose) df = _create_df(rows, columns, schema, index_col, col_order) From 91f900e44e297cf16f2ecde6aaafb688cd2270ea Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Wed, 15 Nov 2017 17:47:36 -0500 Subject: [PATCH 33/42] Fix sizeof_fmt test and move create_df to top-level function Final linting --- pandas_gbq/gbq.py | 75 ++++++++++++++++++------------------ pandas_gbq/tests/test_gbq.py | 27 ++++++------- 2 files changed, 52 insertions(+), 50 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 6d6b670f..321f91ed 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -549,8 +549,8 @@ def _set_common_query_settings(query_job): return query_job def sync_query(): - query_job = self.client.run_sync_query(query, - query_parameters=query_parameters) + query_job = self.client.run_sync_query( + query, query_parameters=query_parameters) query_job = _set_common_query_settings(query_job) if verbose: print("Query running...") @@ -573,9 +573,10 @@ def sync_query(): return query_job, None def async_query(): - query_job = self.client.run_async_query(str(uuid.uuid4()), - query, - query_parameters=query_parameters) + query_job = self.client.run_async_query( + str(uuid.uuid4()), + query, + query_parameters=query_parameters) query_job = _set_common_query_settings(query_job) if verbose: print("Query running...") @@ -810,6 +811,38 @@ def _get_credentials_file(): 'PANDAS_GBQ_CREDENTIALS_FILE') +def _create_df(rows, columns, schema, index_col, col_order): + df = DataFrame(data=rows, columns=columns) + + # Manual field type conversion. Inserted to handle tests + # with only null rows, otherwise type conversion works automatically + for field in schema: + if field["field_type"] == 'TIMESTAMP': + if df[field["name"]].isnull().values.all(): + df[field["name"]] = to_datetime(df[field["name"]]) + if field["field_type"] == 'FLOAT': + if df[field["name"]].isnull().values.all(): + df[field["name"]] = to_numeric(df[field["name"]]) + + # Reindex the DataFrame on the provided column + if index_col: + if index_col in df.columns: + df.set_index(index_col, inplace=True) + else: + raise InvalidIndexColumn( + 'Index column "{0}" does not exist in DataFrame.' + .format(index_col)) + + # Change the order of columns in the DataFrame based on provided list + if col_order: + if sorted(col_order) == sorted(df.columns): + df = df[col_order] + else: + raise InvalidColumnOrder( + 'Column order does not match this DataFrame.') + return df + + def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, auth_local_webserver=False, dialect='legacy', @@ -922,38 +955,6 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, """ - def _create_df(rows, columns, schema, index_col, col_order): - df = DataFrame(data=rows, columns=columns) - - # Manual field type conversion. 
Inserted to handle tests - # with only null rows, otherwise type conversion works automatically - for field in schema: - if field["field_type"] == 'TIMESTAMP': - if df[field["name"]].isnull().values.all(): - df[field["name"]] = to_datetime(df[field["name"]]) - if field["field_type"] == 'FLOAT': - if df[field["name"]].isnull().values.all(): - df[field["name"]] = to_numeric(df[field["name"]]) - - # Reindex the DataFrame on the provided column - if index_col: - if index_col in df.columns: - df.set_index(index_col, inplace=True) - else: - raise InvalidIndexColumn( - 'Index column "{0}" does not exist in DataFrame.' - .format(index_col)) - - # Change the order of columns in the DataFrame based on provided list - if col_order: - if sorted(col_order) == sorted(df.columns): - df = df[col_order] - else: - raise InvalidColumnOrder( - 'Column order does not match this DataFrame.') - - return df - _test_google_api_imports() if not project_id: diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index e265313f..9f249ddd 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -855,19 +855,20 @@ def test_timeout_configuration(self): configuration=config) def test_query_response_bytes(self): - assert gbq.sizeof_fmt(999) == "999.0 B" - assert gbq.sizeof_fmt(1024) == "1.0 KB" - assert gbq.sizeof_fmt(1099) == "1.1 KB" - assert gbq.sizeof_fmt(1044480) == "1020.0 KB" - assert gbq.sizeof_fmt(1048576) == "1.0 MB" - assert gbq.sizeof_fmt(1048576000) == "1000.0 MB" - assert gbq.sizeof_fmt(1073741824) == "1.0 GB" - assert gbq.sizeof_fmt(1.099512E12) == "1.0 TB" - assert gbq.sizeof_fmt(1.125900E15) == "1.0 PB" - assert gbq.sizeof_fmt(1.152922E18) == "1.0 EB" - assert gbq.sizeof_fmt(1.180592E21) == "1.0 ZB" - assert gbq.sizeof_fmt(1.208926E24) == "1.0 YB" - assert gbq.sizeof_fmt(1.208926E28) == "10000.0 YB" + connector = gbq.GbqConnector(project_id=_get_project_id()) + assert connector.sizeof_fmt(999) == "999.0 B" + assert connector.sizeof_fmt(1024) == "1.0 KB" + assert connector.sizeof_fmt(1099) == "1.1 KB" + assert connector.sizeof_fmt(1044480) == "1020.0 KB" + assert connector.sizeof_fmt(1048576) == "1.0 MB" + assert connector.sizeof_fmt(1048576000) == "1000.0 MB" + assert connector.sizeof_fmt(1073741824) == "1.0 GB" + assert connector.sizeof_fmt(1.099512E12) == "1.0 TB" + assert connector.sizeof_fmt(1.125900E15) == "1.0 PB" + assert connector.sizeof_fmt(1.152922E18) == "1.0 EB" + assert connector.sizeof_fmt(1.180592E21) == "1.0 ZB" + assert connector.sizeof_fmt(1.208926E24) == "1.0 YB" + assert connector.sizeof_fmt(1.208926E28) == "10000.0 YB" class TestToGBQIntegrationWithServiceAccountKeyPath(object): From 4d967e69bb5bac12bcc8c3e15e0bdbf629365421 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Wed, 15 Nov 2017 18:09:19 -0500 Subject: [PATCH 34/42] Add import error message --- pandas_gbq/gbq.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 321f91ed..0aa00c89 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -74,6 +74,13 @@ def _test_google_api_imports(): "pandas requires google-auth for Google BigQuery support: " "{0}".format(ex)) + try: + from google.cloud import bigquery # noqa + except ImportError as ex: + raise ImportError( + "pandas requires google-cloud-python for Google BigQuery support: " + "{0}".format(ex)) + _check_google_client_version() From 476d72b3cdb9042f33db24d397b4116f08291c12 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Thu, 16 Nov 2017 12:41:07 -0500 Subject: [PATCH 35/42] Revert 
test_query_response_bytes test to original --- pandas_gbq/tests/test_gbq.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 9f249ddd..697eb2e1 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -855,20 +855,19 @@ def test_timeout_configuration(self): configuration=config) def test_query_response_bytes(self): - connector = gbq.GbqConnector(project_id=_get_project_id()) - assert connector.sizeof_fmt(999) == "999.0 B" - assert connector.sizeof_fmt(1024) == "1.0 KB" - assert connector.sizeof_fmt(1099) == "1.1 KB" - assert connector.sizeof_fmt(1044480) == "1020.0 KB" - assert connector.sizeof_fmt(1048576) == "1.0 MB" - assert connector.sizeof_fmt(1048576000) == "1000.0 MB" - assert connector.sizeof_fmt(1073741824) == "1.0 GB" - assert connector.sizeof_fmt(1.099512E12) == "1.0 TB" - assert connector.sizeof_fmt(1.125900E15) == "1.0 PB" - assert connector.sizeof_fmt(1.152922E18) == "1.0 EB" - assert connector.sizeof_fmt(1.180592E21) == "1.0 ZB" - assert connector.sizeof_fmt(1.208926E24) == "1.0 YB" - assert connector.sizeof_fmt(1.208926E28) == "10000.0 YB" + assert self.gbq_connector.sizeof_fmt(999) == "999.0 B" + assert self.gbq_connector.sizeof_fmt(1024) == "1.0 KB" + assert self.gbq_connector.sizeof_fmt(1099) == "1.1 KB" + assert self.gbq_connector.sizeof_fmt(1044480) == "1020.0 KB" + assert self.gbq_connector.sizeof_fmt(1048576) == "1.0 MB" + assert self.gbq_connector.sizeof_fmt(1048576000) == "1000.0 MB" + assert self.gbq_connector.sizeof_fmt(1073741824) == "1.0 GB" + assert self.gbq_connector.sizeof_fmt(1.099512E12) == "1.0 TB" + assert self.gbq_connector.sizeof_fmt(1.125900E15) == "1.0 PB" + assert self.gbq_connector.sizeof_fmt(1.152922E18) == "1.0 EB" + assert self.gbq_connector.sizeof_fmt(1.180592E21) == "1.0 ZB" + assert self.gbq_connector.sizeof_fmt(1.208926E24) == "1.0 YB" + assert self.gbq_connector.sizeof_fmt(1.208926E28) == "10000.0 YB" class TestToGBQIntegrationWithServiceAccountKeyPath(object): From 6da0ef27444352e86f0dbed646338d148e57787b Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 22 Nov 2017 17:13:40 -0800 Subject: [PATCH 36/42] Convert rest of methods to use google-cloud-bigquery - Removes references to google-api-client-library and httplib2. - Updates PR to not make any surface-level changes to the API, only swaps out the dependencies. - Updates PR to use latest version of google-cloud-bigquery. 
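The public call surface is unchanged by this swap; only the backend client moves to google-cloud-bigquery. A minimal usage sketch of the resulting API, as documented in the updated read_gbq docstring below (the project id and key path are placeholders, not part of this patch):

    from pandas_gbq import gbq

    # configuration is passed through in BigQuery's REST job-config shape,
    # e.g. disabling the query cache for this run:
    df = gbq.read_gbq(
        'SELECT 1 AS x',
        project_id='my-project',                      # placeholder
        private_key='/path/to/service_account.json',  # placeholder
        dialect='standard',
        configuration={'query': {'useQueryCache': False}})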
--- ci/requirements-2.7-0.19.2.pip | 3 - ci/requirements-3.5-0.18.1.pip | 4 +- ci/requirements-3.6-0.20.1.conda | 2 - ci/requirements-3.6-MASTER.pip | 2 - pandas_gbq/gbq.py | 777 ++++++++++++------------------- pandas_gbq/tests/test_gbq.py | 189 +++++--- requirements.txt | 5 +- setup.py | 5 +- 8 files changed, 414 insertions(+), 573 deletions(-) diff --git a/ci/requirements-2.7-0.19.2.pip b/ci/requirements-2.7-0.19.2.pip index a31accd7..cd94478a 100644 --- a/ci/requirements-2.7-0.19.2.pip +++ b/ci/requirements-2.7-0.19.2.pip @@ -1,8 +1,5 @@ -google-api-python-client google-auth -google-auth-httplib2 google-auth-oauthlib PyCrypto -python-gflags mock google-cloud-bigquery diff --git a/ci/requirements-3.5-0.18.1.pip b/ci/requirements-3.5-0.18.1.pip index 9c94dbbb..18369345 100644 --- a/ci/requirements-3.5-0.18.1.pip +++ b/ci/requirements-3.5-0.18.1.pip @@ -1,6 +1,4 @@ -google-api-python-client==1.6.0 google-auth==1.0.0 -google-auth-httplib2==0.0.1 google-auth-oauthlib==0.0.1 mock -google-cloud-bigquery==0.26.0 +google-cloud-bigquery==0.28.0 diff --git a/ci/requirements-3.6-0.20.1.conda b/ci/requirements-3.6-0.20.1.conda index 3ee89b92..b52f2aeb 100644 --- a/ci/requirements-3.6-0.20.1.conda +++ b/ci/requirements-3.6-0.20.1.conda @@ -1,6 +1,4 @@ -google-api-python-client google-auth -google-auth-httplib2 google-auth-oauthlib mock google-cloud-bigquery diff --git a/ci/requirements-3.6-MASTER.pip b/ci/requirements-3.6-MASTER.pip index 3ee89b92..b52f2aeb 100644 --- a/ci/requirements-3.6-MASTER.pip +++ b/ci/requirements-3.6-MASTER.pip @@ -1,6 +1,4 @@ -google-api-python-client google-auth -google-auth-httplib2 google-auth-oauthlib mock google-cloud-bigquery diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 0aa00c89..15130321 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -1,14 +1,16 @@ import warnings from datetime import datetime import json +import time from time import sleep -import uuid import sys import os +import numpy as np + from distutils.version import StrictVersion -from pandas import compat, DataFrame, to_datetime, to_numeric -from pandas.compat import bytes_to_str +from pandas import compat, DataFrame +from pandas.compat import lzip def _check_google_client_version(): @@ -19,31 +21,24 @@ def _check_google_client_version(): except ImportError: raise ImportError('Could not import pkg_resources (setuptools).') - # Version 1.6.0 is the first version to support google-auth. 
- # https://github.com/google/google-api-python-client/blob/master/CHANGELOG - google_api_minimum_version = '1.6.0' + # Version 0.28.0 includes many changes compared to previous versions + # https://github.com/GoogleCloudPlatform/google-cloud-python/blob/master/bigquery/CHANGELOG.md + bigquery_client_minimum_version = '0.28.0' - _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution( - 'google-api-python-client').version + _BIGQUERY_CLIENT_VERSION = pkg_resources.get_distribution( + 'google-cloud-bigquery').version - if (StrictVersion(_GOOGLE_API_CLIENT_VERSION) < - StrictVersion(google_api_minimum_version)): - raise ImportError('pandas requires google-api-python-client >= {0} ' + if (StrictVersion(_BIGQUERY_CLIENT_VERSION) < + StrictVersion(bigquery_client_minimum_version)): + raise ImportError('pandas requires google-cloud-bigquery >= {0} ' 'for Google BigQuery support, ' 'current version {1}' - .format(google_api_minimum_version, - _GOOGLE_API_CLIENT_VERSION)) + .format(bigquery_client_minimum_version, + _BIGQUERY_CLIENT_VERSION)) def _test_google_api_imports(): - try: - import httplib2 # noqa - except ImportError as ex: - raise ImportError( - 'pandas requires httplib2 for Google BigQuery support: ' - '{0}'.format(ex)) - try: from google_auth_oauthlib.flow import InstalledAppFlow # noqa except ImportError as ex: @@ -51,22 +46,6 @@ def _test_google_api_imports(): 'pandas requires google-auth-oauthlib for Google BigQuery ' 'support: {0}'.format(ex)) - try: - from google_auth_httplib2 import AuthorizedHttp # noqa - from google_auth_httplib2 import Request # noqa - except ImportError as ex: - raise ImportError( - 'pandas requires google-auth-httplib2 for Google BigQuery ' - 'support: {0}'.format(ex)) - - try: - from googleapiclient.discovery import build # noqa - from googleapiclient.errors import HttpError # noqa - except ImportError as ex: - raise ImportError( - "pandas requires google-api-python-client for Google BigQuery " - "support: {0}".format(ex)) - try: import google.auth # noqa except ImportError as ex: @@ -85,24 +64,18 @@ def _test_google_api_imports(): def _try_credentials(project_id, credentials): - import httplib2 - from googleapiclient.discovery import build - import googleapiclient.errors - from google_auth_httplib2 import AuthorizedHttp + from google.cloud import bigquery + import google.api_core.exceptions if credentials is None: return None - http = httplib2.Http() try: - authed_http = AuthorizedHttp(credentials, http=http) - bigquery_service = build('bigquery', 'v2', http=authed_http) + client = bigquery.Client(project=project_id, credentials=credentials) # Check if the application has rights to the BigQuery project - jobs = bigquery_service.jobs() - job_data = {'configuration': {'query': {'query': 'SELECT 1'}}} - jobs.insert(projectId=project_id, body=job_data).execute() + client.query('SELECT 1').result() return credentials - except googleapiclient.errors.Error: + except google.api_core.exceptions.GoogleAPIError: return None @@ -185,14 +158,6 @@ class QueryTimeout(ValueError): pass -class StreamingInsertError(ValueError): - """ - Raised when BigQuery reports a streaming insert error. 
- For more information see `Streaming Data Into BigQuery - `__ - """ - - class TableCreationError(ValueError): """ Raised when the create table method fails @@ -206,7 +171,9 @@ class GbqConnector(object): def __init__(self, project_id, reauth=False, verbose=False, private_key=None, auth_local_webserver=False, dialect='legacy'): - from google.cloud import bigquery + from google.api_core.exceptions import GoogleAPIError + from google.api_core.exceptions import ClientError + self.http_error = (ClientError, GoogleAPIError) self.project_id = project_id self.reauth = reauth self.verbose = verbose @@ -215,9 +182,7 @@ def __init__(self, project_id, reauth=False, verbose=False, self.dialect = dialect self.credentials_path = _get_credentials_file() self.credentials = self.get_credentials() - self.service = self.get_service() - self.client = bigquery.Client(project=project_id, - credentials=self.credentials) + self.client = self.get_client() # BQ Queries costs $5 per TB. First 1 TB per month is free # see here for more: https://cloud.google.com/bigquery/pricing @@ -283,8 +248,7 @@ def load_user_account_credentials(self): credentials do not have access to the project (self.project_id) on BigQuery. """ - import httplib2 - from google_auth_httplib2 import Request + import google.auth.transport.requests from google.oauth2.credentials import Credentials # Use the default credentials location under ~/.config and the @@ -316,8 +280,7 @@ def load_user_account_credentials(self): scopes=credentials_json.get('scopes')) # Refresh the token before trying to use it. - http = httplib2.Http() - request = Request(http) + request = google.auth.transport.requests.Request() credentials.refresh(request) return _try_credentials(self.project_id, credentials) @@ -418,8 +381,7 @@ def get_user_account_credentials(self): return credentials def get_service_account_credentials(self): - import httplib2 - from google_auth_httplib2 import Request + import google.auth.transport.requests from google.oauth2.service_account import Credentials from os.path import isfile @@ -442,8 +404,7 @@ def get_service_account_credentials(self): credentials = credentials.with_scopes([self.scope]) # Refresh the token before trying to use it. 
- http = httplib2.Http() - request = Request(http) + request = google.auth.transport.requests.Request() credentials.refresh(request) return credentials @@ -460,6 +421,18 @@ def _print(self, msg, end='\n'): sys.stdout.write(msg + end) sys.stdout.flush() + def _start_timer(self): + self.start = time.time() + + def get_elapsed_seconds(self): + return round(time.time() - self.start, 2) + + def print_elapsed_seconds(self, prefix='Elapsed', postfix='s.', + overlong=7): + sec = self.get_elapsed_seconds() + if sec > overlong: + self._print('{} {} {}'.format(prefix, sec, postfix)) + # http://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size @staticmethod def sizeof_fmt(num, suffix='B'): @@ -470,184 +443,128 @@ def sizeof_fmt(num, suffix='B'): num /= 1024.0 return fmt % (num, 'Y', suffix) - def get_service(self): - import httplib2 - from google_auth_httplib2 import AuthorizedHttp - from googleapiclient.discovery import build - - http = httplib2.Http() - authed_http = AuthorizedHttp( - self.credentials, http=http) - bigquery_service = build('bigquery', 'v2', http=authed_http) - - return bigquery_service + def get_client(self): + from google.cloud import bigquery + return bigquery.Client( + project=self.project_id, credentials=self.credentials) @staticmethod def process_http_error(ex): # See `BigQuery Troubleshooting Errors # `__ - status = json.loads(bytes_to_str(ex.content))['error'] - errors = status.get('errors', None) - - if errors: - for error in errors: - reason = error['reason'] - message = error['message'] - - raise GenericGBQException( - "Reason: {0}, Message: {1}".format(reason, message)) - - raise GenericGBQException(errors) - - def process_insert_errors(self, insert_errors): - for insert_error in insert_errors: - row = insert_error['index'] - errors = insert_error.get('errors', None) - for error in errors: - reason = error['reason'] - message = error['message'] - location = error['location'] - error_message = ('Error at Row: {0}, Reason: {1}, ' - 'Location: {2}, Message: {3}' - .format(row, reason, location, message)) - - # Report all error messages if verbose is set - if self.verbose: - self._print(error_message) - else: - raise StreamingInsertError(error_message + - '\nEnable verbose logging to ' - 'see all errors') + raise GenericGBQException("Reason: {0}".format(ex)) - raise StreamingInsertError + def run_query(self, query, **kwargs): + from google.auth.exceptions import RefreshError + from google.cloud.bigquery import QueryJobConfig + from concurrent.futures import TimeoutError - def run_query(self, query, dialect='legacy', query_parameters=(), - configuration=None, verbose=True, async=True): - """Execute a query job + job_config = { + 'query': { + 'useLegacySql': self.dialect == 'legacy' + # 'allowLargeResults', 'createDisposition', + # 'preserveNulls', destinationTable, useQueryCache + } + } + config = kwargs.get('configuration') + if config is not None: + if len(config) != 1: + raise ValueError("Only one job type must be specified, but " + "given {}".format(','.join(config.keys()))) + if 'query' in config: + if 'query' in config['query']: + if query is not None: + raise ValueError("Query statement can't be specified " + "inside config while it is specified " + "as parameter") + query = config['query']['query'] + del config['query']['query'] + + job_config['query'].update(config['query']) + else: + raise ValueError("Only 'query' job type is supported") - Parameters - ---------- - query, dialect, query_paramaters, configuration, verbose: 
see read_gbq - async: bool - Whether a synchronous or asynchronous query should be run. To be - deprecated in future versions; synchronous queries are used as a - workaround to implement timeouts, and will be removed in a - future update once Google Cloud Python resolves the issue. + self._start_timer() + try: + self._print('Requesting query... ', end="") + query_reply = self.client.query( + query, + job_config=QueryJobConfig.from_api_repr(job_config['query'])) + self._print('ok.') + except (RefreshError, ValueError): + if self.private_key: + raise AccessDenied( + "The service account credentials are not valid") + else: + raise AccessDenied( + "The credentials have been revoked or expired, " + "please re-run the application to re-authorize") + except self.http_error as ex: + self.process_http_error(ex) - Returns - ------- - Tuple - rows : list of lists - columns: list of strings - schema: dictionary - Has keys: name, field_type, mode, fields, description - """ + job_id = query_reply.job_id + self._print('Job ID: %s\nQuery running...' % job_id) + + while query_reply.state != 'DONE': + self.print_elapsed_seconds(' Elapsed', 's. Waiting...') + + timeout_ms = job_config['query'].get('timeoutMs') + if timeout_ms and timeout_ms < self.get_elapsed_seconds() * 1000: + raise QueryTimeout('Query timeout: {} ms'.format(timeout_ms)) + + timeout_sec = 1.0 + if timeout_ms: + # Wait at most 1 second so we can show progress bar + timeout_sec = min(1.0, timeout_ms / 1000.0) - def _set_common_query_settings(query_job): - if dialect == 'legacy': - query_job.use_legacy_sql = True - elif dialect == 'standard': - query_job.use_legacy_sql = False - - if configuration: - for setting, value in configuration.items(): - setattr(query_job, setting, value) - return query_job - - def sync_query(): - query_job = self.client.run_sync_query( - query, query_parameters=query_parameters) - query_job = _set_common_query_settings(query_job) - if verbose: - print("Query running...") - query_job.run() - if not query_job._properties.get("jobComplete", False): - raise QueryTimeout("Sync query timed out") - if verbose: - print("Query done.") - if query_job._properties.get("cacheHit", False): - print("Cache hit.") - else: - bytes_billed = int(query_job._properties - .get("totalBytesProcessed", 0)) - bytes_processed = int(query_job._properties - .get("totalBytesBilled", 0)) - print("Total bytes billed (processed): %s (%s)" % - (self.sizeof_fmt(bytes_billed), - self.sizeof_fmt(bytes_processed))) - print("\nRetrieving results...") - return query_job, None - - def async_query(): - query_job = self.client.run_async_query( - str(uuid.uuid4()), - query, - query_parameters=query_parameters) - query_job = _set_common_query_settings(query_job) - if verbose: - print("Query running...") - query_job.begin() try: - query_results = query_job.results().fetch_data() - except AttributeError: - query_results = query_job.result().fetch_data() - if verbose: - print("Query done.") - if query_job._properties["statistics"]["query"].get("cacheHit", - False): - print("Cache hit.") - elif ("statistics" in query_job._properties and - "query" in query_job._properties["statistics"]): - bytes_billed = int(query_job - ._properties["statistics"]["query"] - .get("totalBytesProcessed", 0)) - bytes_processed = int(query_job - ._properties["statistics"]["query"] - .get("totalBytesBilled", 0)) - print("Total bytes billed (processed): %s (%s)" % - (self.sizeof_fmt(bytes_billed), - self.sizeof_fmt(bytes_processed))) - print("\nRetrieving results...") - return 
query_results, query_job - - def get_columns_schema(query_results): - schema = [{"name": f.name, - "field_type": f.field_type, - "mode": f.mode, - "fields": f.fields, - "description": f.description} - for f in query_results.schema] - columns = [field["name"] for field in schema] - return columns, schema - - # sync_query code to be removed in future - if async: - query_results, query_job = async_query() - rows = list(query_results) - else: - query_results, query_job = sync_query() - rows = list(query_results.rows) + query_reply.result(timeout=timeout_sec) + except TimeoutError: + # Use our own timeout logic + pass + except self.http_error as ex: + self.process_http_error(ex) + + if self.verbose: + if query_reply.cache_hit: + self._print('Query done.\nCache hit.\n') + else: + bytes_processed = query_reply.total_bytes_processed or 0 + bytes_billed = query_reply.total_bytes_billed or 0 + self._print('Query done.\nProcessed: {} Billed: {}'.format( + self.sizeof_fmt(bytes_processed), + self.sizeof_fmt(bytes_billed))) + self._print('Standard price: ${:,.2f} USD\n'.format( + bytes_billed * self.query_price_for_TB)) - columns, schema = get_columns_schema(query_results) + self._print('Retrieving results...') - if verbose: - print("Got %s rows.") % len(rows) - if query_job: - print("\nTotal time taken %ss" % (datetime.utcnow() - - query_job.created.replace(tzinfo=None)).seconds) - print("Finished at %s." % datetime.now() - .strftime('%Y-%m-%d %H:%M:%S')) + try: + rows_iter = query_reply.result() + except self.http_error as ex: + self.process_http_error(ex) + result_rows = list(rows_iter) + total_rows = rows_iter.total_rows + schema = { + 'fields': [ + field.to_api_repr() + for field in rows_iter.schema], + } - return rows, columns, schema + # print basic query stats + self._print('Got {} rows.\n'.format(total_rows)) + + return schema, result_rows def load_data(self, dataframe, dataset_id, table_id, chunksize): - try: - from googleapiclient.errors import HttpError - except ImportError: - from apiclient.errors import HttpError + from google.cloud.bigquery import LoadJobConfig + from six import StringIO - job_id = uuid.uuid4().hex + destination_table = self.client.dataset(dataset_id).table(table_id) + job_config = LoadJobConfig() + job_config.write_disposition = 'WRITE_APPEND' + job_config.source_format = 'NEWLINE_DELIMITED_JSON' rows = [] remaining_rows = len(dataframe) @@ -655,44 +572,25 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize): self._print("\n\n") for index, row in dataframe.reset_index(drop=True).iterrows(): - row_dict = dict() - row_dict['json'] = json.loads(row.to_json(force_ascii=False, - date_unit='s', - date_format='iso')) - row_dict['insertId'] = job_id + str(index) - rows.append(row_dict) + row_json = row.to_json( + force_ascii=False, date_unit='s', date_format='iso') + rows.append(row_json) remaining_rows -= 1 if (len(rows) % chunksize == 0) or (remaining_rows == 0): - self._print("\rStreaming Insert is {0}% Complete".format( + self._print("\rLoad is {0}% Complete".format( ((total_rows - remaining_rows) * 100) / total_rows)) - body = {'rows': rows} + body = StringIO('{}\n'.format('\n'.join(rows))) try: - response = self.service.tabledata().insertAll( - projectId=self.project_id, - datasetId=dataset_id, - tableId=table_id, - body=body).execute() - except HttpError as ex: + self.client.load_table_from_file( + body, + destination_table, + job_config=job_config).result() + except self.http_error as ex: self.process_http_error(ex) - # For streaming inserts, even if you 
receive a success HTTP - # response code, you'll need to check the insertErrors property - # of the response to determine if the row insertions were - # successful, because it's possible that BigQuery was only - # partially successful at inserting the rows. See the `Success - # HTTP Response Codes - # `__ - # section - - insert_errors = response.get('insertErrors', None) - if insert_errors: - self.process_insert_errors(insert_errors) - - sleep(1) # Maintains the inserts "per second" rate per API rows = [] self._print("\n") @@ -716,23 +614,18 @@ def schema(self, dataset_id, table_id): Fields representing the schema """ - try: - from googleapiclient.errors import HttpError - except ImportError: - from apiclient.errors import HttpError + table_ref = self.client.dataset(dataset_id).table(table_id) try: - remote_schema = self.service.tables().get( - projectId=self.project_id, - datasetId=dataset_id, - tableId=table_id).execute()['schema'] + table = self.client.get_table(table_ref) + remote_schema = table.schema - remote_fields = [{'name': field_remote['name'], - 'type': field_remote['type']} - for field_remote in remote_schema['fields']] + remote_fields = [{'name': field_remote.name, + 'type': field_remote.field_type} + for field_remote in remote_schema] return remote_fields - except HttpError as ex: + except self.http_error as ex: self.process_http_error(ex) def verify_schema(self, dataset_id, table_id, schema): @@ -818,42 +711,33 @@ def _get_credentials_file(): 'PANDAS_GBQ_CREDENTIALS_FILE') -def _create_df(rows, columns, schema, index_col, col_order): - df = DataFrame(data=rows, columns=columns) +def _parse_data(schema, rows): + # see: + # http://pandas.pydata.org/pandas-docs/dev/missing_data.html + # #missing-data-casting-rules-and-indexing + dtype_map = {'FLOAT': np.dtype(float), + 'TIMESTAMP': 'M8[ns]'} - # Manual field type conversion. Inserted to handle tests - # with only null rows, otherwise type conversion works automatically - for field in schema: - if field["field_type"] == 'TIMESTAMP': - if df[field["name"]].isnull().values.all(): - df[field["name"]] = to_datetime(df[field["name"]]) - if field["field_type"] == 'FLOAT': - if df[field["name"]].isnull().values.all(): - df[field["name"]] = to_numeric(df[field["name"]]) + fields = schema['fields'] + col_types = [field['type'] for field in fields] + col_names = [str(field['name']) for field in fields] + col_dtypes = [ + dtype_map.get(field['type'].upper(), object) + for field in fields + ] + print(fields) + page_array = np.zeros((len(rows),), dtype=lzip(col_names, col_dtypes)) + for row_num, entries in enumerate(rows): + for col_num in range(len(col_types)): + field_value = entries[col_num] + page_array[row_num][col_num] = field_value - # Reindex the DataFrame on the provided column - if index_col: - if index_col in df.columns: - df.set_index(index_col, inplace=True) - else: - raise InvalidIndexColumn( - 'Index column "{0}" does not exist in DataFrame.' 
- .format(index_col)) - - # Change the order of columns in the DataFrame based on provided list - if col_order: - if sorted(col_order) == sorted(df.columns): - df = df[col_order] - else: - raise InvalidColumnOrder( - 'Column order does not match this DataFrame.') - return df + return DataFrame(page_array, columns=col_names) def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True, private_key=None, - auth_local_webserver=False, dialect='legacy', - query_parameters=(), configuration=None, **kwargs): + auth_local_webserver=False, dialect='legacy', **kwargs): r"""Load data from Google BigQuery using google-cloud-python The main method a user calls to execute a Query in Google BigQuery @@ -863,21 +747,19 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, Documentation is available `here `__ - Authentication via Google Cloud can be performed a number of ways. + Authentication to the Google BigQuery service is via OAuth 2.0. - One method is to generate user credentials via - ``gcloud auth application-default login`` and point to it using an - environment variable: - ``$ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"`` + - If "private_key" is not provided: - You can also download a service account private key JSON file and pass the - path to the file to the private_key paramater. + By default "application default credentials" are used. - If default credentials are not located and a private key is not passed, - an auth flow will begin where a user can auth via a link or via a pop-up - through which a user can auth with their Google account. This will - generate a user credentials file, which is saved locally and can be re-used - in the future. + If default application credentials are not found or are restrictive, + user account credentials are used. In this case, you will be asked to + grant permissions for product name 'pandas GBQ'. + + - If "private_key" is provided: + + Service account credentials will be used to authenticate. Parameters ---------- @@ -918,42 +800,15 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, compliant with the SQL 2011 standard. For more information see `BigQuery SQL Reference `__ - credentials: credentials object, default None (optional) - If generating credentials on your own, pass in. Otherwise, will attempt - to generate automatically - - .. versionadded:: 0.3.0 - - query_parameters: tuple (optional) Can only be used in Standard SQL - `More info - `__ - Example:: - - gbq.read_gbq("SELECT @param1 + @param2", - query_parameters = (bigquery.ScalarQueryParameter( - 'param1', 'INT64', 1), - bigquery.ScalarQueryParameter( - 'param2', 'INT64', 2))) - - .. versionadded:: 0.3.0 - - configuration : dict (optional) - Due to the [current implementation in Google Cloud Python] only some - configuration settings are able to be set. You can pass them along like - in the following: - `read_gbq(q,configuration={'allow_large_results':True, - 'maximum_billing_tier':2})` - [Example allowable settings]: - allow_large_results, create_disposition, default_dataset, - destination, flatten_results, priority, use_query_cache, - use_legacy_sql, dry_run, write_disposition, udf_resources, - maximum_billing_tier, maximum_bytes_billed - - .. [current implementation in Google Cloud Python] - https://github.com/GoogleCloudPlatform/google-cloud-python/issues/2765 - .. 
[Example allowable settings] - http://google-cloud-python.readthedocs.io/en/latest/_modules/google/cloud/bigquery/job.html?highlight=QueryJobConfig - .. versionadded:: 0.3.0 + + **kwargs : Arbitrary keyword arguments + configuration (dict): query config parameters for job processing. + For example: + + configuration = {'query': {'useQueryCache': False}} + + For more information see `BigQuery SQL Reference + `__ Returns ------- @@ -969,42 +824,48 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, if dialect not in ('legacy', 'standard'): raise ValueError("'{0}' is not valid for dialect".format(dialect)) - if configuration and any(key in configuration for key in - ["query", "copy", "load", "extract"]): - raise ValueError("The Google Cloud BigQuery API handles configuration " - "settings differently. There are now a discrete set " - "of query settings one can set by passing in a " - "dictionary, e.g.: `configuration=" - "{'maximum_billing_tier':2}`. See http://google-cloud" - "-python.readthedocs.io/en/latest/_modules/google/" - "cloud/bigquery/job.html?highlight=QueryJobConfig " - "for allowable paramaters.") - - connector = GbqConnector(project_id=project_id, - reauth=reauth, - auth_local_webserver=auth_local_webserver, - private_key=private_key) - - # Temporary workaround in order to perform timeouts on queries. - # Once Google Cloud Python resolves, differentiation between sync and async - # code will be removed. - if (configuration and "timeout_ms" in configuration): - rows, columns, schema = connector.run_query(query, - dialect, - query_parameters, - configuration, - verbose, - async=False) - else: - rows, columns, schema = connector.run_query(query, - dialect, - query_parameters, - configuration, - verbose) - df = _create_df(rows, columns, schema, index_col, col_order) + connector = GbqConnector( + project_id, reauth=reauth, verbose=verbose, private_key=private_key, + dialect=dialect, auth_local_webserver=auth_local_webserver) + schema, rows = connector.run_query(query, **kwargs) + final_df = _parse_data(schema, rows) + + # Reindex the DataFrame on the provided column + if index_col is not None: + if index_col in final_df.columns: + final_df.set_index(index_col, inplace=True) + else: + raise InvalidIndexColumn( + 'Index column "{0}" does not exist in DataFrame.' + .format(index_col) + ) + + # Change the order of columns in the DataFrame based on provided list + if col_order is not None: + if sorted(col_order) == sorted(final_df.columns): + final_df = final_df[col_order] + else: + raise InvalidColumnOrder( + 'Column order does not match this DataFrame.' 
+ ) + + # cast BOOLEAN and INTEGER columns from object to bool/int + # if they dont have any nulls + type_map = {'BOOLEAN': bool, 'INTEGER': int} + for field in schema['fields']: + if field['type'].upper() in type_map and \ + final_df[field['name']].notnull().all(): + final_df[field['name']] = \ + final_df[field['name']].astype(type_map[field['type'].upper()]) - return df + connector.print_elapsed_seconds( + 'Total time taken', + datetime.now().strftime('s.\nFinished at %Y-%m-%d %H:%M:%S.'), + 0 + ) + + return final_df def to_gbq(dataframe, destination_table, project_id, chunksize=10000, @@ -1152,11 +1013,6 @@ class _Table(GbqConnector): def __init__(self, project_id, dataset_id, reauth=False, verbose=False, private_key=None): - try: - from googleapiclient.errors import HttpError - except ImportError: - from apiclient.errors import HttpError - self.http_error = HttpError self.dataset_id = dataset_id super(_Table, self).__init__(project_id, reauth, verbose, private_key) @@ -1173,18 +1029,16 @@ def exists(self, table_id): boolean true if table exists, otherwise false """ + from google.api_core.exceptions import NotFound + table_ref = self.client.dataset(self.dataset_id).table(table_id) try: - self.service.tables().get( - projectId=self.project_id, - datasetId=self.dataset_id, - tableId=table_id).execute() + self.client.get_table(table_ref) return True + except NotFound: + return False except self.http_error as ex: - if ex.resp.status == 404: - return False - else: - self.process_http_error(ex) + self.process_http_error(ex) def create(self, table_id, schema): """ Create a table in Google BigQuery given a table and schema @@ -1197,6 +1051,8 @@ def create(self, table_id, schema): Use the generate_bq_schema to generate your table schema from a dataframe. 
""" + from google.cloud.bigquery import SchemaField + from google.cloud.bigquery import Table if self.exists(table_id): raise TableCreationError("Table {0} already " @@ -1207,20 +1063,20 @@ def create(self, table_id, schema): _Dataset(self.project_id, private_key=self.private_key).create(self.dataset_id) - body = { - 'schema': schema, - 'tableReference': { - 'tableId': table_id, - 'projectId': self.project_id, - 'datasetId': self.dataset_id - } - } + table_ref = self.client.dataset(self.dataset_id).table(table_id) + table = Table(table_ref) + + for field in schema['fields']: + if 'mode' not in field: + field['mode'] = 'NULLABLE' + + table.schema = [ + SchemaField.from_api_repr(field) + for field in schema['fields'] + ] try: - self.service.tables().insert( - projectId=self.project_id, - datasetId=self.dataset_id, - body=body).execute() + self.client.create_table(table) except self.http_error as ex: self.process_http_error(ex) @@ -1232,30 +1088,25 @@ def delete(self, table_id): table : str Name of table to be deleted """ + from google.api_core.exceptions import NotFound if not self.exists(table_id): raise NotFoundException("Table does not exist") + table_ref = self.client.dataset(self.dataset_id).table(table_id) try: - self.service.tables().delete( - datasetId=self.dataset_id, - projectId=self.project_id, - tableId=table_id).execute() - except self.http_error as ex: + self.client.delete_table(table_ref) + except NotFound: # Ignore 404 error which may occur if table already deleted - if ex.resp.status != 404: - self.process_http_error(ex) + pass + except self.http_error as ex: + self.process_http_error(ex) class _Dataset(GbqConnector): def __init__(self, project_id, reauth=False, verbose=False, private_key=None): - try: - from googleapiclient.errors import HttpError - except ImportError: - from apiclient.errors import HttpError - self.http_error = HttpError super(_Dataset, self).__init__(project_id, reauth, verbose, private_key) @@ -1272,17 +1123,15 @@ def exists(self, dataset_id): boolean true if dataset exists, otherwise false """ + from google.api_core.exceptions import NotFound try: - self.service.datasets().get( - projectId=self.project_id, - datasetId=dataset_id).execute() + self.client.get_dataset(self.client.dataset(dataset_id)) return True + except NotFound: + return False except self.http_error as ex: - if ex.resp.status == 404: - return False - else: - self.process_http_error(ex) + self.process_http_error(ex) def datasets(self): """ Return a list of datasets in Google BigQuery @@ -1298,32 +1147,15 @@ def datasets(self): """ dataset_list = [] - next_page_token = None - first_query = True - while first_query or next_page_token: - first_query = False - - try: - list_dataset_response = self.service.datasets().list( - projectId=self.project_id, - pageToken=next_page_token).execute() - - dataset_response = list_dataset_response.get('datasets') - if dataset_response is None: - dataset_response = [] - - next_page_token = list_dataset_response.get('nextPageToken') - - if dataset_response is None: - dataset_response = [] + try: + dataset_response = self.client.list_datasets() - for row_num, raw_row in enumerate(dataset_response): - dataset_list.append( - raw_row['datasetReference']['datasetId']) + for row in dataset_response: + dataset_list.append(row.dataset_id) - except self.http_error as ex: - self.process_http_error(ex) + except self.http_error as ex: + self.process_http_error(ex) return dataset_list @@ -1335,22 +1167,16 @@ def create(self, dataset_id): dataset : str Name of dataset to 
be written """ + from google.cloud.bigquery import Dataset if self.exists(dataset_id): raise DatasetCreationError("Dataset {0} already " "exists".format(dataset_id)) - body = { - 'datasetReference': { - 'projectId': self.project_id, - 'datasetId': dataset_id - } - } + dataset = Dataset(self.client.dataset(dataset_id)) try: - self.service.datasets().insert( - projectId=self.project_id, - body=body).execute() + self.client.create_dataset(dataset) except self.http_error as ex: self.process_http_error(ex) @@ -1362,20 +1188,20 @@ def delete(self, dataset_id): dataset : str Name of dataset to be deleted """ + from google.api_core.exceptions import NotFound if not self.exists(dataset_id): raise NotFoundException( "Dataset {0} does not exist".format(dataset_id)) try: - self.service.datasets().delete( - datasetId=dataset_id, - projectId=self.project_id).execute() + self.client.delete_dataset(self.client.dataset(dataset_id)) - except self.http_error as ex: + except NotFound: # Ignore 404 error which may occur if dataset already deleted - if ex.resp.status != 404: - self.process_http_error(ex) + pass + except self.http_error as ex: + self.process_http_error(ex) def tables(self, dataset_id): """ List tables in the specific dataset in Google BigQuery @@ -1392,28 +1218,15 @@ def tables(self, dataset_id): """ table_list = [] - next_page_token = None - first_query = True - - while first_query or next_page_token: - first_query = False - try: - list_table_response = self.service.tables().list( - projectId=self.project_id, - datasetId=dataset_id, - pageToken=next_page_token).execute() - - table_response = list_table_response.get('tables') - next_page_token = list_table_response.get('nextPageToken') - - if not table_response: - return table_list + try: + table_response = self.client.list_dataset_tables( + self.client.dataset(dataset_id)) - for row_num, raw_row in enumerate(table_response): - table_list.append(raw_row['tableReference']['tableId']) + for row in table_response: + table_list.append(row.table_id) - except self.http_error as ex: - self.process_http_error(ex) + except self.http_error as ex: + self.process_http_error(ex) return table_list diff --git a/pandas_gbq/tests/test_gbq.py b/pandas_gbq/tests/test_gbq.py index 697eb2e1..6a2b8480 100644 --- a/pandas_gbq/tests/test_gbq.py +++ b/pandas_gbq/tests/test_gbq.py @@ -13,13 +13,10 @@ from pandas import compat from pandas.compat import u, range -from pandas import NaT, DataFrame, to_datetime +from pandas import NaT, DataFrame from pandas_gbq import gbq import pandas.util.testing as tm from pandas.compat.numpy import np_datetime64_compat -from google.cloud import bigquery - -from google.cloud.exceptions import BadRequest, NotFound TABLE_ID = 'new_test' @@ -196,19 +193,17 @@ def test_should_be_able_to_get_valid_credentials(self): credentials = self.sut.get_credentials() assert credentials.valid - def test_should_be_able_to_get_a_bigquery_service(self): - bigquery_service = self.sut.get_service() - assert bigquery_service is not None + def test_should_be_able_to_get_a_bigquery_client(self): + bigquery_client = self.sut.get_client() + assert bigquery_client is not None def test_should_be_able_to_get_schema_from_query(self): - result = self.sut.run_query('SELECT 1') - rows, columns, schema = result + schema, pages = self.sut.run_query('SELECT 1') assert schema is not None def test_should_be_able_to_get_results_from_query(self): - results = gbq.read_gbq('SELECT 1', - project_id=_get_project_id()) - assert results is not None + schema, pages = 
self.sut.run_query('SELECT 1') + assert pages is not None def test_get_application_default_credentials_does_not_throw_error(self): if _check_if_can_get_correct_default_credentials(): @@ -261,18 +256,17 @@ def test_should_be_able_to_get_valid_credentials(self): credentials = self.sut.get_credentials() assert credentials.valid - def test_should_be_able_to_get_a_bigquery_service(self): - bigquery_service = self.sut.get_service() - assert bigquery_service is not None + def test_should_be_able_to_get_a_bigquery_client(self): + bigquery_client = self.sut.get_client() + assert bigquery_client is not None def test_should_be_able_to_get_schema_from_query(self): - result = self.sut.run_query('SELECT 1') - rows, columns, schema = result + schema, pages = self.sut.run_query('SELECT 1') assert schema is not None def test_should_be_able_to_get_results_from_query(self): - results = gbq.read_gbq('SELECT 1', project_id=_get_project_id()) - assert results is not None + schema, pages = self.sut.run_query('SELECT 1') + assert pages is not None class TestGBQConnectorIntegrationWithServiceAccountKeyContents(object): @@ -293,19 +287,17 @@ def test_should_be_able_to_get_valid_credentials(self): credentials = self.sut.get_credentials() assert credentials.valid - def test_should_be_able_to_get_a_bigquery_service(self): - bigquery_service = self.sut.get_service() - assert bigquery_service is not None + def test_should_be_able_to_get_a_bigquery_client(self): + bigquery_client = self.sut.get_client() + assert bigquery_client is not None def test_should_be_able_to_get_schema_from_query(self): - result = self.sut.run_query('SELECT 1') - rows, columns, schema = result + schema, pages = self.sut.run_query('SELECT 1') assert schema is not None def test_should_be_able_to_get_results_from_query(self): - results = gbq.read_gbq('SELECT 1', - project_id=_get_project_id()) - assert results is not None + schema, pages = self.sut.run_query('SELECT 1') + assert pages is not None class GBQUnitTests(object): @@ -524,7 +516,7 @@ def test_should_properly_handle_nullable_integers(self): df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal( - df, DataFrame({'nullable_integer': [1, None]})) + df, DataFrame({'nullable_integer': [1, None]}).astype(object)) def test_should_properly_handle_valid_longs(self): query = 'SELECT 1 << 62 AS valid_long' @@ -540,7 +532,7 @@ def test_should_properly_handle_nullable_longs(self): df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal( - df, DataFrame({'nullable_long': [1 << 62, None]})) + df, DataFrame({'nullable_long': [1 << 62, None]}).astype(object)) def test_should_properly_handle_null_integers(self): query = 'SELECT INTEGER(NULL) AS null_integer' @@ -594,19 +586,16 @@ def test_should_properly_handle_timestamp_unix_epoch(self): query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") AS unix_epoch' df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) - expected = DataFrame({'unix_epoch': - [np.datetime64('1970-01-01T00:00:00.000000Z')]}) - tm.assert_frame_equal(df, to_datetime(expected.unix_epoch).dt - .tz_localize('UTC').to_frame()) + tm.assert_frame_equal(df, DataFrame( + {'unix_epoch': [np.datetime64('1970-01-01T00:00:00.000000Z')]})) def test_should_properly_handle_arbitrary_timestamp(self): query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") AS valid_timestamp' df = gbq.read_gbq(query, project_id=_get_project_id(), 
private_key=_get_private_key_path()) - expected = DataFrame({'valid_timestamp': - [np.datetime64('2004-09-15T05:00:00.000000Z')]}) - tm.assert_frame_equal(df, to_datetime(expected.valid_timestamp).dt - .tz_localize('UTC').to_frame()) + tm.assert_frame_equal(df, DataFrame({ + 'valid_timestamp': [np.datetime64('2004-09-15T05:00:00.000000Z')] + })) def test_should_properly_handle_null_timestamp(self): query = 'SELECT TIMESTAMP(NULL) AS null_timestamp' @@ -639,7 +628,7 @@ def test_should_properly_handle_nullable_booleans(self): df = gbq.read_gbq(query, project_id=_get_project_id(), private_key=_get_private_key_path()) tm.assert_frame_equal( - df, DataFrame({'nullable_boolean': [True, None]})) + df, DataFrame({'nullable_boolean': [True, None]}).astype(object)) def test_unicode_string_conversion_and_normalization(self): correct_test_datatype = DataFrame( @@ -700,7 +689,7 @@ def test_column_order_plus_index(self): def test_read_gbq_raises_invalid_index_column(self): query = "SELECT 'a' AS string_1, 'b' AS string_2, 'c' AS string_3" - col_order = ['string_3', 'string_2', 'string_1'] + col_order = ['string_3', 'string_2'] # Column string_bbb does not exist. Should raise InvalidIndexColumn with pytest.raises(gbq.InvalidIndexColumn): @@ -709,18 +698,18 @@ def test_read_gbq_raises_invalid_index_column(self): private_key=_get_private_key_path()) def test_malformed_query(self): - with pytest.raises(BadRequest): + with pytest.raises(gbq.GenericGBQException): gbq.read_gbq("SELCET * FORM [publicdata:samples.shakespeare]", project_id=_get_project_id(), private_key=_get_private_key_path()) def test_bad_project_id(self): - with pytest.raises(NotFound): + with pytest.raises(gbq.GenericGBQException): gbq.read_gbq("SELECT 1", project_id='001', private_key=_get_private_key_path()) def test_bad_table_name(self): - with pytest.raises(NotFound): + with pytest.raises(gbq.GenericGBQException): gbq.read_gbq("SELECT * FROM [publicdata:samples.nope]", project_id=_get_project_id(), private_key=_get_private_key_path()) @@ -749,15 +738,14 @@ def test_zero_rows(self): ('is_bot', np.dtype(bool)), ('ts', 'M8[ns]')]) expected_result = DataFrame( page_array, columns=['title', 'id', 'is_bot', 'ts']) - tm.assert_frame_equal(expected_result.astype(object), - df.reset_index(drop=True).astype(object)) + tm.assert_frame_equal(df, expected_result) def test_legacy_sql(self): legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10" # Test that a legacy sql statement fails when # setting dialect='standard' - with pytest.raises((RuntimeError, BadRequest)): + with pytest.raises(gbq.GenericGBQException): gbq.read_gbq(legacy_sql, project_id=_get_project_id(), dialect='standard', private_key=_get_private_key_path()) @@ -775,7 +763,7 @@ def test_standard_sql(self): # Test that a standard sql statement fails when using # the legacy SQL dialect (default value) - with pytest.raises((RuntimeError, BadRequest)): + with pytest.raises(gbq.GenericGBQException): gbq.read_gbq(standard_sql, project_id=_get_project_id(), private_key=_get_private_key_path()) @@ -803,25 +791,66 @@ def test_invalid_option_for_sql_dialect(self): def test_query_with_parameters(self): sql_statement = "SELECT @param1 + @param2 AS valid_result" - config = {"use_legacy_sql": False} + config = { + 'query': { + "useLegacySql": False, + "parameterMode": "named", + "queryParameters": [ + { + "name": "param1", + "parameterType": { + "type": "INTEGER" + }, + "parameterValue": { + "value": 1 + } + }, + { + "name": "param2", + "parameterType": { + "type": "INTEGER" + }, + 
"parameterValue": { + "value": 2 + } + } + ] + } + } # Test that a query that relies on parameters fails # when parameters are not supplied via configuration - with pytest.raises((RuntimeError, BadRequest)): + with pytest.raises(ValueError): gbq.read_gbq(sql_statement, project_id=_get_project_id(), private_key=_get_private_key_path()) # Test that the query is successful because we have supplied - # the correct query parameters via the 'config' and query_parameters - # option + # the correct query parameters via the 'config' option df = gbq.read_gbq(sql_statement, project_id=_get_project_id(), - configuration=config, - query_parameters=(bigquery.ScalarQueryParameter( - 'param1', 'INT64', 1), - bigquery.ScalarQueryParameter( - 'param2', 'INT64', 2)), - private_key=_get_private_key_path()) + private_key=_get_private_key_path(), + configuration=config) tm.assert_frame_equal(df, DataFrame({'valid_result': [3]})) + def test_query_inside_configuration(self): + query_no_use = 'SELECT "PI_WRONG" AS valid_string' + query = 'SELECT "PI" AS valid_string' + config = { + 'query': { + "query": query, + "useQueryCache": False, + } + } + # Test that it can't pass query both + # inside config and as parameter + with pytest.raises(ValueError): + gbq.read_gbq(query_no_use, project_id=_get_project_id(), + private_key=_get_private_key_path(), + configuration=config) + + df = gbq.read_gbq(None, project_id=_get_project_id(), + private_key=_get_private_key_path(), + configuration=config) + tm.assert_frame_equal(df, DataFrame({'valid_string': ['PI']})) + def test_configuration_without_query(self): sql_statement = 'SELECT 1' config = { @@ -845,9 +874,31 @@ def test_configuration_without_query(self): private_key=_get_private_key_path(), configuration=config) + def test_configuration_raises_value_error_with_multiple_config(self): + sql_statement = 'SELECT 1' + config = { + 'query': { + "query": sql_statement, + "useQueryCache": False, + }, + 'load': { + "query": sql_statement, + "useQueryCache": False, + } + } + # Test that only ValueError is raised with multiple configurations + with pytest.raises(ValueError): + gbq.read_gbq(sql_statement, project_id=_get_project_id(), + private_key=_get_private_key_path(), + configuration=config) + def test_timeout_configuration(self): sql_statement = 'SELECT 1' - config = {"timeout_ms": 1} + config = { + 'query': { + "timeoutMs": 1 + } + } # Test that QueryTimeout error raises with pytest.raises(gbq.QueryTimeout): gbq.read_gbq(sql_statement, project_id=_get_project_id(), @@ -926,8 +977,6 @@ def test_upload_data(self): gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), chunksize=10000, private_key=_get_private_key_path()) - sleep(30) # <- Curses Google!!! - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" .format(self.destination_table + test_id), project_id=_get_project_id(), @@ -964,8 +1013,6 @@ def test_upload_data_if_table_exists_append(self): gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), if_exists='append', private_key=_get_private_key_path()) - sleep(30) # <- Curses Google!!! - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" .format(self.destination_table + test_id), project_id=_get_project_id(), @@ -995,8 +1042,6 @@ def test_upload_subset_columns_if_table_exists_append(self): self.destination_table + test_id, _get_project_id(), if_exists='append', private_key=_get_private_key_path()) - sleep(30) # <- Curses Google!!! 
- result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" .format(self.destination_table + test_id), project_id=_get_project_id(), @@ -1029,8 +1074,6 @@ def test_upload_data_if_table_exists_replace(self): _get_project_id(), if_exists='replace', private_key=_get_private_key_path()) - sleep(30) # <- Curses Google!!! - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}" .format(self.destination_table + test_id), project_id=_get_project_id(), @@ -1204,10 +1247,14 @@ def test_verify_schema_ignores_field_mode(self): def test_retrieve_schema(self): # Issue #24 schema function returns the schema in biquery test_id = "15" - test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'}, - {'name': 'B', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, - {'name': 'D', 'type': 'TIMESTAMP'}]} + test_schema = { + 'fields': [ + {'name': 'A', 'type': 'FLOAT', 'mode': 'NULLABLE'}, + {'name': 'B', 'type': 'FLOAT', 'mode': 'NULLABLE'}, + {'name': 'C', 'type': 'STRING', 'mode': 'NULLABLE'}, + {'name': 'D', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'} + ] + } self.table.create(TABLE_ID + test_id, test_schema) actual = self.sut.schema(self.dataset_prefix + "1", TABLE_ID + test_id) @@ -1364,8 +1411,6 @@ def test_upload_data(self): gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), chunksize=10000) - sleep(30) # <- Curses Google!!! - result = gbq.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}".format( self.destination_table + test_id), project_id=_get_project_id()) @@ -1422,8 +1467,6 @@ def test_upload_data(self): gbq.to_gbq(df, self.destination_table + test_id, _get_project_id(), chunksize=10000, private_key=_get_private_key_contents()) - sleep(30) # <- Curses Google!!! - result = gbq.read_gbq("SELECT COUNT(*) as num_rows FROM {0}".format( self.destination_table + test_id), project_id=_get_project_id(), diff --git a/requirements.txt b/requirements.txt index f49120c2..88cf967a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,4 @@ pandas -httplib2 -google-api-python-client google-auth -google-auth-httplib2 google-auth-oauthlib -google-cloud-bigquery==0.26.0 +google-cloud-bigquery diff --git a/setup.py b/setup.py index 327c983d..86a40c5e 100644 --- a/setup.py +++ b/setup.py @@ -19,12 +19,9 @@ def readme(): INSTALL_REQUIRES = [ 'pandas', - 'httplib2>=0.9.2', - 'google-api-python-client>=1.6.0', 'google-auth>=1.0.0', - 'google-auth-httplib2>=0.0.1', 'google-auth-oauthlib>=0.0.1', - 'google-cloud-bigquery>=0.26.0,<0.28.0', + 'google-cloud-bigquery>=0.28.0', ] From cd551bb98252c662dc4f98783982ed97e8447204 Mon Sep 17 00:00:00 2001 From: Jason Ng Date: Mon, 27 Nov 2017 13:59:39 -0500 Subject: [PATCH 37/42] Indentation --- pandas_gbq/gbq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 15130321..58a3374c 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -476,8 +476,8 @@ def run_query(self, query, **kwargs): if 'query' in config['query']: if query is not None: raise ValueError("Query statement can't be specified " - "inside config while it is specified " - "as parameter") + "inside config while it is specified " + "as parameter") query = config['query']['query'] del config['query']['query'] From a6865e0b0eae3f43b15ec238e4b1a571588d2201 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2017 12:26:09 -0800 Subject: [PATCH 38/42] Ignore mode property when comparing schemas. 
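
verify_schema() and schema_is_subset() now drop the 'mode' key from both the
local and remote field lists before comparing them, since BigQuery reports a
default mode of 'NULLABLE' for fields where the local schema omits it. A rough
standalone sketch of the comparison rule (the helper name schemas_match is
illustrative only and not part of the library):

    def schemas_match(fields_remote, fields_local):
        # Sort both field lists by name and drop 'mode' before comparing,
        # mirroring GbqConnector.verify_schema in the diff below.
        def normalize(fields):
            return sorted(
                [{k: v for k, v in f.items() if k != 'mode'} for f in fields],
                key=lambda f: f['name'])
        return normalize(fields_remote) == normalize(fields_local)

    # With this rule, these two schemas compare as equal:
    #   [{'name': 'A', 'type': 'FLOAT', 'mode': 'NULLABLE'}]
    #   [{'name': 'A', 'type': 'FLOAT'}]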
--- pandas_gbq/gbq.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 58a3374c..1893462e 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -620,9 +620,11 @@ def schema(self, dataset_id, table_id): table = self.client.get_table(table_ref) remote_schema = table.schema - remote_fields = [{'name': field_remote.name, - 'type': field_remote.field_type} - for field_remote in remote_schema] + remote_fields = [ + field_remote.to_api_repr() for field_remote in remote_schema] + for field in remote_fields: + field['type'] = field['type'].upper() + field['mode'] = field['mode'].upper() return remote_fields except self.http_error as ex: @@ -655,6 +657,14 @@ def verify_schema(self, dataset_id, table_id, schema): key=lambda x: x['name']) fields_local = sorted(schema['fields'], key=lambda x: x['name']) + # Ignore mode when comparing schemas. + for field in fields_local: + if 'mode' in field: + del field['mode'] + for field in fields_remote: + if 'mode' in field: + del field['mode'] + return fields_remote == fields_local def schema_is_subset(self, dataset_id, table_id, schema): @@ -683,6 +693,14 @@ def schema_is_subset(self, dataset_id, table_id, schema): fields_remote = self.schema(dataset_id, table_id) fields_local = schema['fields'] + # Ignore mode when comparing schemas. + for field in fields_local: + if 'mode' in field: + del field['mode'] + for field in fields_remote: + if 'mode' in field: + del field['mode'] + return all(field in fields_remote for field in fields_local) def delete_and_recreate_table(self, dataset_id, table_id, table_schema): From c636783f1d8d775640f54269b143e56f2dd3e4c6 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 27 Nov 2017 12:52:28 -0800 Subject: [PATCH 39/42] Document new dependency on google-cloud-bigquery. --- docs/source/changelog.rst | 5 +++++ docs/source/install.rst | 8 +++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index dc35067e..0a5661fe 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,11 @@ Changelog ========= +0.3.0 / 2017-??-?? +------------------ + +- Use the `google-cloud-bigquery `__ library for API calls instead of ``google-api-client`` and ``httplib2``. (:issue:`93`) + 0.2.1 / 2017-??-?? ------------------ diff --git a/docs/source/install.rst b/docs/source/install.rst index 2b701fd2..98f2d79d 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -37,8 +37,10 @@ Dependencies This module requires following additional dependencies: -- `httplib2 `__: HTTP client -- `google-api-python-client `__: Google's API client - `google-auth `__: authentication and authorization for Google's API - `google-auth-oauthlib `__: integration with `oauthlib `__ for end-user authentication -- `google-auth-httplib2 `__: adapter to use ``httplib2`` HTTP client with ``google-auth`` +- `google-cloud-bigquery `__: Google Cloud client library for BigQuery + +.. note:: + + The dependency on `google-cloud-bigquery `__ is new in version 0.3.0 of ``pandas-gbq``. \ No newline at end of file From e959571a05ad48cfb42152d4e550a2b1a78b23f2 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 29 Nov 2017 16:21:17 -0800 Subject: [PATCH 40/42] Document dependencies for previous verions. Also says which libraries are no longer required, for easier upgrades. 
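
For readers upgrading an existing environment, a quick way to confirm that the
new client library is importable (purely illustrative; this check is not part
of pandas-gbq):

    # Requires google-cloud-bigquery >= 0.28.0, per setup.py in this series.
    try:
        from google.cloud import bigquery
    except ImportError:
        raise ImportError(
            'pandas-gbq 0.3.0+ needs google-cloud-bigquery; the old '
            'httplib2/google-api-python-client stack is no longer used')
    print(bigquery.__version__)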
--- docs/source/install.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/source/install.rst b/docs/source/install.rst index 98f2d79d..c64c7939 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -43,4 +43,11 @@ This module requires following additional dependencies: .. note:: - The dependency on `google-cloud-bigquery `__ is new in version 0.3.0 of ``pandas-gbq``. \ No newline at end of file + The dependency on `google-cloud-bigquery `__ is new in version 0.3.0 of ``pandas-gbq``. + Versions less than 0.3.0 required the following dependencies: + + - `httplib2 `__: HTTP client (no longer required) + - `google-api-python-client `__: Google's API client (no longer required, replaced by `google-cloud-bigquery `__:) + - `google-auth `__: authentication and authorization for Google's API + - `google-auth-oauthlib `__: integration with `oauthlib `__ for end-user authentication + - `google-auth-httplib2 `__: adapter to use ``httplib2`` HTTP client with ``google-auth`` (no longer required) From 6448abb94eed3a2ba84f5720cc3e660b143d3bfe Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 8 Dec 2017 09:58:31 -0800 Subject: [PATCH 41/42] Remove print statement used for debugging. --- pandas_gbq/gbq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py index 3f60f5f0..46a246e5 100644 --- a/pandas_gbq/gbq.py +++ b/pandas_gbq/gbq.py @@ -742,7 +742,6 @@ def _parse_data(schema, rows): dtype_map.get(field['type'].upper(), object) for field in fields ] - print(fields) page_array = np.zeros((len(rows),), dtype=lzip(col_names, col_dtypes)) for row_num, entries in enumerate(rows): for col_num in range(len(col_types)): From 26d64316711bc2104e009dfc3021a1e01d154c2d Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 20 Dec 2017 10:15:07 -0800 Subject: [PATCH 42/42] Add deps and StreamingInsertError to changelog. --- docs/source/changelog.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index a876840e..b6684582 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -4,7 +4,9 @@ Changelog 0.3.0 / 2017-??-?? ------------------ -- Use the `google-cloud-bigquery `__ library for API calls instead of ``google-api-client`` and ``httplib2``. (:issue:`93`) +- Use the `google-cloud-bigquery `__ library for API calls. The ``google-cloud-bigquery`` package is a new dependency, and dependencies on ``google-api-python-client`` and ``httplib2`` are removed. See the `installation guide `__ for more details. (:issue:`93`) +- :func:`to_gbq` now uses a load job instead of the streaming API. (:issue:`75`) +- Remove ``StreamingInsertError`` class, as it is no longer used by :func:`to_gbq`. (:issue:`75`) 0.2.1 / 2017-11-27 ------------------
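
A condensed sketch of the load-job approach that replaces the streaming-insert
path noted in the changelog entry above (independent of pandas-gbq internals:
the function name load_frame and the client and df arguments are placeholders,
and the real load_data() implementation earlier in this series additionally
chunks rows and reports progress):

    from io import StringIO
    from google.cloud import bigquery

    def load_frame(client, df, dataset_id, table_id):
        # One load job per call, instead of streaming rows with
        # tabledata().insertAll(); rows travel as newline-delimited JSON.
        job_config = bigquery.LoadJobConfig()
        job_config.write_disposition = 'WRITE_APPEND'
        job_config.source_format = 'NEWLINE_DELIMITED_JSON'
        body = StringIO(
            df.to_json(orient='records', lines=True, date_format='iso'))
        table_ref = client.dataset(dataset_id).table(table_id)
        client.load_table_from_file(
            body, table_ref, job_config=job_config).result()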