diff --git a/ci/requirements-2.6.txt b/ci/requirements-2.6.txt
index 5038b9e2b6552..60a8b57e72907 100644
--- a/ci/requirements-2.6.txt
+++ b/ci/requirements-2.6.txt
@@ -4,3 +4,4 @@ python-dateutil==1.5
pytz==2013b
http://www.crummy.com/software/BeautifulSoup/bs4/download/4.2/beautifulsoup4-4.2.0.tar.gz
html5lib==1.0b2
+bigquery==2.0.15
diff --git a/ci/requirements-2.7.txt b/ci/requirements-2.7.txt
index 686dc87f7d009..fe27fe10f7c04 100644
--- a/ci/requirements-2.7.txt
+++ b/ci/requirements-2.7.txt
@@ -18,3 +18,4 @@ MySQL-python==1.2.4
scipy==0.10.0
beautifulsoup4==4.2.1
statsmodels==0.5.0
+bigquery==2.0.15
diff --git a/ci/requirements-2.7_LOCALE.txt b/ci/requirements-2.7_LOCALE.txt
index e4cdf0733a7d3..f037cbed15160 100644
--- a/ci/requirements-2.7_LOCALE.txt
+++ b/ci/requirements-2.7_LOCALE.txt
@@ -16,3 +16,4 @@ lxml==3.2.1
scipy==0.10.0
beautifulsoup4==4.2.1
statsmodels==0.5.0
+bigquery==2.0.15
diff --git a/doc/source/install.rst b/doc/source/install.rst
index 532c90b83ebb0..2774e0ccc36ea 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -114,6 +114,7 @@ Optional Dependencies
:func:`~pandas.io.clipboard.read_clipboard`. Most package managers on Linux
distributions will have xclip and/or xsel immediately available for
installation.
+ * Google bq Command Line Tool: Needed for :mod:`pandas.io.gbq`
* One of the following combinations of libraries is needed to use the
top-level :func:`~pandas.io.html.read_html` function:
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 8f35693c6532e..d389444256107 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -38,6 +38,7 @@ object.
* ``read_json``
* ``read_msgpack`` (experimental)
* ``read_html``
+ * ``read_gbq`` (experimental)
* ``read_stata``
* ``read_clipboard``
* ``read_pickle``
@@ -51,6 +52,7 @@ The corresponding ``writer`` functions are object methods that are accessed like
* ``to_json``
* ``to_msgpack`` (experimental)
* ``to_html``
+ * ``to_gbq`` (experimental)
* ``to_stata``
* ``to_clipboard``
* ``to_pickle``
@@ -2905,7 +2907,70 @@ There are a few other available functions:
For now, writing your DataFrame into a database works only with
**SQLite**. Moreover, the **index** will currently be **dropped**.
+Google BigQuery (Experimental)
+------------------------------
+The :mod:`pandas.io.gbq` module provides a wrapper for Google's BigQuery
+analytics web service to simplify retrieving results from BigQuery tables
+using SQL-like queries. Result sets are parsed into a pandas
+DataFrame with a shape derived from the source table. Additionally,
+DataFrames can be uploaded into BigQuery datasets as tables
+if the source datatypes are compatible with BigQuery ones. The general
+structure of this module and the functions it provides are loosely
+based on those in :mod:`pandas.io.sql`.
+
+For specifics on the service itself, see the Google BigQuery
+documentation.
+
+As an example, suppose you want to load all data from the existing
+BigQuery table `test_dataset.test_table` into a DataFrame.
+
+.. code-block:: python
+
+ from pandas.io import gbq
+ data_frame = gbq.read_gbq('SELECT * FROM test_dataset.test_table')
+
+The user will then be authenticated by the `bq` command line client -
+this usually involves the default browser opening to a login page,
+though the process can be done entirely from the command line if
+necessary. Datasets and additional parameters can be configured with
+`bq`, passed in as options to `read_gbq`, or set using Google's gflags
+(the latter is not officially supported by this module, though care was
+taken to ensure that such flags are respected regardless of how you
+call the method).
+
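+For example, the billing project can be passed directly as an option
+(the project ID below is a hypothetical placeholder):
+
+.. code-block:: python
+
+    data_frame = gbq.read_gbq('SELECT * FROM test_dataset.test_table',
+                              project_id='my-project-id')
+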
+Additionally, you can define which column to use as an index as well as a preferred column order as follows:
+
+.. code-block:: python
+
+    data_frame = gbq.read_gbq('SELECT * FROM test_dataset.test_table',
+                              index_col='index_column_name',
+                              col_order=['col1', 'col2', 'col3'])
+
+Finally, if you would like to create the BigQuery table `my_dataset.my_table` from the rows of the DataFrame `df`:
+
+.. code-block:: python
+
+    import pandas
+
+    df = pandas.DataFrame({'string_col_name': ['hello'],
+                           'integer_col_name': [1],
+                           'boolean_col_name': [True]})
+    # The schema must list types in the DataFrame's column order -
+    # alphabetical here, since the frame was built from a dict
+    schema = ['BOOLEAN', 'INTEGER', 'STRING']
+    gbq.to_gbq(df, 'my_dataset.my_table', if_exists='fail', schema=schema)
+
+To add more rows to this, simply:
+
+.. code-block:: python
+
+    df2 = pandas.DataFrame({'string_col_name': ['hello2'],
+                            'integer_col_name': [2],
+                            'boolean_col_name': [False]})
+    gbq.to_gbq(df2, 'my_dataset.my_table', if_exists='append')
+
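+To overwrite the table's contents instead, pass `if_exists='replace'`;
+a schema is required in that case, since the write truncates and
+rewrites the existing table (a sketch reusing `df2` from above):
+
+.. code-block:: python
+
+    gbq.to_gbq(df2, 'my_dataset.my_table', if_exists='replace',
+               schema=['BOOLEAN', 'INTEGER', 'STRING'])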
+
+
+.. note::
+
+   * There is a hard cap on BigQuery result sets, at 128MB compressed.
+   * The BigQuery SQL query language has some oddities; see the
+     BigQuery Query Reference documentation for details.
+
STATA Format
------------
diff --git a/doc/source/release.rst b/doc/source/release.rst
index f008109f9de8e..7776ee1efba4f 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -78,6 +78,7 @@ Experimental Features
- Add msgpack support via ``pd.read_msgpack()`` and ``pd.to_msgpack()`` / ``df.to_msgpack()`` for serialization
of arbitrary pandas (and python objects) in a lightweight portable binary format (:issue:`686`)
- Added PySide support for the qtpandas DataFrameModel and DataFrameWidget.
+ - Added :mod:`pandas.io.gbq` for reading data from, and writing data to, Google BigQuery as DataFrames (:issue:`4140`)
Improvements to existing features
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
new file mode 100644
index 0000000000000..f226af6629aa5
--- /dev/null
+++ b/pandas/io/gbq.py
@@ -0,0 +1,467 @@
+"""
+Pandas module to interface with Google BigQuery.
+"""
+import sys
+import tempfile
+import logging
+from datetime import datetime
+
+import numpy as np
+
+from pandas import DataFrame, concat
+from pandas.core.common import PandasError
+
+try:
+ import bq
+ import bigquery_client
+ import gflags as flags
+ _BQ_INSTALLED = True
+except ImportError:
+ _BQ_INSTALLED = False
+
+
+# Setup the logger
+logger = logging.getLogger('pandas.io.gbq')
+
+# These are some custom exceptions that the
+# to_gbq() and read_gbq() methods can raise
+
+class SchemaMissing(PandasError, IOError):
+ """
+ Raised when attempting to write a DataFrame to
+ a new table in Google BigQuery without specifying
+ a schema describing the DataFrame.
+ """
+ pass
+
+class InvalidSchema(PandasError, IOError):
+ """
+ Raised when attempting to write a DataFrame to
+ Google BigQuery with an invalid table schema.
+ """
+ pass
+
+class TableExistsFail(PandasError, IOError):
+ """
+ Raised when attempting to write a DataFrame to
+ an existing Google BigQuery table without specifying
+ that a replace/update action be taken.
+ """
+ pass
+
+class InvalidColumnOrder(PandasError, IOError):
+ """
+ Raised when the provided column order for output
+ results DataFrame does not match the schema
+ returned by BigQuery.
+ """
+ pass
+
+
+def _authenticate():
+ """
+ For testing, we abstract the authentication to BigQuery API.
+ Presently this is implemented using the bq.py Client.Get()
+ method. Any exceptions raised are considered fatal, so we
+ do not process them.
+
+ Returns
+ -------
+ BigqueryClient : Configured connection to Google BigQuery
+ """
+ return bq.Client.Get()
+
+def _parse_entry(field_value, field_type):
+ """
+ Given a value and the corresponding BigQuery data type,
+ perform any operations needed and return in a format
+ appropriate for a numpy record dictionary
+
+ Parameters
+ ----------
+ field_value : Source object to be transformed
+ field_type : String representation of Google BigQuery
+ data type (per schema)
+
+ Returns
+ -------
+ field_value : object or primitive of type corresponding
+ to field_type
+ """
+
+ # Avoid any casting problems
+ if field_value is None or field_value == 'null':
+ return None
+ if field_type == 'INTEGER' or field_type == 'FLOAT':
+ field_value = float(field_value)
+ elif field_type == 'TIMESTAMP':
+ timestamp = datetime.utcfromtimestamp(float(field_value))
+ field_value = np.datetime64(timestamp)
+ elif field_type == 'BOOLEAN':
+ field_value = field_value == 'true'
+ else:
+ field_value = str(field_value)
+ return field_value
+
+
+def _parse_page(raw_page, col_names, col_types, col_dtypes):
+ """
+ Given a list of rows produced by the client.apiclient.tabledata().list(),
+ build a numpy array with proper dtypes and column names as specified
+ by the arguments.
+
+ Parameters
+ ----------
+ raw_page : Resulting list of rows from a page retrieved via
+ bigquery API
+ client.apiclient.tabledata().list().execute()['rows']
+ col_names: An ordered list of names for the columns
+ col_types: String representation of the BigQuery DataType for that
+ column
+ col_dtypes: Target numpy.dtype for the column
+
+ Returns
+ -------
+ page_array : numpy record array corresponding
+ to the page data
+ """
+
+ # Should be at most 100,000 per the API, but this could
+ # be increased in the future. Should only be less than
+ # this for the last page to reduce API calls
+ page_row_count = len(raw_page)
+
+ # Place to hold the results for a page of data
+ page_array = np.zeros(
+ (page_row_count,),
+        dtype=zip(col_names, col_dtypes)
+ )
+ for row_num, raw_row in enumerate(raw_page):
+ entries = raw_row.get('f', [])
+ # Iterate over each entry - setting proper field types
+ for col_num, field_type in enumerate(col_types):
+ # Process the field's types using schema
+ field_value = _parse_entry(entries[col_num].get('v', ''),
+ field_type)
+ # Fill the value into the final array
+ page_array[row_num][col_num] = field_value
+
+ return page_array
+
+def _parse_data(client, job, index_col=None, col_order=None):
+ """
+ Iterate through the query results and piece together the
+ final DataFrame. Builds a DataFrame for each page of
+ results, then concatenates them together when finished.
+ To save memory, we use numpy record arrays to build these
+ DataFrames.
+
+ Parameters
+ ----------
+ client: An instance of bq.Client
+    job: dict containing the job info for a completed query
+    index_col: str (optional)
+        Name of result column to use for index in results DataFrame
+    col_order: list (optional)
+ List of BigQuery column names in the desired order for results
+ DataFrame
+
+ Returns
+ -------
+ df: pandas DataFrame
+ DataFrame representing results of query
+
+    Raises
+ ------
+ InvalidColumnOrder:
+ Raised if 'col_order' parameter doesn't match returned DataFrame
+ BigqueryError:
+ Raised by bigquery_client if a Google API error is encountered
+
+
+    Notes
+ -----
+ This script relies on Google being consistent with their
+ pagination API. We are using the most flexible iteration method
+    that we could find in the bq.py/bigquery_client.py APIs, but
+ these have undergone large amounts of change recently.
+
+ We have encountered bugs with this functionality, see:
+ http://stackoverflow.com/questions/19145587/bq-py-not-paging-results
+ """
+
+ # dtype Map -
+ # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing
+ dtype_map = {'INTEGER': np.dtype(float),
+ 'FLOAT': np.dtype(float),
+ 'TIMESTAMP': 'M8[ns]'} # This seems to be buggy without nanosecond indicator
+
+ # We first need the schema to get information about the columns of
+ # our dataframe.
+
+ table_dict = job['configuration']['query']['destinationTable']
+ fields = client.GetTableSchema(table_dict)['fields']
+
+    # Get the schema into a format usable to create our
+ # dataframe
+ col_dtypes = []
+ col_types = []
+ col_names = []
+
+ # TODO: Do this in one clean step
+ for field in fields:
+ col_types.append(field['type'])
+        # Note the encoding... numpy doesn't like column names that are
+        # UTF-8, which is what the API returns
+ col_names.append(field['name'].encode('ascii', 'ignore'))
+ # Note, it would be nice to use 'str' types, but BigQuery doesn't have a fixed length
+ # in mind - just maxes out at 64k
+        col_dtypes.append(dtype_map.get(field['type'], object))
+
+ # Iterate over the result rows.
+ # Since Google's API now requires pagination of results,
+ # we do that here. The following is repurposed from
+ # bigquery_client.py :: Client.ReadTableRows()
+
+ # Initially, no page token is set
+ page_token = None
+
+    # Most of Google's client APIs allow one to set total_rows in case
+ # the user only wants the first 'n' results from a query. Typically
+ # they set this to sys.maxint by default, but this caused problems
+ # during testing - specifically on OS X. It appears that at some
+ # point in bigquery_client.py, there is an attempt to cast this value
+ # to an unsigned integer. Depending on the python install,
+ # sys.maxint may exceed the limitations of unsigned integers.
+ #
+ # See:
+ # https://code.google.com/p/google-bigquery-tools/issues/detail?id=14
+
+    # This is a hardcoded value for the 32-bit sys.maxint per
+ # the above note. Theoretically, we could simply use
+ # 100,000 (or whatever the current max page size is),
+ # but this is more flexible in the event of an API change
+ total_rows = 2147483647
+
+ # Keep track of rows read
+ row_count = 0
+
+ # Keep our page DataFrames until the end when we
+    # concatenate them
+ dataframe_list = list()
+
+ # Iterate over all rows
+ while row_count < total_rows:
+ data = client.apiclient.tabledata().list(maxResults=total_rows - row_count,
+ pageToken=page_token,
+ **table_dict).execute()
+
+ # If there are more results than will fit on a page,
+        # you will receive a token for the next page.
+ page_token = data.get('pageToken', None)
+
+ # How many rows are there across all pages?
+        total_rows = min(total_rows, int(data['totalRows']))
+ raw_page = data.get('rows', [])
+ page_array = _parse_page(raw_page, col_names, col_types, col_dtypes)
+
+ row_count += len(page_array)
+ if total_rows > 0:
+ completed = (100 * row_count) / total_rows
+            logger.info('Remaining Rows: ' + str(total_rows - row_count) + ' (' + str(completed) + '% Complete)')
+ else:
+ logger.info('No Rows')
+
+ dataframe_list.append(DataFrame(page_array))
+
+    # Handle any exceptions that might have occurred
+ if not page_token and row_count != total_rows:
+ raise bigquery_client.BigqueryInterfaceError(
+ 'PageToken missing for %r' % (
+ bigquery_client.ApiClientHelper.TableReference.Create(**table_dict),))
+ if not raw_page and row_count != total_rows:
+ raise bigquery_client.BigqueryInterfaceError(
+ 'Not enough rows returned by server for %r' % (
+ bigquery_client.ApiClientHelper.TableReference.Create(**table_dict),))
+
+ # Build final dataframe
+ final_df = concat(dataframe_list, ignore_index=True)
+
+ # Reindex the DataFrame on the provided column
+ if index_col is not None:
+ if index_col in col_names:
+ final_df.set_index(index_col, inplace=True)
+ col_names.remove(index_col)
+ else:
+ raise InvalidColumnOrder('Index column "{0}" does not exist in DataFrame.'.format(index_col))
+
+ # Change the order of columns in the DataFrame based on provided list
+ if col_order is not None:
+ if sorted(col_order) == sorted(col_names):
+ final_df = final_df[col_order]
+ else:
+ raise InvalidColumnOrder('Column order does not match this DataFrame.')
+
+ # Downcast floats to integers and objects to booleans
+ # if there are no NaN's. This is presently due to a
+ # limitation of numpy in handling missing data.
+ final_df._data = final_df._data.downcast(dtypes='infer')
+ return final_df
+
+def to_gbq(dataframe, destination_table, schema=None, col_order=None, if_exists='fail', **kwargs):
+ """
+    Write a DataFrame to a Google BigQuery table. If the table does not
+    exist, a new one will be created, in which case the schema must be
+    specified. If the table exists, the behaviour is governed by the
+    'if_exists' parameter (which defaults to failing). By default,
+    columns will be written in the order they appear in the DataFrame,
+    though the user may specify an alternative column order.
+
+ Parameters
+    ----------
+ dataframe: DataFrame
+ DataFrame to be written
+ destination_table: string
+ name of table to be written, in the form 'dataset.tablename'
+ schema : sequence (optional)
+ list of column types in order for data to be inserted, e.g. ['INTEGER', 'TIMESTAMP', 'BOOLEAN']
+ col_order: sequence (optional)
+ order which columns are to be inserted, e.g. ['primary_key', 'birthday', 'username']
+ if_exists: {'fail', 'replace', 'append'} (optional)
+ fail: If table exists, do nothing.
+ replace: If table exists, drop it, recreate it, and insert data.
+ append: If table exists, insert data. Create if does not exist.
+ kwargs are passed to the Client constructor
+
+    Raises
+ ------
+ SchemaMissing:
+ Raised if the 'if_exists' parameter is set to 'replace', but no schema is specified
+    TableExistsFail:
+ Raised if the specified 'destination_table' exists but the 'if_exists' parameter is set to 'fail' (the default)
+ InvalidSchema:
+ Raised if the 'schema' parameter does not match the provided DataFrame
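+
+    Examples
+    --------
+    A minimal sketch, assuming a two-column DataFrame `df` and a
+    destination table that does not yet exist:
+
+    >>> to_gbq(df, 'my_dataset.my_table', schema=['STRING', 'INTEGER'])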
+ """
+
+ if not _BQ_INSTALLED:
+ if sys.version_info >= (3, 0):
+ raise NotImplementedError('gbq module does not support Python 3 yet')
+ else:
+ raise ImportError('Could not import Google BigQuery Client.')
+
+    ALLOWED_TYPES = ['STRING', 'INTEGER', 'FLOAT', 'BOOLEAN', 'TIMESTAMP', 'RECORD']
+
+    # Honor a user-specified column order before the schema is matched
+    # positionally against the DataFrame's columns below
+    if col_order is not None:
+        if sorted(col_order) == sorted(dataframe.columns):
+            dataframe = dataframe[col_order]
+        else:
+            raise InvalidColumnOrder('Column order does not match this DataFrame.')
+
+    if if_exists == 'replace' and schema is None:
+ raise SchemaMissing('Cannot replace a table without specifying the data schema')
+ else:
+ client = _authenticate()
+ table_reference = client.GetTableReference(destination_table)
+ if client.TableExists(table_reference):
+ if if_exists == 'fail':
+ raise TableExistsFail('Cannot overwrite existing tables if \'if_exists="fail"\'')
+ else:
+ # Build up a string representation of the
+ # table's schema. Since the table already
+            # exists, we ask the API for it, which
+ # is returned in a list of dictionaries
+ # describing column data. Iterate over these
+ # and build up a string of form:
+ # "col_name1 : col_type1, col_name2 : col_type2..."
+ schema_full = client.GetTableSchema(dict(table_reference))['fields']
+ schema = ''
+ for count, row in enumerate(schema_full):
+ if count > 0:
+ schema += ', '
+ schema += row['name'] + ':' + row['type']
+ else:
+ logger.info('Creating New Table')
+ if schema is None:
+ raise SchemaMissing('Cannot create a new table without specifying the data schema')
+ else:
+ columns = dataframe.columns
+ if len(schema) != len(columns):
+ raise InvalidSchema('Incorrect number of columns in schema')
+ else:
+ schema_string = ''
+ for count, name in enumerate(columns):
+ if count > 0:
+ schema_string += ', '
+ column_type = schema[count].upper()
+ if column_type in ALLOWED_TYPES:
+ schema_string += name + ':' + schema[count].lower()
+ else:
+ raise InvalidSchema('Invalid Type: ' + column_type + ". Must be one of: " + str(ALLOWED_TYPES))
+ schema = schema_string
+
+ opts = kwargs
+ opts['sync'] = True
+ opts['skip_leading_rows'] = 1
+ opts['encoding'] = 'UTF-8'
+ opts['max_bad_records'] = 0
+
+ # See: https://developers.google.com/bigquery/docs/reference/v2/jobs
+ if if_exists == 'replace':
+ opts['write_disposition'] = 'WRITE_TRUNCATE'
+ elif if_exists == 'append':
+ opts['write_disposition'] = 'WRITE_APPEND'
+
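+    # Round-trip through a temporary CSV file: BigQuery load jobs ingest
+    # CSV, so the DataFrame is written to disk and handed to the client's
+    # Load() (the header row is skipped via 'skip_leading_rows' above)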
+ with tempfile.NamedTemporaryFile() as csv_file:
+ dataframe.to_csv(csv_file.name, index=False, encoding='utf-8')
+ job = client.Load(table_reference, csv_file.name, schema=schema, **opts)
+
+def read_gbq(query, project_id=None, destination_table=None, index_col=None, col_order=None, **kwargs):
+ """
+ The main method a user calls to load data from Google BigQuery into a pandas DataFrame.
+ This is a simple wrapper for Google's bq.py and bigquery_client.py, which we use
+ to get the source data. Because of this, this script respects the user's bq settings
+ file, '~/.bigqueryrc', if it exists. Such a file can be generated using 'bq init'. Further,
+ additional parameters for the query can be specified as either **kwds in the command,
+    additional parameters for the query can be specified as either **kwargs in the call,
+ bigquery_client.py.
+
+ Parameters
+ ----------
+ query: str
+ SQL-Like Query to return data values
+ project_id: str (optional)
+ Google BigQuery Account project ID. Optional, since it may be
+ located in ~/.bigqueryrc
+ index_col: str (optional)
+ Name of result column to use for index in results DataFrame
+ col_order: list(str) (optional)
+ List of BigQuery column names in the desired order for results
+ DataFrame
+ destination_table: string (optional)
+ If provided, send the results to the given table.
+ **kwargs: to be passed to bq.Client.Create(). Particularly: 'trace', 'sync',
+ 'api', 'api_version'
+
+ Returns
+ -------
+ df: pandas DataFrame
+ DataFrame representing results of query
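+
+    Examples
+    --------
+    A sketch of typical usage; the table queried is BigQuery's public
+    Shakespeare sample (the same table used by this module's tests):
+
+    >>> df = read_gbq('SELECT * FROM [publicdata:samples.shakespeare] LIMIT 10')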
+
+ """
+ if not _BQ_INSTALLED:
+ if sys.version_info >= (3, 0):
+ raise NotImplementedError('gbq module does not support Python 3 yet')
+ else:
+ raise ImportError('Could not import Google BigQuery Client.')
+
+ query_args = kwargs
+ query_args['project_id'] = project_id
+ query_args['query'] = query
+ query_args['destination_table'] = destination_table
+ query_args['sync'] = True
+
+ client = _authenticate()
+
+ job = client.Query(**query_args)
+
+ return _parse_data(client, job, index_col=index_col, col_order=col_order)
diff --git a/pandas/io/tests/data/gbq_fake_job.txt b/pandas/io/tests/data/gbq_fake_job.txt
new file mode 100644
index 0000000000000..2a0f09bc66ef3
--- /dev/null
+++ b/pandas/io/tests/data/gbq_fake_job.txt
@@ -0,0 +1 @@
+{u'status': {u'state': u'DONE'}, u'kind': u'bigquery#job', u'statistics': {u'query': {u'cacheHit': True, u'totalBytesProcessed': u'0'}, u'endTime': u'1377668744674', u'totalBytesProcessed': u'0', u'startTime': u'1377668744466'}, u'jobReference': {u'projectId': u'57288129629', u'jobId': u'bqjob_r5f956972f0190bdf_00000140c374bf42_2'}, u'etag': u'"4PTsVxg68bQkQs1RJ1Ndewqkgg4/oO4VmgFrAku4N6FWci9s7iFIftc"', u'configuration': {u'query': {u'createDisposition': u'CREATE_IF_NEEDED', u'query': u'SELECT * FROM [publicdata:samples.shakespeare]', u'writeDisposition': u'WRITE_TRUNCATE', u'destinationTable': {u'projectId': u'57288129629', u'tableId': u'anonb5ec450da88eeeb78a27784ea482ee75a146d442', u'datasetId': u'_d0b4f5f0d50dc68a3eb0fa6cba66a9a8687d9253'}}}, u'id': u'57288129629:bqjob_r5f956972f0190bdf_00000140c374bf42_2', u'selfLink': u'https://www.googleapis.com/bigquery/v2/projects/57288129629/jobs/bqjob_r5f956972f0190bdf_00000140c374bf42_2'}
\ No newline at end of file
diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py
new file mode 100644
index 0000000000000..89b048d472d5f
--- /dev/null
+++ b/pandas/io/tests/test_gbq.py
@@ -0,0 +1,450 @@
+import ast
+import nose
+import os
+import shutil
+import subprocess
+import unittest
+
+import numpy as np
+
+import pandas.io.gbq as gbq
+import pandas.util.testing as tm
+
+from pandas.core.frame import DataFrame
+from pandas.util.testing import with_connectivity_check
+from pandas import NaT
+
+
+try:
+ import bq
+ import bigquery_client
+ import gflags as flags
+except ImportError:
+ raise nose.SkipTest
+
+####################################################################################
+# Fake Google BigQuery Client
+
+class FakeClient:
+ def __init__(self):
+        self.apiclient = FakeApiClient()
+
+    def GetTableSchema(self, table_dict):
+ retval = {'fields': [
+ {'type': 'STRING', 'name': 'corpus', 'mode': 'NULLABLE'},
+ {'type': 'INTEGER', 'name': 'corpus_date', 'mode': 'NULLABLE'},
+ {'type': 'STRING', 'name': 'word', 'mode': 'NULLABLE'},
+ {'type': 'INTEGER', 'name': 'word_count', 'mode': 'NULLABLE'}
+ ]}
+ return retval
+
+# Fake Google BigQuery API Client
+class FakeApiClient:
+ def __init__(self):
+ self._tabledata = FakeTableData()
+
+ def tabledata(self):
+ return self._tabledata
+
+class FakeTableData:
+ def __init__(self):
+ self._list = FakeList()
+
+    def list(self, maxResults=None, pageToken=None, **table_dict):
+ return self._list
+
+class FakeList:
+ def execute(self):
+ return {'rows': [ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'brave'}, {'v': '3'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'attended'}, {'v': '1'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'treason'}, {'v': '1'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'islanders'}, {'v': '1'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'heed'}, {'v': '3'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'alehouse'}, {'v': '1'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'corrigible'}, {'v': '1'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'brawl'}, {'v': '2'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': "'"}, {'v': '17'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'troubled'}, {'v': '1'}]}
+ ],
+ 'kind': 'bigquery#tableDataList',
+ 'etag': '"4PTsVxg68bQkQs1RJ1Ndewqkgg4/hoRHzb4qfhJAIa2mEewC-jhs9Bg"',
+ 'totalRows': '10'}
+
+####################################################################################
+
+class test_gbq(unittest.TestCase):
+    def setUp(self):
+        # Path to the canned job response shipped in tests/data (assumed
+        # location; mirrors the file added alongside this module)
+        self.fake_job_path = os.path.join(os.path.dirname(__file__),
+                                          'data', 'gbq_fake_job.txt')
+        with open(self.fake_job_path, 'r') as fin:
+ self.fake_job = ast.literal_eval(fin.read())
+
+ self.test_data_small = [{'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'brave'}, {'v': '3'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'attended'}, {'v': '1'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'treason'}, {'v': '1'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'islanders'}, {'v': '1'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'heed'}, {'v': '3'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'alehouse'}, {'v': '1'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'corrigible'}, {'v': '1'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'brawl'}, {'v': '2'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': "'"}, {'v': '17'}]},
+ {'f': [{'v': 'othello'}, {'v': '1603'}, {'v': 'troubled'},
+ {'v': '1'}]}]
+
+ self.correct_data_small = np.array(
+ [('othello', 1603, 'brave', 3),
+ ('othello', 1603, 'attended', 1),
+ ('othello', 1603, 'treason', 1),
+ ('othello', 1603, 'islanders', 1),
+ ('othello', 1603, 'heed', 3),
+ ('othello', 1603, 'alehouse', 1),
+ ('othello', 1603, 'corrigible', 1),
+ ('othello', 1603, 'brawl', 2),
+ ('othello', 1603, "'", 17),
+ ('othello', 1603, 'troubled', 1)
+ ],
+ dtype=[('corpus', 'S16'),
+ ('corpus_date', '