diff --git a/doc/source/api.rst b/doc/source/api.rst
index 46d77d0dcceb7..5706fa7864ed5 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -89,8 +89,19 @@ SQL
read_frame
write_frame
+
+Google BigQuery
+~~~~~~~~~~~~~~~
+.. currentmodule:: pandas.io.gbq
+
+.. autosummary::
+ :toctree: generated/
+
+ read_gbq
+ to_gbq
+
.. currentmodule:: pandas
+
STATA
~~~~~
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 3e9359743b7a4..e75de91582b49 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2932,56 +2932,76 @@ if the source datatypes are compatible with BigQuery ones.
For specifics on the service itself, see `here `__
As an example, suppose you want to load all data from an existing table
-``test_dataset.test_table`` into BigQuery and pull it into a ``DataFrame``.
+``test_dataset.test_table`` in BigQuery and pull it into a ``DataFrame``.
-::
+.. code-block:: python
from pandas.io import gbq
- data_frame = gbq.read_gbq('SELECT * FROM test_dataset.test_table')
+
+    # Insert your BigQuery project ID here. It can be found in the
+    # web console, or with the command line tool: bq ls
+    projectid = "xxxxxxxx"
+
+    data_frame = gbq.read_gbq('SELECT * FROM test_dataset.test_table',
+                              project_id=projectid)
 The user will then be authenticated by the ``bq`` command line client -
this usually involves the default browser opening to a login page,
though the process can be done entirely from command line if necessary.
-Datasets and additional parameters can be either configured with ``bq``,
-passed in as options to :func:`~pandas.read_gbq`, or set using Google's
-``gflags`` (this is not officially supported by this module, though care was
-taken to ensure that they should be followed regardless of how you call the
+Datasets and additional parameters can be either configured with ``bq``,
+passed in as options to ``read_gbq``, or set using Google's ``gflags`` (this
+is not officially supported by this module, though care was taken to ensure
+that such settings are respected regardless of how you call the
method).
Additionally, you can define which column to use as an index as well as a preferred column order as follows:
-::
+.. code-block:: python
data_frame = gbq.read_gbq('SELECT * FROM test_dataset.test_table',
index_col='index_column_name',
- col_order='[col1, col2, col3,...]')
+                           col_order=['col1', 'col2', 'col3'],
+                           project_id=projectid)
-Finally, if you would like to create a BigQuery table, `my_dataset.my_table`,
-from the rows of DataFrame, `df`:
+Finally, if you would like to create a BigQuery table, ``my_dataset.my_table``,
+from the rows of the DataFrame ``df``:
-::
+.. code-block:: python
+    import pandas
+
     df = pandas.DataFrame({'string_col_name': ['hello'],
                            'integer_col_name': [1],
                            'boolean_col_name': [True]})
     schema = ['STRING', 'INTEGER', 'BOOLEAN']
-    data_frame = gbq.to_gbq(df, 'my_dataset.my_table', if_exists='fail',
-                            schema=schema)
+    gbq.to_gbq(df, 'my_dataset.my_table', if_exists='fail',
+               schema=schema, project_id=projectid)
To add more rows to this, simply:
-::
+.. code-block:: python
     df2 = pandas.DataFrame({'string_col_name': ['hello2'],
                             'integer_col_name': [2],
                             'boolean_col_name': [False]})
-    data_frame = gbq.to_gbq(df2, 'my_dataset.my_table', if_exists='append')
+    gbq.to_gbq(df2, 'my_dataset.my_table', if_exists='append',
+               project_id=projectid)
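+
+If you instead need to rebuild the table from scratch, ``if_exists='replace'``
+drops and recreates the table before inserting; note that a ``schema`` is
+required in that case. A minimal sketch, reusing ``df``, ``schema``, and
+``projectid`` from above:
+
+.. code-block:: python
+
+    gbq.to_gbq(df, 'my_dataset.my_table', if_exists='replace',
+               schema=schema, project_id=projectid)
+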
.. note::
- There is a hard cap on BigQuery result sets, at 128MB compressed. Also, the
- BigQuery SQL query language has some oddities, see `here
- `__
+   A default project ID can be set using the command line:
+   ``bq init``.
+
+   There is a hard cap on BigQuery result sets, at 128MB compressed. Also,
+   the BigQuery SQL query language has some oddities, see `here
+   `__
+
+   You can access the management console to determine project IDs at:
+
+
+.. warning::
+
+   To use this module, you will need a BigQuery account. See
+   for details.
+
+   As of 10/10/13, there is a bug in Google's API preventing result sets
+   from being larger than 100,000 rows. A patch is scheduled for the week of
+   10/14/13.
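+
+   Until that patch is released, a possible workaround (a sketch, not an
+   official recommendation) is to cap the size of the result set in the
+   query itself, for example with SQL's ``LIMIT`` clause:
+
+   .. code-block:: python
+
+       # Keep the result set below the current 100,000-row API limit.
+       df = gbq.read_gbq('SELECT * FROM test_dataset.test_table LIMIT 99999',
+                         project_id=projectid)
+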
.. _io.stata:
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
index 6bf32b2343084..14e120fdff672 100644
--- a/doc/source/v0.13.0.txt
+++ b/doc/source/v0.13.0.txt
@@ -8,7 +8,7 @@ enhancements along with a large number of bug fixes.
Highlights include support for a new index type ``Float64Index``, support for new methods of interpolation, updated ``timedelta`` operations, and a new string manipulation method ``extract``.
Several experimental features are added, including new ``eval/query`` methods for expression evaluation, support for ``msgpack`` serialization,
-and an io interface to google's ``BigQuery``.
+and an io interface to Google's ``BigQuery``.
.. warning::
@@ -648,6 +648,69 @@ Experimental
os.remove('foo.msg')
+- ``pandas.io.gbq`` provides a simple way to extract from, and load data into,
+  Google's BigQuery datasets by way of pandas DataFrames. BigQuery is a
+  high-performance SQL-like database service, useful for performing ad-hoc
+  queries against extremely large datasets. :ref:`See the docs`
+
+ .. code-block:: python
+
+    import pandas
+    from pandas.io import gbq
+
+    # A query to select the average monthly temperature
+    # in the year 2000 across the USA. The dataset,
+    # publicdata:samples.gsod, is available on all BigQuery accounts,
+    # and is based on NOAA gsod data.
+
+ query = """SELECT station_number as STATION,
+ month as MONTH, AVG(mean_temp) as MEAN_TEMP
+ FROM publicdata:samples.gsod
+ WHERE YEAR = 2000
+ GROUP BY STATION, MONTH
+ ORDER BY STATION, MONTH ASC"""
+
+    # Your Google BigQuery project ID; to find it, see your dashboard:
+    # https://code.google.com/apis/console/b/0/?noredirect
+    projectid = "xxxxxxxxx"
+
+    # Fetch the result set for this query
+    df = gbq.read_gbq(query, project_id=projectid)
+
+ # Use pandas to process and reshape the dataset
+
+ df2 = df.pivot(index='STATION', columns='MONTH', values='MEAN_TEMP')
+    df3 = pandas.concat([df2.min(), df2.mean(), df2.max()],
+                        axis=1, keys=["Min Temp", "Mean Temp", "Max Temp"])
+
+  The resulting DataFrame is::
+
+               Min Temp  Mean Temp    Max Temp
+    MONTH
+    1        -53.336667  39.827892   89.770968
+    2        -49.837500  43.685219   93.437932
+    3        -77.926087  48.708355   96.099998
+    4        -82.892858  55.070087   97.317240
+    5        -92.378261  61.428117  102.042856
+    6        -77.703334  65.858888  102.900000
+    7        -87.821428  68.169663  106.510714
+    8        -89.431999  68.614215  105.500000
+    9        -86.611112  63.436935  107.142856
+    10       -78.209677  56.880838   92.103333
+    11       -50.125000  48.861228   94.996428
+    12       -50.332258  42.286879   94.396774
+
+ .. warning::
+
+ To use this module, you will need a BigQuery account. See
+ for details.
+
+ As of 10/10/13, there is a bug in Google's API preventing result sets
+ from being larger than 100,000 rows. A patch is scheduled for the week of
+ 10/14/13.
+
.. _whatsnew_0130.refactoring:
Internal Refactoring
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 967da6102ae1a..7013ad4f9b02b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -671,6 +671,41 @@ def to_dict(self, outtype='dict'):
else: # pragma: no cover
raise ValueError("outtype %s not understood" % outtype)
+    def to_gbq(self, destination_table, schema=None, col_order=None,
+               if_exists='fail', **kwargs):
+ """
+        Write a DataFrame to a Google BigQuery table. If the table already
+        exists, the behaviour is controlled by the 'if_exists' parameter
+        (the write fails by default). If a new table is to be created, its
+        schema must be specified. Columns are inserted in the order they
+        appear in the DataFrame unless an alternative order is given via
+        'col_order'.
+
+        Parameters
+        ----------
+        destination_table : string
+            name of table to be written, in the form 'dataset.tablename'
+        schema : sequence (optional)
+            list of column types in order for data to be inserted,
+            e.g. ['INTEGER', 'TIMESTAMP', 'BOOLEAN']
+        col_order : sequence (optional)
+            order in which columns are to be inserted,
+            e.g. ['primary_key', 'birthday', 'username']
+        if_exists : {'fail', 'replace', 'append'} (optional)
+            - fail: If table exists, do nothing.
+            - replace: If table exists, drop it, recreate it, and insert data.
+            - append: If table exists, insert data. Create table if it does
+              not exist.
+        kwargs are passed to the Client constructor
+
+        Raises
+        ------
+        SchemaMissing
+            Raised if the 'if_exists' parameter is set to 'replace', but
+            no schema is specified
+        TableExists
+            Raised if the specified 'destination_table' exists but the
+            'if_exists' parameter is set to 'fail' (the default)
+        InvalidSchema
+            Raised if the 'schema' parameter does not match the provided
+            DataFrame
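+
+        Examples
+        --------
+        A minimal sketch; the table name, schema, and project ID below are
+        placeholders::
+
+            df.to_gbq('my_dataset.my_table',
+                      schema=['STRING', 'INTEGER', 'BOOLEAN'],
+                      if_exists='fail', project_id='xxxxxxxx')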
+ """
+
+ from pandas.io import gbq
+        # Forward the caller's arguments rather than re-passing the defaults.
+        return gbq.to_gbq(self, destination_table, schema=schema,
+                          col_order=col_order, if_exists=if_exists, **kwargs)
+
@classmethod
def from_records(cls, data, index=None, exclude=None, columns=None,
coerce_float=False, nrows=None):