diff --git a/doc/source/api.rst b/doc/source/api.rst
index 46d77d0dcceb7..5706fa7864ed5 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -89,8 +89,19 @@ SQL

    read_frame
    write_frame

+Google BigQuery
+~~~~~~~~~~~~~~~
+.. currentmodule:: pandas.io.gbq
+
+.. autosummary::
+   :toctree: generated/
+
+   read_gbq
+   to_gbq
+
+.. currentmodule:: pandas
+
 STATA
 ~~~~~

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 3e9359743b7a4..e75de91582b49 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -2932,56 +2932,76 @@ if the source datatypes are compatible with BigQuery ones.

 For specifics on the service itself, see `here <https://developers.google.com/bigquery/>`__

 As an example, suppose you want to load all data from an existing table
 ``test_dataset.test_table`` into BigQuery and pull it into a ``DataFrame``.

-::
+.. code-block:: python

+    import pandas
     from pandas.io import gbq
-    data_frame = gbq.read_gbq('SELECT * FROM test_dataset.test_table')
+
+    # Insert your BigQuery Project ID here. It can be found in the
+    # web console, or with the command line tool: `bq ls`
+    projectid = "xxxxxxxx"
+
+    data_frame = gbq.read_gbq('SELECT * FROM test_dataset.test_table',
+                              project_id=projectid)

 The user will then be authenticated by the ``bq`` command line client -
 this usually involves the default browser opening to a login page,
 though the process can be done entirely from command line if necessary.
-Datasets and additional parameters can be either configured with ``bq``,
-passed in as options to :func:`~pandas.read_gbq`, or set using Google's
-``gflags`` (this is not officially supported by this module, though care was
-taken to ensure that they should be followed regardless of how you call the
+Datasets and additional parameters can be configured with ``bq``, passed
+in as options to :func:`~pandas.io.gbq.read_gbq`, or set using Google's
+``gflags`` (this is not officially supported by this module, though care was
+taken to ensure that they should be honored regardless of how you call the
 method).

 Additionally, you can define which column to use as an index as well as a preferred column order as follows:

-::
+.. code-block:: python

     data_frame = gbq.read_gbq('SELECT * FROM test_dataset.test_table',
                               index_col='index_column_name',
-                              col_order='[col1, col2, col3,...]')
+                              col_order=['col1', 'col2', 'col3', ...],
+                              project_id=projectid)

-Finally, if you would like to create a BigQuery table, `my_dataset.my_table`,
-from the rows of DataFrame, `df`:
+Finally, if you would like to create a BigQuery table, ``my_dataset.my_table``,
+from the rows of the DataFrame ``df``:

-::
+.. code-block:: python

     df = pandas.DataFrame({'string_col_name': ['hello'],
                            'integer_col_name': [1],
                            'boolean_col_name': [True]})
     schema = ['STRING', 'INTEGER', 'BOOLEAN']
-    data_frame = gbq.to_gbq(df, 'my_dataset.my_table', if_exists='fail',
-                            schema=schema)
+    data_frame = gbq.to_gbq(df, 'my_dataset.my_table', if_exists='fail',
+                            schema=schema, project_id=projectid)

 To add more rows to this, simply:

-::
+.. code-block:: python

     df2 = pandas.DataFrame({'string_col_name': ['hello2'],
                             'integer_col_name': [2],
                             'boolean_col_name': [False]})
-    data_frame = gbq.to_gbq(df2, 'my_dataset.my_table', if_exists='append')
+    data_frame = gbq.to_gbq(df2, 'my_dataset.my_table', if_exists='append',
+                            project_id=projectid)
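+
+The same operation is also available as a method on the ``DataFrame``
+itself (a thin wrapper around ``gbq.to_gbq`` added in this change).
+A minimal sketch, reusing ``df2`` and ``projectid`` from above:
+
+.. code-block:: python
+
+    df2.to_gbq('my_dataset.my_table', if_exists='append',
+               project_id=projectid)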

 .. note::

-    There is a hard cap on BigQuery result sets, at 128MB compressed. Also, the
-    BigQuery SQL query language has some oddities, see `here
-    <https://developers.google.com/bigquery/query-reference>`__
+    A default project id can be set using the command line:
+    ``bq init``.
+
+    There is a hard cap on BigQuery result sets, at 128MB compressed. Also,
+    the BigQuery SQL query language has some oddities, see `here
+    <https://developers.google.com/bigquery/query-reference>`__
+
+    You can access the management console to determine project IDs at:
+    <https://code.google.com/apis/console/b/0/?noredirect>
+
+.. warning::
+
+   To use this module, you will need a BigQuery account. See
+   <https://cloud.google.com/products/big-query> for details.
+
+   As of 10/10/13, there is a bug in Google's API preventing result sets
+   from being larger than 100,000 rows. A patch is scheduled for the week of
+   10/14/13.

 .. _io.stata:

diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
index 6bf32b2343084..14e120fdff672 100644
--- a/doc/source/v0.13.0.txt
+++ b/doc/source/v0.13.0.txt
@@ -8,7 +8,7 @@ enhancements along with a large number of bug fixes.

 Highlights include support for a new index type ``Float64Index``, support for
 new methods of interpolation, updated ``timedelta`` operations, and a new
 string manipulation method ``extract``. Several experimental features are
 added, including new ``eval/query`` methods for expression evaluation,
 support for ``msgpack`` serialization,
-and an io interface to google's ``BigQuery``.
+and an io interface to Google's ``BigQuery``.
@@ -648,6 +648,69 @@ Experimental

     os.remove('foo.msg')

+- ``pandas.io.gbq`` provides a simple way to extract from, and load data
+  into, Google's BigQuery Data Sets by way of pandas DataFrames. BigQuery
+  is a high-performance SQL-like database service, useful for performing
+  ad-hoc queries against extremely large datasets. :ref:`See the docs <io.bigquery>`
+
+  .. code-block:: python
+
+     import pandas
+     from pandas.io import gbq
+
+     # A query to select the average monthly temperatures in
+     # the year 2000 across the USA. The dataset,
+     # publicdata:samples.gsod, is available on all BigQuery accounts,
+     # and is based on NOAA gsod data.
+
+     query = """SELECT station_number as STATION,
+                       month as MONTH, AVG(mean_temp) as MEAN_TEMP
+                FROM publicdata:samples.gsod
+                WHERE YEAR = 2000
+                GROUP BY STATION, MONTH
+                ORDER BY STATION, MONTH ASC"""
+
+     # Fetch the result set for this query
+
+     # Your Google BigQuery Project ID
+     # To find this, see your dashboard:
+     # https://code.google.com/apis/console/b/0/?noredirect
+     projectid = "xxxxxxxxx"
+
+     df = gbq.read_gbq(query, project_id=projectid)
+
+     # Use pandas to process and reshape the dataset
+
+     df2 = df.pivot(index='STATION', columns='MONTH', values='MEAN_TEMP')
+     df3 = pandas.concat([df2.min(), df2.mean(), df2.max()],
+                         axis=1, keys=["Min Temp", "Mean Temp", "Max Temp"])
+
+  The resulting ``DataFrame`` is::
+
+              Min Temp  Mean Temp    Max Temp
+     MONTH
+     1     -53.336667  39.827892   89.770968
+     2     -49.837500  43.685219   93.437932
+     3     -77.926087  48.708355   96.099998
+     4     -82.892858  55.070087   97.317240
+     5     -92.378261  61.428117  102.042856
+     6     -77.703334  65.858888  102.900000
+     7     -87.821428  68.169663  106.510714
+     8     -89.431999  68.614215  105.500000
+     9     -86.611112  63.436935  107.142856
+     10    -78.209677  56.880838   92.103333
+     11    -50.125000  48.861228   94.996428
+     12    -50.332258  42.286879   94.396774
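+
+  The reshaping step itself does not require BigQuery access; a minimal
+  sketch of the same ``pivot``/``concat`` pattern, using made-up values in
+  place of the query result:
+
+  .. code-block:: python
+
+     import pandas
+
+     df = pandas.DataFrame({'STATION': [1, 1, 2, 2],
+                            'MONTH': [1, 2, 1, 2],
+                            'MEAN_TEMP': [39.8, 43.7, 39.9, 43.6]})
+     df2 = df.pivot(index='STATION', columns='MONTH', values='MEAN_TEMP')
+     df3 = pandas.concat([df2.min(), df2.mean(), df2.max()],
+                         axis=1, keys=["Min Temp", "Mean Temp", "Max Temp"])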
+
+  .. warning::
+
+     To use this module, you will need a BigQuery account. See
+     <https://cloud.google.com/products/big-query> for details.
+
+     As of 10/10/13, there is a bug in Google's API preventing result sets
+     from being larger than 100,000 rows. A patch is scheduled for the week
+     of 10/14/13.
+
 .. _whatsnew_0130.refactoring:

 Internal Refactoring

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 967da6102ae1a..7013ad4f9b02b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -671,6 +671,41 @@ def to_dict(self, outtype='dict'):
         else:  # pragma: no cover
             raise ValueError("outtype %s not understood" % outtype)

+    def to_gbq(self, destination_table, schema=None, col_order=None,
+               if_exists='fail', **kwargs):
+        """
+        Write a DataFrame to a Google BigQuery table.
+
+        If the table exists, the DataFrame will be appended. If not, a new
+        table will be created, in which case the schema will have to be
+        specified. By default, rows will be written in the order they appear
+        in the DataFrame, though the user may specify an alternative order.
+
+        Parameters
+        ----------
+        destination_table : string
+            name of table to be written, in the form 'dataset.tablename'
+        schema : sequence, optional
+            list of column types in order for data to be inserted,
+            e.g. ['INTEGER', 'TIMESTAMP', 'BOOLEAN']
+        col_order : sequence, optional
+            order in which columns are to be inserted,
+            e.g. ['primary_key', 'birthday', 'username']
+        if_exists : {'fail', 'replace', 'append'}, default 'fail'
+            fail: If table exists, do nothing.
+            replace: If table exists, drop it, recreate it, and insert data.
+            append: If table exists, insert data. Create if does not exist.
+        kwargs are passed to the Client constructor
+
+        Raises
+        ------
+        SchemaMissing :
+            Raised if 'if_exists' is set to 'replace' but no schema is
+            specified
+        TableExists :
+            Raised if the specified 'destination_table' exists but
+            'if_exists' is set to 'fail' (the default)
+        InvalidSchema :
+            Raised if the 'schema' parameter does not match the provided
+            DataFrame
+        """
+
+        from pandas.io import gbq
+        # Forward the caller's arguments rather than re-passing the defaults,
+        # so that schema, col_order, and if_exists actually take effect.
+        return gbq.to_gbq(self, destination_table, schema=schema,
+                          col_order=col_order, if_exists=if_exists, **kwargs)
+
     @classmethod
     def from_records(cls, data, index=None, exclude=None, columns=None,
                      coerce_float=False, nrows=None):
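
For reference, a quick exercise of the new ``DataFrame.to_gbq`` wrapper.
This is a sketch only: the dataset, table, and project id are placeholders,
and a configured BigQuery account is assumed.

.. code-block:: python

    import pandas

    df = pandas.DataFrame({'string_col_name': ['hello'],
                           'integer_col_name': [1],
                           'boolean_col_name': [True]})

    # schema is only needed when the destination table does not exist yet
    df.to_gbq('my_dataset.my_table',
              schema=['STRING', 'INTEGER', 'BOOLEAN'],
              if_exists='fail',
              project_id='xxxxxxxx')  # placeholder project id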