diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 00000000..5ede71d7
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,16 @@
+# pandas-gbq benchmarks
+
+This directory contains a few scripts which are useful for performance
+testing the pandas-gbq library. Use cProfile to time the script and see
+details about where time is spent. To avoid timing how long BigQuery takes to
+execute a query, run the benchmark twice to ensure the results are cached.
+
+## `read_gbq`
+
+Read a small table (a few KB).
+
+    python -m cProfile --sort=cumtime read_gbq_small_results.py
+
+Read a large-ish table (100+ MB).
+
+    python -m cProfile --sort=cumtime read_gbq_large_results.py
diff --git a/benchmark/read_gbq_large_results.py b/benchmark/read_gbq_large_results.py
new file mode 100644
index 00000000..98d9ff53
--- /dev/null
+++ b/benchmark/read_gbq_large_results.py
@@ -0,0 +1,8 @@
+import pandas_gbq
+
+# Select 163 MB worth of data, to time how long it takes to download large
+# result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
+    dialect="standard",
+)
diff --git a/benchmark/read_gbq_small_results.py b/benchmark/read_gbq_small_results.py
new file mode 100644
index 00000000..8e91b0a0
--- /dev/null
+++ b/benchmark/read_gbq_small_results.py
@@ -0,0 +1,7 @@
+import pandas_gbq
+
+# Select a few KB worth of data, to time downloading small result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
+    dialect="standard",
+)
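The commands above print cProfile's report directly to the terminal. As an optional sketch (not part of this change), the profile can also be written to a file with `-o` and inspected from Python via the standard-library `pstats` module; the filename `read_gbq.prof` is an arbitrary example:

    # Record a profile first, e.g.:
    #   python -m cProfile -o read_gbq.prof read_gbq_small_results.py
    import pstats

    # Load the recorded profile and print the 20 biggest entries by
    # cumulative time, mirroring the --sort=cumtime view above.
    stats = pstats.Stats("read_gbq.prof")
    stats.sort_stats("cumtime").print_stats(20)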
diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip
index 48ceb439..10300f12 100644
--- a/ci/requirements-2.7.pip
+++ b/ci/requirements-2.7.pip
@@ -2,5 +2,5 @@ mock
 pandas==0.17.1
 google-auth==1.4.1
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==0.32.0
+google-cloud-bigquery==1.9.0
 pydata-google-auth==0.1.2
diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip
index 980d0700..41a41891 100644
--- a/ci/requirements-3.5.pip
+++ b/ci/requirements-3.5.pip
@@ -1,5 +1,5 @@
 pandas==0.19.0
 google-auth==1.4.1
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==0.32.0
+google-cloud-bigquery==1.9.0
 pydata-google-auth==0.1.2
\ No newline at end of file
diff --git a/ci/requirements-3.6-0.20.1.conda b/ci/requirements-3.6-0.20.1.conda
index a057399d..1c7eb3f2 100644
--- a/ci/requirements-3.6-0.20.1.conda
+++ b/ci/requirements-3.6-0.20.1.conda
@@ -1,6 +1,5 @@
-google-auth
-google-auth-oauthlib
-google-cloud-bigquery==0.32.0
+pydata-google-auth
+google-cloud-bigquery==1.9.0
 pytest
 pytest-cov
 codecov
diff --git a/ci/run_conda.sh b/ci/run_conda.sh
index 59769328..60ae6ff0 100755
--- a/ci/run_conda.sh
+++ b/ci/run_conda.sh
@@ -21,7 +21,7 @@ fi
 REQ="ci/requirements-${PYTHON}-${PANDAS}"
 conda install -q --file "$REQ.conda";

-python setup.py develop
+python setup.py develop --no-deps

 # Run the tests
 $DIR/run_tests.sh
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 6f3aa5cd..60861dea 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -1,6 +1,24 @@
 Changelog
 =========

+.. _changelog-0.10.0:
+
+0.10.0 / TBD
+------------
+
+Dependency updates
+~~~~~~~~~~~~~~~~~~
+
+- Update the minimum version of ``google-cloud-bigquery`` to 1.9.0.
+  (:issue:`247`)
+
+Internal changes
+~~~~~~~~~~~~~~~~
+
+- Use ``to_dataframe()`` from ``google-cloud-bigquery`` in the ``read_gbq()``
+  function. (:issue:`247`)
+
+
 .. _changelog-0.9.0:

 0.9.0 / 2019-01-11
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 948fd980..13d65669 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -1,11 +1,9 @@
 import logging
 import time
 import warnings
-from collections import OrderedDict
 from datetime import datetime

 import numpy as np
-from pandas import DataFrame

 from pandas_gbq.exceptions import AccessDenied

@@ -37,7 +35,7 @@ def _check_google_client_version():
         raise ImportError("Could not import pkg_resources (setuptools).")

     # https://github.com/GoogleCloudPlatform/google-cloud-python/blob/master/bigquery/CHANGELOG.md
-    bigquery_minimum_version = pkg_resources.parse_version("0.32.0")
+    bigquery_minimum_version = pkg_resources.parse_version("1.9.0")
     BIGQUERY_INSTALLED_VERSION = pkg_resources.get_distribution(
         "google-cloud-bigquery"
     ).parsed_version
@@ -482,15 +480,16 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
-        result_rows = list(rows_iter)
-        total_rows = rows_iter.total_rows
-        schema = {
-            "fields": [field.to_api_repr() for field in rows_iter.schema]
-        }

-        logger.debug("Got {} rows.\n".format(total_rows))
+        schema_fields = [field.to_api_repr() for field in rows_iter.schema]
+        nullsafe_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
+        df = rows_iter.to_dataframe(dtypes=nullsafe_dtypes)
+
+        if df.empty:
+            df = _cast_empty_df_dtypes(schema_fields, df)

-        return schema, result_rows
+        logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
+        return df

     def load_data(
         self,
@@ -638,45 +637,62 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
         table.create(table_id, table_schema)


-def _parse_schema(schema_fields):
-    # see:
+def _bqschema_to_nullsafe_dtypes(schema_fields):
+    # Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
+    # default dtype choice.
+    #
+    # See:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
     dtype_map = {
         "FLOAT": np.dtype(float),
+        # Even though TIMESTAMPs are timezone-aware in BigQuery, pandas
+        # doesn't support datetime64[ns, UTC] as dtype in DataFrame
+        # constructors. See:
+        # https://github.com/pandas-dev/pandas/issues/12513
         "TIMESTAMP": "datetime64[ns]",
         "TIME": "datetime64[ns]",
         "DATE": "datetime64[ns]",
         "DATETIME": "datetime64[ns]",
-        "BOOLEAN": bool,
-        "INTEGER": np.int64,
     }

+    dtypes = {}
     for field in schema_fields:
         name = str(field["name"])
         if field["mode"].upper() == "REPEATED":
-            yield name, object
-        else:
-            dtype = dtype_map.get(field["type"].upper())
-            yield name, dtype
+            continue
+
+        dtype = dtype_map.get(field["type"].upper())
+        if dtype:
+            dtypes[name] = dtype

+    return dtypes

-def _parse_data(schema, rows):
-    column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
-    df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())

-    for column in df:
-        dtype = column_dtypes[column]
-        null_safe = (
-            df[column].notnull().all()
-            or dtype == float
-            or dtype == "datetime64[ns]"
+def _cast_empty_df_dtypes(schema_fields, df):
+    """Cast any columns in an empty dataframe to the correct dtype.
+
+    In an empty dataframe, pandas cannot choose a dtype unless one is
+    explicitly provided. The _bqschema_to_nullsafe_dtypes() function only
+    provides dtypes when the dtype safely handles null values. This means
+    that empty int64 and boolean columns are incorrectly classified as
+    ``object``.
+    """
+    if not df.empty:
+        raise ValueError(
+            "DataFrame must be empty in order to cast non-nullsafe dtypes"
         )
-        if dtype and null_safe:
-            df[column] = df[column].astype(
-                column_dtypes[column], errors="ignore"
-            )
+
+    dtype_map = {"BOOLEAN": bool, "INTEGER": np.int64}
+
+    for field in schema_fields:
+        column = str(field["name"])
+        if field["mode"].upper() == "REPEATED":
+            continue
+
+        dtype = dtype_map.get(field["type"].upper())
+        if dtype:
+            df[column] = df[column].astype(dtype)
+
     return df
@@ -825,8 +841,8 @@ def read_gbq(
         credentials=credentials,
         private_key=private_key,
     )
-    schema, rows = connector.run_query(query, configuration=configuration)
-    final_df = _parse_data(schema, rows)
+
+    final_df = connector.run_query(query, configuration=configuration)

     # Reindex the DataFrame on the provided column
     if index_col is not None:
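For orientation, here is a minimal standalone sketch of the download path that the new `run_query()` follows, written against the google-cloud-bigquery 1.9.0 API. It assumes application-default credentials and a project with BigQuery enabled; the query text and variable names are illustrative only, not part of the patch:

    from google.cloud import bigquery

    client = bigquery.Client()
    rows_iter = client.query("SELECT 1 AS x, CURRENT_TIMESTAMP() AS ts").result()

    # As in _bqschema_to_nullsafe_dtypes(): request an explicit dtype only
    # where it can represent NULL values, and let pandas infer the rest.
    schema_fields = [field.to_api_repr() for field in rows_iter.schema]
    nullsafe_dtypes = {
        field["name"]: "datetime64[ns]"
        for field in schema_fields
        if field["type"].upper() == "TIMESTAMP"
    }
    df = rows_iter.to_dataframe(dtypes=nullsafe_dtypes)

The `dtypes` argument to `RowIterator.to_dataframe()` is only available in recent versions of google-cloud-bigquery, which matches the minimum-version bump to 1.9.0 throughout this patch.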
+ """ + if not df.empty: + raise ValueError( + "DataFrame must be empty in order to cast non-nullsafe dtypes" ) - if dtype and null_safe: - df[column] = df[column].astype( - column_dtypes[column], errors="ignore" - ) + + dtype_map = {"BOOLEAN": bool, "INTEGER": np.int64} + + for field in schema_fields: + column = str(field["name"]) + if field["mode"].upper() == "REPEATED": + continue + + dtype = dtype_map.get(field["type"].upper()) + if dtype: + df[column] = df[column].astype(dtype) + return df @@ -825,8 +841,8 @@ def read_gbq( credentials=credentials, private_key=private_key, ) - schema, rows = connector.run_query(query, configuration=configuration) - final_df = _parse_data(schema, rows) + + final_df = connector.run_query(query, configuration=configuration) # Reindex the DataFrame on the provided column if index_col is not None: diff --git a/setup.py b/setup.py index e53d43f5..e5e40505 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ def readme(): "pydata-google-auth", "google-auth", "google-auth-oauthlib", - "google-cloud-bigquery>=0.32.0", + "google-cloud-bigquery>=1.9.0", ] extras = {"tqdm": "tqdm>=4.23.0"} diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py index dde34cb1..82753a38 100644 --- a/tests/system/test_gbq.py +++ b/tests/system/test_gbq.py @@ -6,11 +6,12 @@ import google.oauth2.service_account import numpy as np +import pandas import pandas.util.testing as tm -import pytest -import pytz from pandas import DataFrame, NaT, compat from pandas.compat import range, u +import pytest +import pytz from pandas_gbq import gbq @@ -138,14 +139,6 @@ def test_should_be_able_to_get_a_bigquery_client(self, gbq_connector): bigquery_client = gbq_connector.get_client() assert bigquery_client is not None - def test_should_be_able_to_get_schema_from_query(self, gbq_connector): - schema, pages = gbq_connector.run_query("SELECT 1") - assert schema is not None - - def test_should_be_able_to_get_results_from_query(self, gbq_connector): - schema, pages = gbq_connector.run_query("SELECT 1") - assert pages is not None - def test_should_read(project, credentials): query = 'SELECT "PI" AS valid_string' @@ -319,7 +312,8 @@ def test_should_properly_handle_timestamp_unix_epoch(self, project_id): tm.assert_frame_equal( df, DataFrame( - {"unix_epoch": [np.datetime64("1970-01-01T00:00:00.000000Z")]} + {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]}, + dtype="datetime64[ns]", ), ) @@ -334,11 +328,38 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id): tm.assert_frame_equal( df, DataFrame( - { - "valid_timestamp": [ - np.datetime64("2004-09-15T05:00:00.000000Z") - ] - } + {"valid_timestamp": ["2004-09-15T05:00:00.000000Z"]}, + dtype="datetime64[ns]", + ), + ) + + def test_should_properly_handle_datetime_unix_epoch(self, project_id): + query = 'SELECT DATETIME("1970-01-01 00:00:00") AS unix_epoch' + df = gbq.read_gbq( + query, + project_id=project_id, + credentials=self.credentials, + dialect="legacy", + ) + tm.assert_frame_equal( + df, + DataFrame( + {"unix_epoch": ["1970-01-01T00:00:00"]}, dtype="datetime64[ns]" + ), + ) + + def test_should_properly_handle_arbitrary_datetime(self, project_id): + query = 'SELECT DATETIME("2004-09-15 05:00:00") AS valid_timestamp' + df = gbq.read_gbq( + query, + project_id=project_id, + credentials=self.credentials, + dialect="legacy", + ) + tm.assert_frame_equal( + df, + DataFrame( + {"valid_timestamp": [np.datetime64("2004-09-15T05:00:00")]} ), ) @@ -346,7 +367,7 @@ def test_should_properly_handle_arbitrary_timestamp(self, project_id): 
"expression, type_", [ ("current_date()", "