diff --git a/ci/requirements-3.7-0.24.2.conda b/ci/requirements-3.7-0.24.2.conda
index e0323d92..a99bd59e 100644
--- a/ci/requirements-3.7-0.24.2.conda
+++ b/ci/requirements-3.7-0.24.2.conda
@@ -4,7 +4,8 @@ db-dtypes==0.3.1
 fastavro
 flake8
 numpy==1.16.6
-google-cloud-bigquery==1.11.1
+google-cloud-bigquery==1.27.2
+google-cloud-bigquery-storage==1.1.0
 pyarrow==3.0.0
 pydata-google-auth
 pytest
diff --git a/pandas_gbq/features.py b/pandas_gbq/features.py
index 4259eaf1..77535041 100644
--- a/pandas_gbq/features.py
+++ b/pandas_gbq/features.py
@@ -8,7 +8,10 @@
 BIGQUERY_MINIMUM_VERSION = "1.11.1"
 BIGQUERY_CLIENT_INFO_VERSION = "1.12.0"
 BIGQUERY_BQSTORAGE_VERSION = "1.24.0"
+BIGQUERY_ACCURATE_TIMESTAMP_VERSION = "2.6.0"
 BIGQUERY_FROM_DATAFRAME_CSV_VERSION = "2.6.0"
+BIGQUERY_SUPPORTS_BIGNUMERIC_VERSION = "2.10.0"
+BIGQUERY_NO_DATE_AS_OBJECT_VERSION = "3.0.0dev"
 PANDAS_VERBOSITY_DEPRECATION_VERSION = "0.23.0"
 PANDAS_BOOLEAN_DTYPE_VERSION = "1.0.0"
 PANDAS_PARQUET_LOSSLESS_TIMESTAMP_VERSION = "1.1.0"
@@ -42,6 +45,13 @@ def bigquery_installed_version(self):

         return self._bigquery_installed_version

+    @property
+    def bigquery_has_accurate_timestamp(self):
+        import pkg_resources
+
+        min_version = pkg_resources.parse_version(BIGQUERY_ACCURATE_TIMESTAMP_VERSION)
+        return self.bigquery_installed_version >= min_version
+
     @property
     def bigquery_has_client_info(self):
         import pkg_resources
@@ -51,6 +61,13 @@ def bigquery_has_client_info(self):
         )
         return self.bigquery_installed_version >= bigquery_client_info_version

+    @property
+    def bigquery_has_bignumeric(self):
+        import pkg_resources
+
+        min_version = pkg_resources.parse_version(BIGQUERY_SUPPORTS_BIGNUMERIC_VERSION)
+        return self.bigquery_installed_version >= min_version
+
     @property
     def bigquery_has_bqstorage(self):
         import pkg_resources
@@ -69,6 +86,13 @@ def bigquery_has_from_dataframe_with_csv(self):
         )
         return self.bigquery_installed_version >= bigquery_from_dataframe_version

+    @property
+    def bigquery_needs_date_as_object(self):
+        import pkg_resources
+
+        max_version = pkg_resources.parse_version(BIGQUERY_NO_DATE_AS_OBJECT_VERSION)
+        return self.bigquery_installed_version < max_version
+
     @property
     def pandas_installed_version(self):
         import pandas
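The new feature gates follow the existing pkg_resources comparison pattern in features.py. A minimal sketch of how they are meant to be consumed, mirroring the `_download_results` hunk in pandas_gbq/gbq.py below; passing `True` for `create_bqstorage_client` here is illustrative only, the real method forwards a caller-supplied flag:

    from pandas_gbq.features import FEATURES

    to_dataframe_kwargs = {}
    if FEATURES.bigquery_has_bqstorage:
        # google-cloud-bigquery >= 1.24.0 accepts create_bqstorage_client.
        to_dataframe_kwargs["create_bqstorage_client"] = True
    if FEATURES.bigquery_needs_date_as_object:
        # Before google-cloud-bigquery 3.0.0dev, request DATE values as
        # objects so out-of-range dates are not forced into datetime64[ns].
        to_dataframe_kwargs["date_as_object"] = True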
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index 0a18cc3a..feca5e2a 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -2,13 +2,13 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.

+from datetime import datetime
 import logging
 import re
 import time
-import warnings
-from datetime import datetime
 import typing
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional, Sequence, Union
+import warnings

 import numpy as np

@@ -37,14 +37,13 @@
 import pandas_gbq.schema
 import pandas_gbq.timestamp

-
-logger = logging.getLogger(__name__)
-
 try:
     import tqdm  # noqa
 except ImportError:
     tqdm = None

+logger = logging.getLogger(__name__)
+

 def _test_google_api_imports():
     try:
@@ -52,6 +51,11 @@
     except ImportError as ex:
         raise ImportError("pandas-gbq requires setuptools") from ex

+    try:
+        import db_dtypes  # noqa
+    except ImportError as ex:
+        raise ImportError("pandas-gbq requires db-dtypes") from ex
+
     try:
         import pydata_google_auth  # noqa
     except ImportError as ex:
@@ -546,6 +550,8 @@ def _download_results(
         to_dataframe_kwargs = {}
         if FEATURES.bigquery_has_bqstorage:
             to_dataframe_kwargs["create_bqstorage_client"] = create_bqstorage_client
+        if FEATURES.bigquery_needs_date_as_object:
+            to_dataframe_kwargs["date_as_object"] = True

         try:
             schema_fields = [field.to_api_repr() for field in rows_iter.schema]
@@ -559,11 +565,7 @@ def _download_results(
         except self.http_error as ex:
             self.process_http_error(ex)

-        if df.empty:
-            df = _cast_empty_df_dtypes(schema_fields, df)
-
-        # Ensure any TIMESTAMP columns are tz-aware.
-        df = pandas_gbq.timestamp.localize_df(df, schema_fields)
+        df = _finalize_dtypes(df, schema_fields)

         logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
         return df
@@ -617,23 +619,18 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
     See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     #missing-data-casting-rules-and-indexing
     """
+    import db_dtypes
+
     # If you update this mapping, also update the table at
     # `docs/reading.rst`.
     dtype_map = {
-        "DATE": "datetime64[ns]",
-        "DATETIME": "datetime64[ns]",
         "FLOAT": np.dtype(float),
-        "GEOMETRY": "object",
         "INTEGER": "Int64",
-        "RECORD": "object",
-        "STRING": "object",
-        # datetime.time objects cannot be case to datetime64.
-        # https://github.com/pydata/pandas-gbq/issues/328
-        "TIME": "object",
-        # pandas doesn't support timezone-aware dtype in DataFrame/Series
-        # constructors. It's more idiomatic to localize after construction.
-        # https://github.com/pandas-dev/pandas/issues/25843
-        "TIMESTAMP": "datetime64[ns]",
+        "TIME": db_dtypes.TimeDtype(),
+        # Note: Other types such as 'datetime64[ns]' and db_dtypes.DateDtype()
+        # are not included because the pandas range does not align with the
+        # BigQuery range. We need to attempt a conversion to those types and
+        # fall back to 'object' when there are out-of-range values.
     }

     # Amend dtype_map with newer extension types if pandas version allows.
@@ -656,28 +653,43 @@ def _bqschema_to_nullsafe_dtypes(schema_fields):
     return dtypes


-def _cast_empty_df_dtypes(schema_fields, df):
-    """Cast any columns in an empty dataframe to correct type.
+def _finalize_dtypes(
+    df: "pandas.DataFrame", schema_fields: Sequence[Dict[str, Any]]
+) -> "pandas.DataFrame":
+    """
+    Attempt to change the dtypes of those columns that don't map exactly.

-    In an empty dataframe, pandas cannot choose a dtype unless one is
-    explicitly provided. The _bqschema_to_nullsafe_dtypes() function only
-    provides dtypes when the dtype safely handles null values. This means
-    that empty int64 and boolean columns are incorrectly classified as
-    ``object``.
+    For example db_dtypes.DateDtype() and datetime64[ns] cannot represent
+    0001-01-01, but they can represent dates within a couple hundred years of
+    1970. See:
+    https://github.com/googleapis/python-bigquery-pandas/issues/365
     """
-    if not df.empty:
-        raise ValueError("DataFrame must be empty in order to cast non-nullsafe dtypes")
+    import db_dtypes
+    import pandas.api.types

-    dtype_map = {"BOOLEAN": bool, "INTEGER": np.int64}
+    # If you update this mapping, also update the table at
+    # `docs/reading.rst`.
+    dtype_map = {
+        "DATE": db_dtypes.DateDtype(),
+        "DATETIME": "datetime64[ns]",
+        "TIMESTAMP": "datetime64[ns]",
+    }

     for field in schema_fields:
-        column = str(field["name"])
+        # This method doesn't modify ARRAY/REPEATED columns.
         if field["mode"].upper() == "REPEATED":
             continue

+        name = str(field["name"])
         dtype = dtype_map.get(field["type"].upper())
-        if dtype:
-            df[column] = df[column].astype(dtype)
+
+        # Avoid deprecated conversion to timezone-naive dtype by only casting
+        # object dtypes.
+        if dtype and pandas.api.types.is_object_dtype(df[name]):
+            df[name] = df[name].astype(dtype, errors="ignore")
+
+    # Ensure any TIMESTAMP columns are tz-aware.
+    df = pandas_gbq.timestamp.localize_df(df, schema_fields)

     return df
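The `errors="ignore"` cast above is what makes the DATE handling safe. A sketch of the behavior it relies on, assuming the db_dtypes package converts object columns of datetime.date values and raises for dates outside the supported range, so the column stays `object` when any value is unrepresentable:

    import datetime

    import db_dtypes
    import pandas

    in_range = pandas.Series([datetime.date(2003, 1, 4)], dtype="object")
    out_of_range = pandas.Series([datetime.date(1, 1, 1)], dtype="object")

    # Near-epoch dates convert cleanly to the extension dtype.
    print(in_range.astype(db_dtypes.DateDtype(), errors="ignore").dtype)  # dbdate
    # 0001-01-01 is out of range, so the cast is skipped and object is kept.
    print(out_of_range.astype(db_dtypes.DateDtype(), errors="ignore").dtype)  # object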
diff --git a/pandas_gbq/timestamp.py b/pandas_gbq/timestamp.py
index c6bb6d93..66374881 100644
--- a/pandas_gbq/timestamp.py
+++ b/pandas_gbq/timestamp.py
@@ -30,11 +30,6 @@ def localize_df(df, schema_fields):
     pandas.DataFrame
         DataFrame with localized TIMESTAMP columns.
     """
-    if len(df.index) == 0:
-        # If there are no rows, there is nothing to do.
-        # Fix for https://github.com/pydata/pandas-gbq/issues/299
-        return df
-
     for field in schema_fields:
         column = str(field["name"])
         if "mode" in field and field["mode"].upper() == "REPEATED":
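Dropping the empty-DataFrame early return means `localize_df` now also localizes zero-row results, keeping dtypes consistent regardless of row count. The localization it performs for TIMESTAMP columns is roughly the following, a simplified sketch rather than the actual implementation:

    import pandas

    df = pandas.DataFrame(
        {"ts": pandas.Series(["1970-01-01T00:00:00"], dtype="datetime64[ns]")}
    )
    if df["ts"].dt.tz is None:
        # TIMESTAMP values come back as UTC wall times; attach the zone.
        df["ts"] = df["ts"].dt.tz_localize("UTC")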
diff --git a/setup.py b/setup.py
index 2e596cc6..ccee7265 100644
--- a/setup.py
+++ b/setup.py
@@ -28,12 +28,19 @@
     "pandas >=0.24.2",
     "pyarrow >=3.0.0, <7.0dev",
     "pydata-google-auth",
-    "google-api-core >=1.14.0",
-    "google-auth >=1.4.1",
+    # Note: google-api-core and google-auth are also included via transitive
+    # dependency on google-cloud-bigquery, but this library also uses them
+    # directly.
+    "google-api-core >=1.21.0",
+    "google-auth >=1.18.0",
     "google-auth-oauthlib >=0.0.1",
-    # 2.4.* has a bug where waiting for the query can hang indefinitely.
-    # https://github.com/pydata/pandas-gbq/issues/343
-    "google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<4.0.0dev,!=2.4.*",
+    # Require 1.27.* because it has a fix for out-of-bounds timestamps. See:
+    # https://github.com/googleapis/python-bigquery/pull/209 and
+    # https://github.com/googleapis/python-bigquery-pandas/issues/365
+    # Exclude 2.4.* because it has a bug where waiting for the query can hang
+    # indefinitely. https://github.com/pydata/pandas-gbq/issues/343
+    "google-cloud-bigquery >=1.27.2,<4.0.0dev,!=2.4.*",
+    "google-cloud-bigquery-storage >=1.1.0,<3.0.0dev",
 ]
 extras = {
     "tqdm": "tqdm>=4.23.0",
diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt
index 2a500f35..f0c9a4ac 100644
--- a/testing/constraints-3.7.txt
+++ b/testing/constraints-3.7.txt
@@ -6,10 +6,10 @@
 # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
 # Then this file should have foo==1.14.0
 db-dtypes==0.3.1
-google-api-core==1.14.0
-google-auth==1.4.1
+google-api-core==1.21.0
+google-auth==1.18.0
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==1.11.1
+google-cloud-bigquery==1.27.2
 google-cloud-bigquery-storage==1.1.0
 numpy==1.16.6
 pandas==0.24.2
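Per the comment in the constraints file, every dependency is pinned to the floor declared in setup.py so CI exercises the oldest supported combination. A hypothetical runtime spot-check of those floors; the `_check_min` helper is illustrative only and not part of the package:

    import pkg_resources

    def _check_min(package, minimum):
        installed = pkg_resources.get_distribution(package).parsed_version
        return installed >= pkg_resources.parse_version(minimum)

    assert _check_min("google-cloud-bigquery", "1.27.2")
    assert _check_min("google-cloud-bigquery-storage", "1.1.0")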
diff --git a/tests/system/test_gbq.py b/tests/system/test_gbq.py
index 67735c53..ec588a3e 100644
--- a/tests/system/test_gbq.py
+++ b/tests/system/test_gbq.py
@@ -11,7 +11,7 @@
 import pandas
 import pandas.api.types
 import pandas.testing as tm
-from pandas import DataFrame, NaT
+from pandas import DataFrame

 try:
     import pkg_resources  # noqa
@@ -21,7 +21,6 @@
 import pytz

 from pandas_gbq import gbq
-from pandas_gbq.features import FEATURES
 import pandas_gbq.schema
@@ -153,319 +152,6 @@ def setup(self, project, credentials):
         self.gbq_connector = gbq.GbqConnector(project, credentials=credentials)
         self.credentials = credentials

-    def test_should_properly_handle_empty_strings(self, project_id):
-        query = 'SELECT "" AS empty_string'
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="legacy",
-        )
-        tm.assert_frame_equal(df, DataFrame({"empty_string": [""]}))
-
-    def test_should_properly_handle_null_strings(self, project_id):
-        query = "SELECT STRING(NULL) AS null_string"
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="legacy",
-        )
-        tm.assert_frame_equal(df, DataFrame({"null_string": [None]}))
-
-    def test_should_properly_handle_valid_integers(self, project_id):
-        query = "SELECT CAST(3 AS INT64) AS valid_integer"
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="standard",
-        )
-        tm.assert_frame_equal(df, DataFrame({"valid_integer": [3]}, dtype="Int64"))
-
-    def test_should_properly_handle_nullable_integers(self, project_id):
-        query = """SELECT * FROM
-                    UNNEST([1, NULL]) AS nullable_integer
-                """
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="standard",
-            dtypes={"nullable_integer": "Int64"},
-        )
-        tm.assert_frame_equal(
-            df,
-            DataFrame({"nullable_integer": pandas.Series([1, None], dtype="Int64")}),
-        )
-
-    def test_should_properly_handle_valid_longs(self, project_id):
-        query = "SELECT 1 << 62 AS valid_long"
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="standard",
-        )
-        tm.assert_frame_equal(df, DataFrame({"valid_long": [1 << 62]}, dtype="Int64"))
-
-    def test_should_properly_handle_nullable_longs(self, project_id):
-        query = """SELECT * FROM
-                    UNNEST([1 << 62, NULL]) AS nullable_long
-                """
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="standard",
-            dtypes={"nullable_long": "Int64"},
-        )
-        tm.assert_frame_equal(
-            df,
-            DataFrame({"nullable_long": pandas.Series([1 << 62, None], dtype="Int64")}),
-        )
-
-    def test_should_properly_handle_null_integers(self, project_id):
-        query = "SELECT CAST(NULL AS INT64) AS null_integer"
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="standard",
-            dtypes={"null_integer": "Int64"},
-        )
-        tm.assert_frame_equal(
-            df, DataFrame({"null_integer": pandas.Series([None], dtype="Int64")}),
-        )
-
-    def test_should_properly_handle_valid_floats(self, project_id):
-        from math import pi
-
-        query = "SELECT PI() AS valid_float"
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="legacy",
-        )
-        tm.assert_frame_equal(df, DataFrame({"valid_float": [pi]}))
-
-    def test_should_properly_handle_nullable_floats(self, project_id):
-        from math import pi
-
-        query = """SELECT * FROM
-                    (SELECT PI() AS nullable_float),
-                    (SELECT NULL AS nullable_float)"""
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="legacy",
-        )
-        tm.assert_frame_equal(df, DataFrame({"nullable_float": [pi, None]}))
-
-    def test_should_properly_handle_valid_doubles(self, project_id):
-        from math import pi
-
-        query = "SELECT PI() * POW(10, 307) AS valid_double"
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="legacy",
-        )
-        tm.assert_frame_equal(df, DataFrame({"valid_double": [pi * 10 ** 307]}))
-
-    def test_should_properly_handle_nullable_doubles(self, project_id):
-        from math import pi
-
-        query = """SELECT * FROM
-                    (SELECT PI() * POW(10, 307) AS nullable_double),
-                    (SELECT NULL AS nullable_double)"""
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="legacy",
-        )
-        tm.assert_frame_equal(
-            df, DataFrame({"nullable_double": [pi * 10 ** 307, None]})
-        )
-
-    def test_should_properly_handle_null_floats(self, project_id):
-        query = """SELECT null_float
-                    FROM UNNEST(ARRAY<FLOAT64>[NULL, 1.0]) AS null_float
-                """
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="standard",
-        )
-        tm.assert_frame_equal(df, DataFrame({"null_float": [np.nan, 1.0]}))
-
-    def test_should_properly_handle_date(self, project_id):
-        query = "SELECT DATE(2003, 1, 4) AS date_col"
-        df = gbq.read_gbq(query, project_id=project_id, credentials=self.credentials,)
-        expected = DataFrame(
-            {
-                "date_col": pandas.Series(
-                    [datetime.date(2003, 1, 4)], dtype="datetime64[ns]"
-                )
-            },
-        )
-        tm.assert_frame_equal(df, expected)
-
-    def test_should_properly_handle_time(self, project_id):
-        query = (
-            "SELECT TIME_ADD(TIME(3, 14, 15), INTERVAL 926589 MICROSECOND) AS time_col"
-        )
-        df = gbq.read_gbq(query, project_id=project_id, credentials=self.credentials,)
-        expected = DataFrame(
-            {
-                "time_col": pandas.Series(
-                    [datetime.time(3, 14, 15, 926589)], dtype="object"
-                )
-            },
-        )
-        tm.assert_frame_equal(df, expected)
-
-    def test_should_properly_handle_timestamp_unix_epoch(self, project_id):
-        query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") AS unix_epoch'
-        df = gbq.read_gbq(
-            query,
-            project_id=project_id,
-            credentials=self.credentials,
-            dialect="legacy",
-        )
-        expected = DataFrame(
-            {"unix_epoch": ["1970-01-01T00:00:00.000000Z"]}, dtype="datetime64[ns]",
-        )
-        if expected["unix_epoch"].dt.tz is None:
-            expected["unix_epoch"] = expected["unix_epoch"].dt.tz_localize("UTC")
-        tm.assert_frame_equal(df, expected)
["2004-09-15T05:00:00.000000Z"]}, - dtype="datetime64[ns]", - ) - if expected["valid_timestamp"].dt.tz is None: - expected["valid_timestamp"] = expected["valid_timestamp"].dt.tz_localize( - "UTC" - ) - tm.assert_frame_equal(df, expected) - - def test_should_properly_handle_datetime_unix_epoch(self, project_id): - query = 'SELECT DATETIME("1970-01-01 00:00:00") AS unix_epoch' - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - tm.assert_frame_equal( - df, - DataFrame({"unix_epoch": ["1970-01-01T00:00:00"]}, dtype="datetime64[ns]"), - ) - - def test_should_properly_handle_arbitrary_datetime(self, project_id): - query = 'SELECT DATETIME("2004-09-15 05:00:00") AS valid_timestamp' - df = gbq.read_gbq( - query, - project_id=project_id, - credentials=self.credentials, - dialect="legacy", - ) - tm.assert_frame_equal( - df, DataFrame({"valid_timestamp": [np.datetime64("2004-09-15T05:00:00")]}), - ) - - @pytest.mark.parametrize( - "expression, is_expected_dtype", - [ - ("current_date()", pandas.api.types.is_datetime64_ns_dtype), - ("current_timestamp()", pandas.api.types.is_datetime64tz_dtype), - ("current_datetime()", pandas.api.types.is_datetime64_ns_dtype), - ("TRUE", pandas.api.types.is_bool_dtype), - ("FALSE", pandas.api.types.is_bool_dtype), - ], - ) - def test_return_correct_types(self, project_id, expression, is_expected_dtype): - """ - All type checks can be added to this function using additional - parameters, rather than creating additional functions. - We can consolidate the existing functions here in time - - TODO: time doesn't currently parse - ("time(12,30,00)", "