From 933d47067b23f62c25ba739a3a4722f3f28dae07 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 6 Dec 2021 14:13:48 -0600 Subject: [PATCH 1/3] fix: read out-of-bounds DATETIME values such as `0001-01-01 00:00:00` deps: require google-cloud-bigquery 1.26.1 or later --- ci/requirements-3.7-0.24.2.conda | 5 ++-- pandas_gbq/load.py | 9 ++++-- setup.py | 9 +++--- testing/constraints-3.7.txt | 6 ++-- tests/system/test_to_gbq.py | 48 ++++++++++++++++++++++++++++++++ 5 files changed, 66 insertions(+), 11 deletions(-) diff --git a/ci/requirements-3.7-0.24.2.conda b/ci/requirements-3.7-0.24.2.conda index 82f4e7b9..430c742e 100644 --- a/ci/requirements-3.7-0.24.2.conda +++ b/ci/requirements-3.7-0.24.2.conda @@ -1,10 +1,11 @@ codecov coverage -db-dtypes==0.3.0 +db-dtypes==0.3.1 fastavro flake8 numpy==1.16.6 -google-cloud-bigquery==1.11.1 +google-cloud-bigquery==1.26.1 +google-cloud-bigquery-storage==1.1.0 pyarrow==3.0.0 pydata-google-auth pytest diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py index 5422402e..943c4f07 100644 --- a/pandas_gbq/load.py +++ b/pandas_gbq/load.py @@ -94,8 +94,13 @@ def cast_dataframe_for_parquet( # .astype() with DateDtype. With .astype(), I get the error: # # TypeError: Cannot interpret '' as a data type - cast_column = pandas.Series( - dataframe[column_name], dtype=db_dtypes.DateDtype() + cast_column = dataframe[column_name].astype( + dtype=db_dtypes.DateDtype(), + # Return the original column if there was an error converting + # to the dtype, such as is there is a date outside the + # supported range. 
+ # https://github.com/googleapis/python-bigquery-pandas/issues/441 + errors="ignore", ) elif column_type in {"NUMERIC", "DECIMAL", "BIGNUMERIC", "BIGDECIMAL"}: cast_column = dataframe[column_name].map(decimal.Decimal) diff --git a/setup.py b/setup.py index 28c81eee..a5b645e7 100644 --- a/setup.py +++ b/setup.py @@ -23,16 +23,17 @@ release_status = "Development Status :: 4 - Beta" dependencies = [ "setuptools", - "db-dtypes >=0.3.0,<2.0.0", - "numpy>=1.16.6", - "pandas>=0.24.2", + "db-dtypes >=0.3.1,<2.0.0", + "numpy >=1.16.6", + "pandas >=0.24.2", "pyarrow >=3.0.0, <7.0dev", "pydata-google-auth", "google-auth", "google-auth-oauthlib", # 2.4.* has a bug where waiting for the query can hang indefinitely. # https://github.com/pydata/pandas-gbq/issues/343 - "google-cloud-bigquery[bqstorage,pandas]>=1.11.1,<3.0.0dev,!=2.4.*", + "google-cloud-bigquery >=1.26.1,<3.0.0dev,!=2.4.*", + "google-cloud-bigquery-storage >=1.1.0,<3.0.0dev", ] extras = { "tqdm": "tqdm>=4.23.0", diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index 7920656a..a5b04f0d 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -5,10 +5,10 @@ # # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", # Then this file should have foo==1.14.0 -db-dtypes==0.3.0 -google-auth==1.4.1 +db-dtypes==0.3.1 +google-auth==1.18.0 google-auth-oauthlib==0.0.1 -google-cloud-bigquery==1.11.1 +google-cloud-bigquery==1.26.1 google-cloud-bigquery-storage==1.1.0 numpy==1.16.6 pandas==0.24.2 diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index 4421f3be..046a2a86 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -188,6 +188,54 @@ def test_series_round_trip( {"name": "num_col", "type": "NUMERIC"}, ], ), + pytest.param( + *DataFrameRoundTripTestCase( + input_df=pandas.DataFrame( + { + "row_num": [1, 2, 3], + # DATE values outside the pandas range for timestamp + # aren't supported by the db-dtypes package. 
+ # https://github.com/googleapis/python-bigquery-pandas/issues/441 + "date_col": [ + datetime.date(1, 1, 1), + datetime.date(1970, 1, 1), + datetime.date(9999, 12, 31), + ], + # DATETIME values outside of the range for pandas timestamp + # require `date_as_object` parameter in + # google-cloud-bigquery versions 1.x and 2.x. + # https://github.com/googleapis/python-bigquery-pandas/issues/365 + "datetime_col": [ + datetime.datetime(1, 1, 1), + datetime.datetime(1970, 1, 1), + datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + ], + "timestamp_col": [ + datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc), + datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc), + datetime.datetime( + 9999, + 12, + 31, + 23, + 59, + 59, + 999999, + tzinfo=datetime.timezone.utc, + ), + ], + }, + columns=["row_num", "date_col", "datetime_col", "timestamp_col"], + ), + table_schema=[ + {"name": "row_num", "type": "INTEGER"}, + {"name": "date_col", "type": "DATE"}, + {"name": "datetime_col", "type": "DATETIME"}, + {"name": "timestamp_col", "type": "TIMESTAMP"}, + ], + ), + id="issue365-extreme-datetimes", + ), ] From 2a76982df7cff48e58d8b1ad7eae19477665cb76 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 6 Dec 2021 16:28:43 -0600 Subject: [PATCH 2/3] revert tests for read_gbq fix which isn't yet resolved --- ci/requirements-3.7-0.24.2.conda | 3 +-- setup.py | 3 +-- testing/constraints-3.7.txt | 4 +-- tests/system/test_to_gbq.py | 42 ++++++++++++++++---------------- 4 files changed, 25 insertions(+), 27 deletions(-) diff --git a/ci/requirements-3.7-0.24.2.conda b/ci/requirements-3.7-0.24.2.conda index 430c742e..e0323d92 100644 --- a/ci/requirements-3.7-0.24.2.conda +++ b/ci/requirements-3.7-0.24.2.conda @@ -4,8 +4,7 @@ db-dtypes==0.3.1 fastavro flake8 numpy==1.16.6 -google-cloud-bigquery==1.26.1 -google-cloud-bigquery-storage==1.1.0 +google-cloud-bigquery==1.11.1 pyarrow==3.0.0 pydata-google-auth pytest diff --git a/setup.py b/setup.py index 
a5b645e7..283e5ea8 100644 --- a/setup.py +++ b/setup.py @@ -32,8 +32,7 @@ "google-auth-oauthlib", # 2.4.* has a bug where waiting for the query can hang indefinitely. # https://github.com/pydata/pandas-gbq/issues/343 - "google-cloud-bigquery >=1.26.1,<3.0.0dev,!=2.4.*", - "google-cloud-bigquery-storage >=1.1.0,<3.0.0dev", + "google-cloud-bigquery[bqstorage,pandas] >=1.11.1,<3.0.0dev,!=2.4.*", ] extras = { "tqdm": "tqdm>=4.23.0", diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt index a5b04f0d..6c3080dc 100644 --- a/testing/constraints-3.7.txt +++ b/testing/constraints-3.7.txt @@ -6,9 +6,9 @@ # e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", # Then this file should have foo==1.14.0 db-dtypes==0.3.1 -google-auth==1.18.0 +google-auth==1.4.1 google-auth-oauthlib==0.0.1 -google-cloud-bigquery==1.26.1 +google-cloud-bigquery==1.11.1 google-cloud-bigquery-storage==1.1.0 numpy==1.16.6 pandas==0.24.2 diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py index 046a2a86..f7184024 100644 --- a/tests/system/test_to_gbq.py +++ b/tests/system/test_to_gbq.py @@ -201,29 +201,29 @@ def test_series_round_trip( datetime.date(1970, 1, 1), datetime.date(9999, 12, 31), ], - # DATETIME values outside of the range for pandas timestamp - # require `date_as_object` parameter in + # TODO: DATETIME/TIMESTAMP values outside of the range for + # pandas timestamp require `date_as_object` parameter in # google-cloud-bigquery versions 1.x and 2.x. 
# https://github.com/googleapis/python-bigquery-pandas/issues/365 - "datetime_col": [ - datetime.datetime(1, 1, 1), - datetime.datetime(1970, 1, 1), - datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), - ], - "timestamp_col": [ - datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc), - datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc), - datetime.datetime( - 9999, - 12, - 31, - 23, - 59, - 59, - 999999, - tzinfo=datetime.timezone.utc, - ), - ], + # "datetime_col": [ + # datetime.datetime(1, 1, 1), + # datetime.datetime(1970, 1, 1), + # datetime.datetime(9999, 12, 31, 23, 59, 59, 999999), + # ], + # "timestamp_col": [ + # datetime.datetime(1, 1, 1, tzinfo=datetime.timezone.utc), + # datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc), + # datetime.datetime( + # 9999, + # 12, + # 31, + # 23, + # 59, + # 59, + # 999999, + # tzinfo=datetime.timezone.utc, + # ), + # ], }, columns=["row_num", "date_col", "datetime_col", "timestamp_col"], ), From bc561b3c43c0794a182b33c44e3e85c65cb42123 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 7 Dec 2021 13:51:29 -0600 Subject: [PATCH 3/3] remove out-of-date comment --- pandas_gbq/load.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas_gbq/load.py b/pandas_gbq/load.py index 943c4f07..315ad5cd 100644 --- a/pandas_gbq/load.py +++ b/pandas_gbq/load.py @@ -90,10 +90,6 @@ def cast_dataframe_for_parquet( # Use extension dtype first so that it uses the correct equality operator. and db_dtypes.DateDtype() != dataframe[column_name].dtype ): - # Construct converted column manually, because I can't use - # .astype() with DateDtype. With .astype(), I get the error: - # - # TypeError: Cannot interpret '' as a data type cast_column = dataframe[column_name].astype( dtype=db_dtypes.DateDtype(), # Return the original column if there was an error converting