Commit b0445ca

ENH: Use tz-aware dtype for timestamp.
I couldn't figure out how *not* to get a tz-aware dtype in 0.24.x versions, and I wanted a tz-aware dtype anyway for TIMESTAMP, so this makes it official.
1 parent 8114ede commit b0445ca

File tree

4 files changed: +73 -32 lines changed

  docs/source/changelog.rst
  docs/source/reading.rst
  pandas_gbq/gbq.py
  tests/system/test_gbq.py

docs/source/changelog.rst (+11 -3)

@@ -6,8 +6,6 @@ Changelog
 0.10.0 / TBD
 ------------
 
-- This fixes a bug where pandas-gbq could not upload an empty database. (:issue:`237`)
-
 Dependency updates
 ~~~~~~~~~~~~~~~~~~
 
@@ -22,11 +20,21 @@ Internal changes
 
 Enhancements
 ~~~~~~~~~~~~
+
 - Allow ``table_schema`` in :func:`to_gbq` to contain only a subset of columns,
-  with the rest being populated using the DataFrame dtypes (:issue:`218`)
+  with the rest being populated using the DataFrame dtypes (:issue:`218`)
   (contributed by @johnpaton)
 - Read ``project_id`` in :func:`to_gbq` from provided ``credentials`` if
   available (contributed by @daureg)
+- ``read_gbq`` uses the timezone-aware ``DatetimeTZDtype(unit='ns',
+  tz='UTC')`` dtype for BigQuery ``TIMESTAMP`` columns. (:issue:`263`)
+
+Bug fixes
+~~~~~~~~~
+
+- Fix a bug where pandas-gbq could not upload an empty database.
+  (:issue:`237`)
+
 
 .. _changelog-0.9.0:
 
docs/source/reading.rst (+47 -17)

@@ -9,21 +9,32 @@ Suppose you want to load all data from an existing BigQuery table
 
 .. code-block:: python
 
-   # Insert your BigQuery Project ID Here
-   # Can be found in the Google web console
+   import pandas_gbq
+
+   # TODO: Set your BigQuery Project ID.
    projectid = "xxxxxxxx"
 
-   data_frame = read_gbq('SELECT * FROM test_dataset.test_table', projectid)
+   data_frame = pandas_gbq.read_gbq(
+       'SELECT * FROM `test_dataset.test_table`',
+       project_id=projectid)
+
+.. note::
 
+   A project ID is sometimes optional if it can be inferred during
+   authentication, but it is required when authenticating with user
+   credentials. You can find your project ID in the `Google Cloud console
+   <https://console.cloud.google.com>`__.
 
 You can define which column from BigQuery to use as an index in the
 destination DataFrame as well as a preferred column order as follows:
 
 .. code-block:: python
 
-   data_frame = read_gbq('SELECT * FROM test_dataset.test_table',
-                         index_col='index_column_name',
-                         col_order=['col1', 'col2', 'col3'], projectid)
+   data_frame = pandas_gbq.read_gbq(
+       'SELECT * FROM `test_dataset.test_table`',
+       project_id=projectid,
+       index_col='index_column_name',
+       col_order=['col1', 'col2', 'col3'])
 
 
 You can specify the query config as parameter to use additional options of
@@ -37,20 +48,39 @@ your job. For more information about query configuration parameters see `here
        "useQueryCache": False
      }
    }
-   data_frame = read_gbq('SELECT * FROM test_dataset.test_table',
-                         configuration=configuration, projectid)
+   data_frame = read_gbq(
+       'SELECT * FROM `test_dataset.test_table`',
+       project_id=projectid,
+       configuration=configuration)
 
 
-.. note::
+The ``dialect`` argument can be used to indicate whether to use
+BigQuery's ``'legacy'`` SQL or BigQuery's ``'standard'`` SQL (beta). The
+default value is ``'standard'`` For more information on BigQuery's standard
+SQL, see `BigQuery SQL Reference
+<https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__
 
-   You can find your project id in the `Google developers console
-   <https://console.developers.google.com>`__.
+.. code-block:: python
 
+   data_frame = pandas_gbq.read_gbq(
+       'SELECT * FROM [test_dataset.test_table]',
+       project_id=projectid,
+       dialect='legacy')
 
-.. note::
 
-   The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL
-   or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``, though this will change
-   in a subsequent release to ``'standard'``. For more information
-   on BigQuery's standard SQL, see `BigQuery SQL Reference
-   <https://cloud.google.com/bigquery/sql-reference/>`__
+.. _reading-dtypes:
+
+Inferring the DataFrame's dtypes
+--------------------------------
+
+The :func:`~pandas_gbq.read_gbq` method infers the pandas dtype for each column, based on the BigQuery table schema.
+
+================== ======================================
+BigQuery Data Type dtype
+================== ======================================
+FLOAT              float
+TIMESTAMP          DatetimeTZDtype(unit='ns', tz='UTC')
+DATETIME           datetime64[ns]
+TIME               datetime64[ns]
+DATE               datetime64[ns]
+================== ======================================
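
As a quick, hedged check of the TIMESTAMP row in the new dtype table (not taken from the commit), the dtype spelled out in the docs compares equal to pandas's ``DatetimeTZDtype``:

    import pandas

    # The dtype listed for TIMESTAMP in the table above.
    ts_dtype = pandas.DatetimeTZDtype(unit="ns", tz="UTC")

    # A timezone-aware series carries exactly this dtype.
    series = pandas.Series(pandas.to_datetime(["2019-01-28"]).tz_localize("UTC"))
    print(series.dtype == ts_dtype)  # True
    print(str(series.dtype))         # datetime64[ns, UTC]
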

pandas_gbq/gbq.py (+14 -11)

@@ -644,21 +644,24 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
 
 
 def _bqschema_to_nullsafe_dtypes(schema_fields):
-    # Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
-    # default dtype choice.
-    #
-    # See:
-    # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
-    # #missing-data-casting-rules-and-indexing
+    """Specify explicit dtypes based on BigQuery schema.
+
+    This function only specifies a dtype when the dtype allows nulls.
+    Otherwise, use pandas's default dtype choice.
+
+    See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
+    #missing-data-casting-rules-and-indexing
+    """
+    import pandas
+
+    # If you update this mapping, also update the table at
+    # `docs/source/reading.rst`.
     dtype_map = {
         "FLOAT": np.dtype(float),
-        # Even though TIMESTAMPs are timezone-aware in BigQuery, pandas doesn't
-        # support datetime64[ns, UTC] as dtype in DataFrame constructors. See:
-        # https://github.com/pandas-dev/pandas/issues/12513
-        "TIMESTAMP": "datetime64[ns]",
+        "TIMESTAMP": pandas.DatetimeTZDtype(tz="UTC"),
+        "DATETIME": "datetime64[ns]",
         "TIME": "datetime64[ns]",
         "DATE": "datetime64[ns]",
-        "DATETIME": "datetime64[ns]",
     }
 
     dtypes = {}
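
The part of ``_bqschema_to_nullsafe_dtypes`` that consumes ``dtype_map`` lies outside this hunk, so the following is only a plausible sketch of how such a mapping gets applied to BigQuery schema fields; the ``schema_fields`` sample and the comprehension are assumptions, not the function's actual code:

    import numpy as np
    import pandas

    dtype_map = {
        "FLOAT": np.dtype(float),
        "TIMESTAMP": pandas.DatetimeTZDtype(tz="UTC"),
        "DATETIME": "datetime64[ns]",
        "TIME": "datetime64[ns]",
        "DATE": "datetime64[ns]",
    }

    # Hypothetical schema fields, shaped like a BigQuery table description.
    schema_fields = [
        {"name": "created_at", "type": "TIMESTAMP"},
        {"name": "amount", "type": "FLOAT"},
        {"name": "label", "type": "STRING"},
    ]

    # Only columns whose dtype can represent nulls get an explicit dtype;
    # everything else falls back to pandas's default inference.
    dtypes = {
        field["name"]: dtype_map[field["type"]]
        for field in schema_fields
        if field["type"] in dtype_map
    }
    print(dtypes)  # {'created_at': datetime64[ns, UTC], 'amount': dtype('float64')}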

tests/system/test_gbq.py (+1 -1)

@@ -368,7 +368,7 @@ def test_should_properly_handle_arbitrary_datetime(self, project_id):
     "expression, is_expected_dtype",
     [
         ("current_date()", pandas.api.types.is_datetime64_ns_dtype),
-        ("current_timestamp()", pandas.api.types.is_datetime64_ns_dtype),
+        ("current_timestamp()", pandas.api.types.is_datetime64tz_dtype),
         ("current_datetime()", pandas.api.types.is_datetime64_ns_dtype),
         ("TRUE", pandas.api.types.is_bool_dtype),
         ("FALSE", pandas.api.types.is_bool_dtype),
