
Commit 8986e3c

mroeschke authored and Pingviinituutti committed
ENH: Support writing timestamps with timezones with to_sql (pandas-dev#22654)
1 parent b26d165 commit 8986e3c

File tree

5 files changed (+113, -15 lines)

doc/source/io.rst  (+30)

@@ -4806,6 +4806,36 @@ default ``Text`` type for string columns:
 Because of this, reading the database table back in does **not** generate
 a categorical.

+.. _io.sql_datetime_data:
+
+Datetime data types
+'''''''''''''''''''
+
+Using SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing
+datetime data that is timezone naive or timezone aware. However, the resulting
+data stored in the database ultimately depends on the supported data type
+for datetime data of the database system being used.
+
+The following table lists supported data types for datetime data for some
+common databases. Other database dialects may have different data types for
+datetime data.
+
+=========== ============================================= ===================
+Database    SQL Datetime Types                            Timezone Support
+=========== ============================================= ===================
+SQLite      ``TEXT``                                      No
+MySQL       ``TIMESTAMP`` or ``DATETIME``                 No
+PostgreSQL  ``TIMESTAMP`` or ``TIMESTAMP WITH TIME ZONE`` Yes
+=========== ============================================= ===================
+
+When writing timezone aware data to databases that do not support timezones,
+the data will be written as timezone naive timestamps that are in local time
+with respect to the timezone.
+
+:func:`~pandas.read_sql_table` is also capable of reading datetime data that is
+timezone aware or naive. When reading ``TIMESTAMP WITH TIME ZONE`` types, pandas
+will convert the data to UTC.
+
 Reading Tables
 ''''''''''''''
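
The behavior documented above is easy to try outside the test suite. A minimal sketch, not part of this commit: it assumes an in-memory SQLite engine and a made-up table name ``tz_demo``. SQLite has no timezone support, so the values come back naive in local time; a PostgreSQL engine would return them as UTC instead.

import pandas as pd
from sqlalchemy import create_engine

# Illustrative engine: SQLite needs no server but has no timezone support,
# so timezone-aware values are stored as naive local timestamps.
engine = create_engine('sqlite:///:memory:')

df = pd.DataFrame({
    'ts': pd.date_range('2018-01-01 09:00', periods=3, tz='US/Pacific'),
})

# With this change, writing a timezone-aware column no longer raises.
df.to_sql('tz_demo', engine, index=False)

# Reading back: SQLite returns naive local timestamps; a backend with
# TIMESTAMP WITH TIME ZONE support (e.g. PostgreSQL) would return UTC.
print(pd.read_sql_table('tz_demo', engine)['ts'])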

doc/source/whatsnew/v0.24.0.txt  (+4)

@@ -222,6 +222,7 @@ Other Enhancements
 - :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)
 - :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`).
   The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`).
+- :meth:`DataFrame.to_sql` now supports writing ``TIMESTAMP WITH TIME ZONE`` types for supported databases. For databases that don't support timezones, datetime data will be stored as timezone unaware local timestamps. See the :ref:`io.sql_datetime_data` section for implications (:issue:`9086`).
 - :func:`to_timedelta` now supports ISO-formatted timedelta strings (:issue:`21877`)
 - :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)
 - :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. (:issue:`21358`)

@@ -1246,6 +1247,9 @@ MultiIndex
 I/O
 ^^^

+- Bug in :meth:`to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`)
+- Bug in :meth:`to_sql` where a naive DatetimeIndex would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`)
+
 .. _whatsnew_0240.bug_fixes.nan_with_str_dtype:

 Proper handling of `np.NaN` in a string data-typed column with the Python engine

pandas/core/generic.py  (+9)

@@ -2397,6 +2397,15 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
         --------
         pandas.read_sql : read a DataFrame from a table

+        Notes
+        -----
+        Timezone aware datetime columns will be written as
+        ``Timestamp with timezone`` type with SQLAlchemy if supported by the
+        database. Otherwise, the datetimes will be stored as timezone unaware
+        timestamps local to the original timezone.
+
+        .. versionadded:: 0.24.0
+
         References
         ----------
         .. [1] http://docs.sqlalchemy.org
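
The new Notes say the created column type depends on what the target database supports. A small sketch of checking that, not part of this commit: it assumes an in-memory SQLite engine and a made-up table name ``notes_demo``, and uses SQLAlchemy's inspector to report the column type that was actually created. On PostgreSQL the same inspection would show a ``TIMESTAMP WITH TIME ZONE`` column.

import pandas as pd
from sqlalchemy import create_engine, inspect

engine = create_engine('sqlite:///:memory:')  # illustrative backend

df = pd.DataFrame({
    'when': pd.date_range('2018-06-01', periods=2, tz='Europe/Berlin'),
})
df.to_sql('notes_demo', engine, index=False)

# Ask SQLAlchemy which column type was created. SQLite has no timezone-aware
# datetime type, so the values were stored as naive local timestamps instead.
columns = inspect(engine).get_columns('notes_demo')
print({c['name']: c['type'] for c in columns})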

pandas/io/sql.py  (+26, -14)

@@ -592,12 +592,17 @@ def insert_data(self):
         data_list = [None] * ncols
         blocks = temp._data.blocks

-        for i in range(len(blocks)):
-            b = blocks[i]
+        for b in blocks:
             if b.is_datetime:
-                # convert to microsecond resolution so this yields
-                # datetime.datetime
-                d = b.values.astype('M8[us]').astype(object)
+                # return datetime.datetime objects
+                if b.is_datetimetz:
+                    # GH 9086: Ensure we return datetimes with timezone info
+                    # Need to return 2-D data; DatetimeIndex is 1D
+                    d = b.values.to_pydatetime()
+                    d = np.expand_dims(d, axis=0)
+                else:
+                    # convert to microsecond resolution for datetime.datetime
+                    d = b.values.astype('M8[us]').astype(object)
             else:
                 d = np.array(b.get_values(), dtype=object)

@@ -612,7 +617,7 @@ def insert_data(self):
         return column_names, data_list

     def _execute_insert(self, conn, keys, data_iter):
-        data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
+        data = [dict(zip(keys, row)) for row in data_iter]
         conn.execute(self.insert_statement(), data)

     def insert(self, chunksize=None):

@@ -741,8 +746,9 @@ def _get_column_names_and_types(self, dtype_mapper):
     def _create_table_setup(self):
         from sqlalchemy import Table, Column, PrimaryKeyConstraint

-        column_names_and_types = \
-            self._get_column_names_and_types(self._sqlalchemy_type)
+        column_names_and_types = self._get_column_names_and_types(
+            self._sqlalchemy_type
+        )

         columns = [Column(name, typ, index=is_index)
                    for name, typ, is_index in column_names_and_types]

@@ -841,14 +847,19 @@ def _sqlalchemy_type(self, col):

         from sqlalchemy.types import (BigInteger, Integer, Float,
                                       Text, Boolean,
-                                      DateTime, Date, Time)
+                                      DateTime, Date, Time, TIMESTAMP)

         if col_type == 'datetime64' or col_type == 'datetime':
+            # GH 9086: TIMESTAMP is the suggested type if the column contains
+            # timezone information
             try:
-                tz = col.tzinfo  # noqa
-                return DateTime(timezone=True)
+                if col.dt.tz is not None:
+                    return TIMESTAMP(timezone=True)
             except AttributeError:
-                return DateTime
+                # The column is actually a DatetimeIndex
+                if col.tz is not None:
+                    return TIMESTAMP(timezone=True)
+            return DateTime
         if col_type == 'timedelta64':
             warnings.warn("the 'timedelta' type is not supported, and will be "
                           "written as integer values (ns frequency) to the "

@@ -1275,8 +1286,9 @@ def _create_table_setup(self):
         structure of a DataFrame. The first entry will be a CREATE TABLE
         statement while the rest will be CREATE INDEX statements.
         """
-        column_names_and_types = \
-            self._get_column_names_and_types(self._sql_type_name)
+        column_names_and_types = self._get_column_names_and_types(
+            self._sql_type_name
+        )

         pat = re.compile(r'\s+')
         column_names = [col_name for col_name, _, _ in column_names_and_types]
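
The heart of the _sqlalchemy_type change above is a dispatch on whether the column carries a timezone: a Series exposes it through the ``.dt`` accessor, while a DatetimeIndex has no ``.dt`` and exposes ``.tz`` directly, which is what the ``except AttributeError`` branch handles. A standalone sketch of that decision; ``pick_datetime_type`` is a made-up name for illustration, not a pandas function.

import pandas as pd
from sqlalchemy.types import DateTime, TIMESTAMP

def pick_datetime_type(col):
    # Mirrors the logic added to _sqlalchemy_type: timezone-aware columns
    # map to TIMESTAMP(timezone=True), everything else to plain DateTime.
    try:
        # A Series of datetimes exposes its timezone via the .dt accessor.
        if col.dt.tz is not None:
            return TIMESTAMP(timezone=True)
    except AttributeError:
        # A DatetimeIndex has no .dt accessor; its timezone is .tz.
        if col.tz is not None:
            return TIMESTAMP(timezone=True)
    return DateTime

aware_series = pd.Series(pd.date_range('2018-01-01', periods=2, tz='UTC'))
naive_index = pd.date_range('2018-01-01', periods=2)

print(repr(pick_datetime_type(aware_series)))  # a timezone-aware TIMESTAMP type
print(repr(pick_datetime_type(naive_index)))   # the plain DateTime class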

pandas/tests/io/test_sql.py  (+44, -1)

@@ -961,7 +961,8 @@ def test_sqlalchemy_type_mapping(self):
                                             utc=True)})
         db = sql.SQLDatabase(self.conn)
         table = sql.SQLTable("test_type", db, frame=df)
-        assert isinstance(table.table.c['time'].type, sqltypes.DateTime)
+        # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones
+        assert isinstance(table.table.c['time'].type, sqltypes.TIMESTAMP)

     def test_database_uri_string(self):

@@ -1361,9 +1362,51 @@ def check(col):
         df = sql.read_sql_table("types_test_data", self.conn)
         check(df.DateColWithTz)

+    def test_datetime_with_timezone_roundtrip(self):
+        # GH 9086
+        # Write datetimetz data to a db and read it back
+        # For dbs that support timestamps with timezones, should get back UTC
+        # otherwise naive data should be returned
+        expected = DataFrame({'A': date_range(
+            '2013-01-01 09:00:00', periods=3, tz='US/Pacific'
+        )})
+        expected.to_sql('test_datetime_tz', self.conn, index=False)
+
+        if self.flavor == 'postgresql':
+            # SQLAlchemy "timezones" (i.e. offsets) are coerced to UTC
+            expected['A'] = expected['A'].dt.tz_convert('UTC')
+        else:
+            # Otherwise, timestamps are returned as local, naive
+            expected['A'] = expected['A'].dt.tz_localize(None)
+
+        result = sql.read_sql_table('test_datetime_tz', self.conn)
+        tm.assert_frame_equal(result, expected)
+
+        result = sql.read_sql_query(
+            'SELECT * FROM test_datetime_tz', self.conn
+        )
+        if self.flavor == 'sqlite':
+            # read_sql_query does not return datetime type like read_sql_table
+            assert isinstance(result.loc[0, 'A'], string_types)
+            result['A'] = to_datetime(result['A'])
+        tm.assert_frame_equal(result, expected)
+
+    def test_naive_datetimeindex_roundtrip(self):
+        # GH 23510
+        # Ensure that a naive DatetimeIndex isn't converted to UTC
+        dates = date_range('2018-01-01', periods=5, freq='6H')
+        expected = DataFrame({'nums': range(5)}, index=dates)
+        expected.to_sql('foo_table', self.conn, index_label='info_date')
+        result = sql.read_sql_table('foo_table', self.conn,
+                                    index_col='info_date')
+        # result index will gain a name from a set_index operation; expected won't
+        tm.assert_frame_equal(result, expected, check_names=False)
+
     def test_date_parsing(self):
         # No Parsing
         df = sql.read_sql_table("types_test_data", self.conn)
+        expected_type = object if self.flavor == 'sqlite' else np.datetime64
+        assert issubclass(df.DateCol.dtype.type, expected_type)

         df = sql.read_sql_table("types_test_data", self.conn,
                                 parse_dates=['DateCol'])
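
The sqlite branch of test_datetime_with_timezone_roundtrip points at a practical difference: read_sql_table uses the reflected column types and hands back datetimes, while read_sql_query on SQLite returns the stored strings as-is unless told to parse them. A short sketch of how a caller would cope, not part of this commit; the in-memory engine and the table name ``query_demo`` are illustrative.

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('sqlite:///:memory:')  # illustrative backend

df = pd.DataFrame({
    'A': pd.date_range('2013-01-01 09:00', periods=3, tz='US/Pacific'),
})
df.to_sql('query_demo', engine, index=False)

# read_sql_table consults the reflected schema, so 'A' comes back as a
# datetime column (naive local time on SQLite).
via_table = pd.read_sql_table('query_demo', engine)

# read_sql_query only sees the raw result set; on SQLite the stored values
# are text, so ask pandas to parse them explicitly.
via_query = pd.read_sql_query('SELECT * FROM query_demo', engine,
                              parse_dates=['A'])

print(via_table.dtypes)
print(via_query.dtypes)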
