From 5cc35f4fd7af9fa9d3579f5e300ef400cea42d12 Mon Sep 17 00:00:00 2001 From: Dan Birken Date: Mon, 23 Mar 2015 17:35:38 -0700 Subject: [PATCH] BUG: Fix error when reading postgres table with timezone #7139 `read_sql_table()` will break if it reads a table with a `timestamp with time zone` column if individual rows within that column have different time zones. This is very common due to daylight savings time. Pandas right now does not have good support for a Series containing datetimes with different time zones (hence this bug). So this change simply converts a `timestamp with time zone` column into UTC during import, which pandas has great support for. --- doc/source/whatsnew/v0.16.1.txt | 2 +- pandas/io/sql.py | 18 ++++++-- pandas/io/tests/test_sql.py | 77 ++++++++++++++++++++++++++++----- 3 files changed, 82 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index d130879b85475..54ba2ac6586d0 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -52,7 +52,7 @@ Bug Fixes - Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`) - +- Bug in ``read_sql_table`` error when reading postgres table with timezone (:issue:`7139`) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 117d7b4a9ceaa..7c70b4b1df492 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -83,14 +83,14 @@ def _handle_date_column(col, format=None): return to_datetime(col, **format) else: if format in ['D', 's', 'ms', 'us', 'ns']: - return to_datetime(col, coerce=True, unit=format) + return to_datetime(col, coerce=True, unit=format, utc=True) elif (issubclass(col.dtype.type, np.floating) or issubclass(col.dtype.type, np.integer)): # parse dates as timestamp format = 's' if format is None else format - return to_datetime(col, coerce=True, unit=format) + return to_datetime(col, coerce=True, unit=format, utc=True) else: - return 
to_datetime(col, coerce=True, format=format) + return to_datetime(col, coerce=True, format=format, utc=True) def _parse_date_columns(data_frame, parse_dates): @@ -318,6 +318,10 @@ def read_sql_table(table_name, con, schema=None, index_col=None, ------- DataFrame + Notes + ----- + Any datetime values with time zone information will be converted to UTC + See also -------- read_sql_query : Read SQL query into a DataFrame. @@ -390,6 +394,11 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None, ------- DataFrame + Notes + ----- + Any datetime values with time zone information parsed via the `parse_dates` + parameter will be converted to UTC + See also -------- read_sql_table : Read SQL database table into a DataFrame @@ -451,7 +460,8 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, This function is a convenience wrapper around ``read_sql_table`` and ``read_sql_query`` (and for backward compatibility) and will delegate to the specific function depending on the provided input (database - table name or sql query). + table name or sql query). The delegated function might have more specific + notes about its functionality not listed here. 
See also -------- diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 2db6f1e104770..ac266dd77c984 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -29,7 +29,7 @@ from datetime import datetime, date, time from pandas import DataFrame, Series, Index, MultiIndex, isnull, concat -from pandas import date_range, to_datetime, to_timedelta +from pandas import date_range, to_datetime, to_timedelta, Timestamp import pandas.compat as compat from pandas.compat import StringIO, range, lrange, string_types from pandas.core.datetools import format as date_format @@ -100,6 +100,7 @@ 'postgresql': """CREATE TABLE types_test_data ( "TextCol" TEXT, "DateCol" TIMESTAMP, + "DateColWithTz" TIMESTAMP WITH TIME ZONE, "IntDateCol" INTEGER, "FloatCol" DOUBLE PRECISION, "IntCol" INTEGER, @@ -109,18 +110,36 @@ )""" }, 'insert_test_types': { - 'sqlite': """ + 'sqlite': { + 'query': """ INSERT INTO types_test_data VALUES(?, ?, ?, ?, ?, ?, ?, ?) """, - 'mysql': """ + 'fields': ( + 'TextCol', 'DateCol', 'IntDateCol', 'FloatCol', + 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' + ) + }, + 'mysql': { + 'query': """ INSERT INTO types_test_data VALUES("%s", %s, %s, %s, %s, %s, %s, %s) """, - 'postgresql': """ + 'fields': ( + 'TextCol', 'DateCol', 'IntDateCol', 'FloatCol', + 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' + ) + }, + 'postgresql': { + 'query': """ INSERT INTO types_test_data - VALUES(%s, %s, %s, %s, %s, %s, %s, %s) - """ + VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s) + """, + 'fields': ( + 'TextCol', 'DateCol', 'DateColWithTz', 'IntDateCol', 'FloatCol', + 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' + ) + }, }, 'read_parameters': { 'sqlite': "SELECT * FROM iris WHERE Name=? 
AND SepalLength=?", @@ -218,11 +237,36 @@ def _load_raw_sql(self): self._get_exec().execute(SQL_STRINGS['create_test_types'][self.flavor]) ins = SQL_STRINGS['insert_test_types'][self.flavor] - data = [( - 'first', '2000-01-03 00:00:00', 535852800, 10.10, 1, False, 1, False), - ('first', '2000-01-04 00:00:00', 1356998400, 10.10, 1, False, None, None)] + data = [ + { + 'TextCol': 'first', + 'DateCol': '2000-01-03 00:00:00', + 'DateColWithTz': '2000-01-01 00:00:00-08:00', + 'IntDateCol': 535852800, + 'FloatCol': 10.10, + 'IntCol': 1, + 'BoolCol': False, + 'IntColWithNull': 1, + 'BoolColWithNull': False, + }, + { + 'TextCol': 'first', + 'DateCol': '2000-01-04 00:00:00', + 'DateColWithTz': '2000-06-01 00:00:00-07:00', + 'IntDateCol': 1356998400, + 'FloatCol': 10.10, + 'IntCol': 1, + 'BoolCol': False, + 'IntColWithNull': None, + 'BoolColWithNull': None, + }, + ] + for d in data: - self._get_exec().execute(ins, d) + self._get_exec().execute( + ins['query'], + [d[field] for field in ins['fields']] + ) def _count_rows(self, table_name): result = self._get_exec().execute( @@ -1512,6 +1556,19 @@ def test_schema_support(self): res2 = pdsql.read_table('test_schema_other2') tm.assert_frame_equal(res1, res2) + def test_datetime_with_time_zone(self): + # Test to see if we read the date column with timezones that + # the timezone information is converted to utc and into a + # np.datetime64 (GH #7139) + df = sql.read_sql_table("types_test_data", self.conn) + self.assertTrue(issubclass(df.DateColWithTz.dtype.type, np.datetime64), + "DateColWithTz loaded with incorrect type") + + # "2000-01-01 00:00:00-08:00" should convert to "2000-01-01 08:00:00" + self.assertEqual(df.DateColWithTz[0], Timestamp('2000-01-01 08:00:00')) + + # "2000-06-01 00:00:00-07:00" should convert to "2000-06-01 07:00:00" + self.assertEqual(df.DateColWithTz[1], Timestamp('2000-06-01 07:00:00')) #------------------------------------------------------------------------------ #--- Test Sqlite / MySQL fallback