diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 21d54e2514f8b..a4597e61971a1 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -392,6 +392,7 @@ I/O - Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`) - Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`) - Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`) +- Bug in :meth:`read_parquet` with fixed offset timezones. String representation of timezones was not recognized (:issue:`35997`, :issue:`36004`) Plotting ^^^^^^^^ diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index b82291a71057e..3deabc57ec522 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,4 +1,4 @@ -from datetime import timezone +from datetime import timedelta, timezone from cpython.datetime cimport datetime, timedelta, tzinfo @@ -102,6 +102,14 @@ cpdef inline tzinfo maybe_get_tz(object tz): # On Python 3 on Windows, the filename is not always set correctly. if isinstance(tz, _dateutil_tzfile) and '.tar.gz' in tz._filename: tz._filename = zone + elif tz[0] in {'-', '+'}: + hours = int(tz[0:3]) + minutes = int(tz[0] + tz[4:6]) + tz = timezone(timedelta(hours=hours, minutes=minutes)) + elif tz[0:4] in {'UTC-', 'UTC+'}: + hours = int(tz[3:6]) + minutes = int(tz[3] + tz[7:9]) + tz = timezone(timedelta(hours=hours, minutes=minutes)) else: tz = pytz.timezone(tz) elif is_integer_object(tz): diff --git a/pandas/conftest.py b/pandas/conftest.py index 5ac5e3670f69f..998c45d82fb4d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -857,6 +857,10 @@ def iris(datapath): "Asia/Tokyo", "dateutil/US/Pacific", "dateutil/Asia/Singapore", + "+01:15", + "-02:15", + "UTC+01:15", + "UTC-02:15", tzutc(), tzlocal(), FixedOffset(300), diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f7b25f8c0eeac..9114edc19315f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -125,6 +125,21 @@ def df_full(): ) +@pytest.fixture( + params=[ + datetime.datetime.now(datetime.timezone.utc), + datetime.datetime.now(datetime.timezone.min), + datetime.datetime.now(datetime.timezone.max), + datetime.datetime.strptime("2019-01-04T16:41:24+0200", "%Y-%m-%dT%H:%M:%S%z"), + datetime.datetime.strptime("2019-01-04T16:41:24+0215", "%Y-%m-%dT%H:%M:%S%z"), + datetime.datetime.strptime("2019-01-04T16:41:24-0200", "%Y-%m-%dT%H:%M:%S%z"), + datetime.datetime.strptime("2019-01-04T16:41:24-0215", "%Y-%m-%dT%H:%M:%S%z"), + ] +) +def timezone_aware_date_list(request): + return request.param + + def check_round_trip( df, engine=None, @@ -134,6 +149,7 @@ def check_round_trip( expected=None, check_names=True, check_like=False, + check_dtype=True, repeat=2, ): """Verify parquet serializer and deserializer produce the same results. @@ -175,7 +191,11 @@ def compare(repeat): actual = read_parquet(path, **read_kwargs) tm.assert_frame_equal( - expected, actual, check_names=check_names, check_like=check_like + expected, + actual, + check_names=check_names, + check_like=check_like, + check_dtype=check_dtype, ) if path is None: @@ -739,6 +759,21 @@ def test_timestamp_nanoseconds(self, pa): df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": "2.0"}) + def test_timezone_aware_index(self, pa, timezone_aware_date_list): + idx = 5 * [timezone_aware_date_list] + df = pd.DataFrame(index=idx, data={"index_as_col": idx}) + + # see gh-36004 + # compare time(zone) values only, skip their class: + # pyarrow always creates fixed offset timezones using pytz.FixedOffset() + # even if it was datetime.timezone() originally + # + # technically they are the same: + # they both implement datetime.tzinfo + # they both wrap datetime.timedelta() + # this use-case sets the resolution to 1 minute + check_round_trip(df, pa, check_dtype=False) + @td.skip_if_no("pyarrow", min_version="0.17") def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 @@ -877,3 +912,12 @@ def test_empty_dataframe(self, fp): expected = df.copy() expected.index.name = "index" check_round_trip(df, fp, expected=expected) + + def test_timezone_aware_index(self, fp, timezone_aware_date_list): + idx = 5 * [timezone_aware_date_list] + + df = pd.DataFrame(index=idx, data={"index_as_col": idx}) + + expected = df.copy() + expected.index.name = "index" + check_round_trip(df, fp, expected=expected) diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index 81b41f567976d..e49f511fe3cc4 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timedelta, timezone import dateutil.tz import pytest @@ -118,3 +118,25 @@ def test_maybe_get_tz_invalid_types(): msg = "" with pytest.raises(TypeError, match=msg): timezones.maybe_get_tz(Timestamp.now("UTC")) + + +def test_maybe_get_tz_offset_only(): + # see gh-36004 + + # timezone.utc + tz = timezones.maybe_get_tz(timezone.utc) + assert tz == timezone(timedelta(hours=0, minutes=0)) + + # without UTC+- prefix + tz = timezones.maybe_get_tz("+01:15") + assert tz == timezone(timedelta(hours=1, minutes=15)) + + tz = timezones.maybe_get_tz("-01:15") + assert tz == timezone(-timedelta(hours=1, minutes=15)) + + # with UTC+- prefix + tz = timezones.maybe_get_tz("UTC+02:45") + assert tz == timezone(timedelta(hours=2, minutes=45)) + + tz = timezones.maybe_get_tz("UTC-02:45") + assert tz == timezone(-timedelta(hours=2, minutes=45))