Skip to content

BUG: Can't restore index from parquet with offset-specified timezone #35997 #36004

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ I/O
- Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`)
- Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`)
- Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`)
- Bug in :meth:`read_parquet` with fixed offset timezones. String representation of timezones was not recognized (:issue:`35997`, :issue:`36004`)

Plotting
^^^^^^^^
Expand Down
10 changes: 9 additions & 1 deletion pandas/_libs/tslibs/timezones.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import timezone
from datetime import timedelta, timezone

from cpython.datetime cimport datetime, timedelta, tzinfo

Expand Down Expand Up @@ -102,6 +102,14 @@ cpdef inline tzinfo maybe_get_tz(object tz):
# On Python 3 on Windows, the filename is not always set correctly.
if isinstance(tz, _dateutil_tzfile) and '.tar.gz' in tz._filename:
tz._filename = zone
elif tz[0] in {'-', '+'}:
hours = int(tz[0:3])
minutes = int(tz[0] + tz[4:6])
tz = timezone(timedelta(hours=hours, minutes=minutes))
elif tz[0:4] in {'UTC-', 'UTC+'}:
hours = int(tz[3:6])
minutes = int(tz[3] + tz[7:9])
tz = timezone(timedelta(hours=hours, minutes=minutes))
else:
tz = pytz.timezone(tz)
elif is_integer_object(tz):
Expand Down
4 changes: 4 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,10 @@ def iris(datapath):
"Asia/Tokyo",
"dateutil/US/Pacific",
"dateutil/Asia/Singapore",
"+01:15",
"-02:15",
"UTC+01:15",
"UTC-02:15",
tzutc(),
tzlocal(),
FixedOffset(300),
Expand Down
46 changes: 45 additions & 1 deletion pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,21 @@ def df_full():
)


@pytest.fixture(
params=[
datetime.datetime.now(datetime.timezone.utc),
datetime.datetime.now(datetime.timezone.min),
datetime.datetime.now(datetime.timezone.max),
datetime.datetime.strptime("2019-01-04T16:41:24+0200", "%Y-%m-%dT%H:%M:%S%z"),
datetime.datetime.strptime("2019-01-04T16:41:24+0215", "%Y-%m-%dT%H:%M:%S%z"),
datetime.datetime.strptime("2019-01-04T16:41:24-0200", "%Y-%m-%dT%H:%M:%S%z"),
datetime.datetime.strptime("2019-01-04T16:41:24-0215", "%Y-%m-%dT%H:%M:%S%z"),
]
)
def timezone_aware_date_list(request):
return request.param


def check_round_trip(
df,
engine=None,
Expand All @@ -134,6 +149,7 @@ def check_round_trip(
expected=None,
check_names=True,
check_like=False,
check_dtype=True,
repeat=2,
):
"""Verify parquet serializer and deserializer produce the same results.
Expand Down Expand Up @@ -175,7 +191,11 @@ def compare(repeat):
actual = read_parquet(path, **read_kwargs)

tm.assert_frame_equal(
expected, actual, check_names=check_names, check_like=check_like
expected,
actual,
check_names=check_names,
check_like=check_like,
check_dtype=check_dtype,
)

if path is None:
Expand Down Expand Up @@ -739,6 +759,21 @@ def test_timestamp_nanoseconds(self, pa):
df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)})
check_round_trip(df, pa, write_kwargs={"version": "2.0"})

def test_timezone_aware_index(self, pa, timezone_aware_date_list):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you change this so that on future versions of pyarrow it will set check_dtype=True (IOW when the bug is fixed). otherwise I am worried this will perpetuate forever.

alternatively, could remove the check_dtype and just xfail this (and when new pyarrow fixes this test will then start to xpass which will make it fail as we have strict=True). so again could make this a conditional on an older / newer version of pyarrow.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback based on the comment above by @jorisvandenbossche , I don't expect this to be "fixed" in Arrow. pytz.FixedOffset is a wrapper around timedelta like datetime.timezone is and they both implement datetime.tzinfo. ofc I'm happy to do any change you ask

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, there are no active plans in pyarrow to change this. Although it makes probably sense to use datetime.timezone, but if we do that, it's pytz.FixedOffset that won't be preserved, in which case another test probably needs a check_dtype=False.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback I added a long description. I know this won't help a follow up improvement in the future, but at this point it's unlikely to change.

There are two scenarios:

  1. arrow/parquet persists the initial class information
  2. the assert_ introduces a sub-feature of check_dtype=True (check_timezone_dtype=False by default) accepting pytz.FixedOffset and datetime.timezone equality

I don't see any of them on the short or mid-term roadmap.

idx = 5 * [timezone_aware_date_list]
df = pd.DataFrame(index=idx, data={"index_as_col": idx})

# see gh-36004
# compare time(zone) values only, skip their class:
# pyarrow always creates fixed offset timezones using pytz.FixedOffset()
# even if it was datetime.timezone() originally
#
# technically they are the same:
# they both implement datetime.tzinfo
# they both wrap datetime.timedelta()
# this use-case sets the resolution to 1 minute
check_round_trip(df, pa, check_dtype=False)

@td.skip_if_no("pyarrow", min_version="0.17")
def test_filter_row_groups(self, pa):
# https://github.com/pandas-dev/pandas/issues/26551
Expand Down Expand Up @@ -877,3 +912,12 @@ def test_empty_dataframe(self, fp):
expected = df.copy()
expected.index.name = "index"
check_round_trip(df, fp, expected=expected)

def test_timezone_aware_index(self, fp, timezone_aware_date_list):
idx = 5 * [timezone_aware_date_list]

df = pd.DataFrame(index=idx, data={"index_as_col": idx})

expected = df.copy()
expected.index.name = "index"
check_round_trip(df, fp, expected=expected)
24 changes: 23 additions & 1 deletion pandas/tests/tslibs/test_timezones.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timedelta, timezone

import dateutil.tz
import pytest
Expand Down Expand Up @@ -118,3 +118,25 @@ def test_maybe_get_tz_invalid_types():
msg = "<class 'pandas._libs.tslibs.timestamps.Timestamp'>"
with pytest.raises(TypeError, match=msg):
timezones.maybe_get_tz(Timestamp.now("UTC"))


def test_maybe_get_tz_offset_only():
# see gh-36004

# timezone.utc
tz = timezones.maybe_get_tz(timezone.utc)
assert tz == timezone(timedelta(hours=0, minutes=0))

# without UTC+- prefix
tz = timezones.maybe_get_tz("+01:15")
assert tz == timezone(timedelta(hours=1, minutes=15))

tz = timezones.maybe_get_tz("-01:15")
assert tz == timezone(-timedelta(hours=1, minutes=15))

# with UTC+- prefix
tz = timezones.maybe_get_tz("UTC+02:45")
assert tz == timezone(timedelta(hours=2, minutes=45))

tz = timezones.maybe_get_tz("UTC-02:45")
assert tz == timezone(-timedelta(hours=2, minutes=45))