Skip to content

Commit cbbf5cf

Browse files
committed
BUG: Pandas can't restore index from parquet with offset-specified timezone pandas-dev#35997
1 parent f1adcd1 commit cbbf5cf

File tree

5 files changed

+82
-3
lines changed

5 files changed

+82
-3
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,7 @@ I/O
392392
- Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`)
393393
- Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`)
394394
- Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`)
395+
- Bug in :meth:`read_parquet` where the string representation of fixed offset timezones was not recognized (:issue:`35997`, :issue:`36004`)
395396

396397
Plotting
397398
^^^^^^^^

pandas/_libs/tslibs/timezones.pyx

+9-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from datetime import timezone
1+
from datetime import timedelta, timezone
22

33
from cpython.datetime cimport datetime, timedelta, tzinfo
44

@@ -102,6 +102,14 @@ cpdef inline tzinfo maybe_get_tz(object tz):
102102
# On Python 3 on Windows, the filename is not always set correctly.
103103
if isinstance(tz, _dateutil_tzfile) and '.tar.gz' in tz._filename:
104104
tz._filename = zone
105+
elif tz[0] in {'-', '+'}:
106+
hours = int(tz[0:3])
107+
minutes = int(tz[0] + tz[4:6])
108+
tz = timezone(timedelta(hours=hours, minutes=minutes))
109+
elif tz[0:4] in {'UTC-', 'UTC+'}:
110+
hours = int(tz[3:6])
111+
minutes = int(tz[3] + tz[7:9])
112+
tz = timezone(timedelta(hours=hours, minutes=minutes))
105113
else:
106114
tz = pytz.timezone(tz)
107115
elif is_integer_object(tz):

pandas/conftest.py

+4
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,10 @@ def iris(datapath):
857857
"Asia/Tokyo",
858858
"dateutil/US/Pacific",
859859
"dateutil/Asia/Singapore",
860+
"+01:15",
861+
"-02:15",
862+
"UTC+01:15",
863+
"UTC-02:15",
860864
tzutc(),
861865
tzlocal(),
862866
FixedOffset(300),

pandas/tests/io/test_parquet.py

+45-1
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,21 @@ def df_full():
125125
)
126126

127127

128+
@pytest.fixture(
    params=[
        # "now" expressed in UTC and in the two extreme fixed offsets
        *(
            datetime.datetime.now(tz)
            for tz in (
                datetime.timezone.utc,
                datetime.timezone.min,
                datetime.timezone.max,
            )
        ),
        # fixed timestamps with whole-hour and quarter-hour offsets, both signs
        *(
            datetime.datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S%z")
            for stamp in (
                "2019-01-04T16:41:24+0200",
                "2019-01-04T16:41:24+0215",
                "2019-01-04T16:41:24-0200",
                "2019-01-04T16:41:24-0215",
            )
        ),
    ]
)
def timezone_aware_date_list(request):
    """
    Fixture yielding one timezone-aware ``datetime.datetime`` per parameter.

    Despite the name, a single datetime (not a list) is returned; the tests
    that use it build their own list from that value.
    """
    return request.param
141+
142+
128143
def check_round_trip(
129144
df,
130145
engine=None,
@@ -134,6 +149,7 @@ def check_round_trip(
134149
expected=None,
135150
check_names=True,
136151
check_like=False,
152+
check_dtype=True,
137153
repeat=2,
138154
):
139155
"""Verify parquet serializer and deserializer produce the same results.
@@ -175,7 +191,11 @@ def compare(repeat):
175191
actual = read_parquet(path, **read_kwargs)
176192

177193
tm.assert_frame_equal(
178-
expected, actual, check_names=check_names, check_like=check_like
194+
expected,
195+
actual,
196+
check_names=check_names,
197+
check_like=check_like,
198+
check_dtype=check_dtype,
179199
)
180200

181201
if path is None:
@@ -739,6 +759,21 @@ def test_timestamp_nanoseconds(self, pa):
739759
df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)})
740760
check_round_trip(df, pa, write_kwargs={"version": "2.0"})
741761

762+
def test_timezone_aware_index(self, pa, timezone_aware_date_list):
    """Round-trip a frame whose index and single column hold the same tz-aware datetime."""
    stamps = [timezone_aware_date_list] * 5
    frame = pd.DataFrame(data={"index_as_col": stamps}, index=stamps)

    # see gh-36004
    # Only the time(zone) values are compared; the tzinfo class is ignored.
    # pyarrow always builds fixed offset timezones with pytz.FixedOffset(),
    # even when the original was a datetime.timezone().
    #
    # Technically the two are the same:
    #   - both implement datetime.tzinfo
    #   - both wrap datetime.timedelta()
    # This use-case sets the resolution to 1 minute.
    check_round_trip(frame, pa, check_dtype=False)
776+
742777
@td.skip_if_no("pyarrow", min_version="0.17")
743778
def test_filter_row_groups(self, pa):
744779
# https://github.com/pandas-dev/pandas/issues/26551
@@ -877,3 +912,12 @@ def test_empty_dataframe(self, fp):
877912
expected = df.copy()
878913
expected.index.name = "index"
879914
check_round_trip(df, fp, expected=expected)
915+
916+
def test_timezone_aware_index(self, fp, timezone_aware_date_list):
    """Round-trip a frame whose index and single column hold the same tz-aware datetime."""
    stamps = [timezone_aware_date_list] * 5
    frame = pd.DataFrame(data={"index_as_col": stamps}, index=stamps)

    # The expected frame is a copy of the input whose index is named "index".
    expected = frame.copy()
    expected.index.name = "index"

    check_round_trip(frame, fp, expected=expected)

pandas/tests/tslibs/test_timezones.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from datetime import datetime
1+
from datetime import datetime, timedelta, timezone
22

33
import dateutil.tz
44
import pytest
@@ -118,3 +118,25 @@ def test_maybe_get_tz_invalid_types():
118118
msg = "<class 'pandas._libs.tslibs.timestamps.Timestamp'>"
119119
with pytest.raises(TypeError, match=msg):
120120
timezones.maybe_get_tz(Timestamp.now("UTC"))
121+
122+
123+
def test_maybe_get_tz_offset_only():
    """Check ``maybe_get_tz`` results for ``timezone.utc`` and fixed-offset strings."""
    # see gh-36004

    # timezone.utc compares equal to a zero-offset timezone
    utc = timezones.maybe_get_tz(timezone.utc)
    assert utc == timezone(timedelta(hours=0, minutes=0))

    quarter_past_one = timedelta(hours=1, minutes=15)
    quarter_to_three = timedelta(hours=2, minutes=45)

    # (input string, expected UTC offset); order matches the original checks
    cases = [
        # without UTC+- prefix
        ("+01:15", quarter_past_one),
        ("-01:15", -quarter_past_one),
        # with UTC+- prefix
        ("UTC+02:45", quarter_to_three),
        ("UTC-02:45", -quarter_to_three),
    ]
    for spec, offset in cases:
        tz = timezones.maybe_get_tz(spec)
        assert tz == timezone(offset)

0 commit comments

Comments
 (0)