-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
WIP: multi-timezone handling for array_to_datetime #24006
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
13336d9
4b80797
f0dccc7
1b36e6f
9d42d97
515a23b
edc177d
a2a01c9
1c6a8ee
9677010
cf8b4cc
ec35002
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,7 +32,9 @@ from tslibs.np_datetime import OutOfBoundsDatetime | |
from tslibs.parsing import parse_datetime_string | ||
|
||
from tslibs.timedeltas cimport cast_from_unit | ||
from tslibs.timezones cimport is_utc, is_tzlocal, get_dst_info | ||
from tslibs.timezones cimport ( | ||
is_utc, is_tzlocal, get_dst_info, tz_cache_key, get_utcoffset, | ||
is_fixed_offset, tz_compare, get_timezone) | ||
from tslibs.timezones import UTC | ||
from tslibs.conversion cimport (tz_convert_single, _TSObject, | ||
convert_datetime_to_tsobject, | ||
|
@@ -459,6 +461,55 @@ def array_with_unit_to_datetime(ndarray values, object unit, | |
return oresult | ||
|
||
|
||
cdef get_key(tz): | ||
if tz is None: | ||
return None | ||
if is_fixed_offset(tz): | ||
# TODO: these should all be mapped together | ||
try: | ||
# pytz | ||
return str(tz._minutes) # pytz specific? | ||
except AttributeError: | ||
try: | ||
# dateutil.tz.tzoffset | ||
return str(tz._offset.total_seconds()) | ||
except AttributeError: | ||
return str(tz) | ||
return tz_cache_key(tz) | ||
|
||
|
||
cdef fixed_offset_to_pytz(tz): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As mentioned in my comment, I don't think we shouldn't be converting dateutil objects to pytz objects unless completely necessary. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm inclined to agree, am seeing now how many tests break if I change this. What if we make The downside is that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just so I understand some implications:
In general I am not a fan of this coercion. I would prefer to maintain the individual timezones instances (or raise) over coercion. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It may be worth separately considering Also note that the existing code coerces dateutil tzoffsets parsed from strings to pytz.FixedOffset. As a result, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
If we end up doing this then, I would prefer to pick a consistent timezone object (probably pytz) and not pick a psudo-random one (e.g. pick the last one). In the case of mixed timezones though, would we coerce to object or raise? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
So we would need some kind of hierarchy like:
The approach I took in #23675 is to have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another (far in the future) option is to use a pandas-equivalent tzinfo as discussed here: #23959 (comment). But yeah I hierarchy would be needed in the short term. Since |
||
""" | ||
If we have a FixedOffset, ensure it is a pytz fixed offset | ||
""" | ||
if is_fixed_offset(tz): | ||
# tests expect pytz, not dateutil... | ||
if tz is pytz.utc: | ||
pass | ||
elif hasattr(tz, '_minutes'): | ||
# i.e. pytz | ||
pass # TODO: use the treat_as_pytz method? | ||
elif hasattr(tz, '_offset'): | ||
# i.e. dateutil # TODO: use the treat_as_dateutil method? | ||
secs = tz._offset.total_seconds() | ||
assert secs % 60 == 0, secs | ||
tz = pytz.FixedOffset(secs / 60) | ||
else: | ||
# e.g. custom FixedOffset implemented in tests | ||
pass | ||
# TODO: using the below breaks some tests and fixes others | ||
# off = get_utcoffset(tz, Timestamp.now()) | ||
# secs = off.total_seconds() | ||
# assert secs % 60 == 0, secs | ||
# tz = pytz.FixedOffset(secs / 60) | ||
|
||
elif is_utc(tz): | ||
# if we have a dateutil UTC (or stdlib), change to pytz to make | ||
# tests happy | ||
tz = pytz.utc | ||
return tz | ||
|
||
|
||
@cython.wraparound(False) | ||
@cython.boundscheck(False) | ||
cpdef array_to_datetime(ndarray[object] values, str errors='raise', | ||
|
@@ -504,25 +555,22 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', | |
npy_datetimestruct dts | ||
bint utc_convert = bool(utc) | ||
bint seen_integer = 0 | ||
bint seen_string = 0 | ||
bint seen_datetime = 0 | ||
bint seen_datetime_offset = 0 | ||
bint is_raise = errors=='raise' | ||
bint is_ignore = errors=='ignore' | ||
bint is_coerce = errors=='coerce' | ||
bint is_same_offsets | ||
_TSObject _ts | ||
int64_t value | ||
int out_local=0, out_tzoffset=0 | ||
float offset_seconds, tz_offset | ||
set out_tzoffset_vals = set() | ||
dict out_tzinfos = {} | ||
|
||
# specify error conditions | ||
assert is_raise or is_ignore or is_coerce | ||
|
||
result = np.empty(n, dtype='M8[ns]') | ||
iresult = result.view('i8') | ||
try: | ||
result = np.empty(n, dtype='M8[ns]') | ||
iresult = result.view('i8') | ||
for i in range(n): | ||
val = values[i] | ||
|
||
|
@@ -531,34 +579,18 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', | |
|
||
elif PyDateTime_Check(val): | ||
seen_datetime = 1 | ||
if val.tzinfo is not None: | ||
if utc_convert: | ||
try: | ||
_ts = convert_datetime_to_tsobject(val, None) | ||
iresult[i] = _ts.value | ||
except OutOfBoundsDatetime: | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
raise | ||
else: | ||
raise ValueError('Tz-aware datetime.datetime cannot ' | ||
'be converted to datetime64 unless ' | ||
'utc=True') | ||
else: | ||
iresult[i] = pydatetime_to_dt64(val, &dts) | ||
if not PyDateTime_CheckExact(val): | ||
# i.e. a Timestamp object | ||
iresult[i] += val.nanosecond | ||
try: | ||
check_dts_bounds(&dts) | ||
except OutOfBoundsDatetime: | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
raise | ||
out_tzinfos[get_key(val.tzinfo)] = val.tzinfo | ||
try: | ||
_ts = convert_datetime_to_tsobject(val, None) | ||
iresult[i] = _ts.value | ||
except OutOfBoundsDatetime: | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
raise | ||
|
||
elif PyDate_Check(val): | ||
# Treating as either naive or UTC | ||
seen_datetime = 1 | ||
iresult[i] = pydate_to_dt64(val, &dts) | ||
try: | ||
|
@@ -570,17 +602,15 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', | |
raise | ||
|
||
elif is_datetime64_object(val): | ||
# Treating as either naive or UTC | ||
seen_datetime = 1 | ||
if get_datetime64_value(val) == NPY_NAT: | ||
iresult[i] = NPY_NAT | ||
else: | ||
try: | ||
iresult[i] = get_datetime64_nanos(val) | ||
except OutOfBoundsDatetime: | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
raise | ||
try: | ||
iresult[i] = get_datetime64_nanos(val) | ||
except OutOfBoundsDatetime: | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
raise | ||
|
||
elif is_integer_object(val) or is_float_object(val): | ||
# these must be ns unit by-definition | ||
|
@@ -603,11 +633,11 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', | |
|
||
elif is_string_object(val): | ||
# string | ||
seen_string = 1 | ||
|
||
if len(val) == 0 or val in nat_strings: | ||
iresult[i] = NPY_NAT | ||
continue | ||
|
||
if isinstance(val, unicode) and PY2: | ||
val = val.encode('utf-8') | ||
|
||
|
@@ -617,6 +647,8 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', | |
# A ValueError at this point is a _parsing_ error | ||
# specifically _not_ OutOfBoundsDatetime | ||
if _parse_today_now(val, &iresult[i]): | ||
# TODO: Do we treat this as local? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel like we should follow There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's #18705 suggesting Consider two cases:
I'm not wild about having the parsed meaning for the last three entries depending on first entry. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do think that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The point is that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @mroeschke thoughts on this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay yeah I agree with your example. One argument's timezone information shouldn't propagate to the other arguments. I would opt for |
||
# "now" is UTC, "today" is local | ||
continue | ||
elif require_iso8601: | ||
# if requiring iso8601 strings, skip trying | ||
|
@@ -642,15 +674,8 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', | |
# If the dateutil parser returned tzinfo, capture it | ||
# to check if all arguments have the same tzinfo | ||
tz = py_dt.utcoffset() | ||
if tz is not None: | ||
seen_datetime_offset = 1 | ||
# dateutil timezone objects cannot be hashed, so store | ||
# the UTC offsets in seconds instead | ||
out_tzoffset_vals.add(tz.total_seconds()) | ||
else: | ||
# Add a marker for naive string, to track if we are | ||
# parsing mixed naive and aware strings | ||
out_tzoffset_vals.add('naive') | ||
out_tzinfos[get_key(py_dt.tzinfo)] = py_dt.tzinfo | ||
|
||
try: | ||
_ts = convert_datetime_to_tsobject(py_dt, None) | ||
iresult[i] = _ts.value | ||
|
@@ -670,17 +695,17 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', | |
# where we left off | ||
value = dtstruct_to_dt64(&dts) | ||
if out_local == 1: | ||
seen_datetime_offset = 1 | ||
# Store the out_tzoffset in seconds | ||
# since we store the total_seconds of | ||
# dateutil.tz.tzoffset objects | ||
out_tzoffset_vals.add(out_tzoffset * 60.) | ||
tz = pytz.FixedOffset(out_tzoffset) | ||
out_tzinfos[get_key(tz)] = tz | ||
value = tz_convert_single(value, tz, UTC) | ||
else: | ||
# Add a marker for naive string, to track if we are | ||
# parsing mixed naive and aware strings | ||
out_tzoffset_vals.add('naive') | ||
out_tzinfos[None] = None | ||
|
||
iresult[i] = value | ||
try: | ||
check_dts_bounds(&dts) | ||
|
@@ -722,21 +747,28 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', | |
else: | ||
raise TypeError | ||
|
||
if seen_datetime_offset and not utc_convert: | ||
# TODO: File bug report with cython. it raises | ||
# Closures Not Supported | ||
# error when I tried to use | ||
# `if any(key is not None for key in out_tzinfos)` | ||
keys = out_tzinfos.keys() | ||
nnkeys = [x for x in keys if x is not None] | ||
if len(nnkeys) and not utc_convert: | ||
# GH 17697 | ||
# 1) If all the offsets are equal, return one offset for | ||
# the parsed dates to (maybe) pass to DatetimeIndex | ||
# 2) If the offsets are different, then force the parsing down the | ||
# object path where an array of datetimes | ||
# (with individual dateutil.tzoffsets) are returned | ||
is_same_offsets = len(out_tzoffset_vals) == 1 | ||
is_same_offsets = len(out_tzinfos) == 1 | ||
if not is_same_offsets: | ||
return array_to_datetime_object(values, is_raise, | ||
dayfirst, yearfirst) | ||
else: | ||
tz_offset = out_tzoffset_vals.pop() | ||
tz_out = pytz.FixedOffset(tz_offset / 60.) | ||
tz_out = list(out_tzinfos.values())[0] | ||
tz_out = fixed_offset_to_pytz(tz_out) | ||
return result, tz_out | ||
|
||
except OutOfBoundsDatetime: | ||
if is_raise: | ||
raise | ||
|
@@ -816,6 +848,8 @@ cdef array_to_datetime_object(ndarray[object] values, bint is_raise, | |
if is_raise: | ||
raise | ||
return values, None | ||
elif PyDateTime_Check(val): | ||
oresult[i] = val # TODO: check_dts_bounds? | ||
else: | ||
if is_raise: | ||
raise | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you type & add a doc-string