Skip to content

WIP: multi-timezone handling for array_to_datetime #24006

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 12 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 44 additions & 35 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ from tslibs.np_datetime import OutOfBoundsDatetime
from tslibs.parsing import parse_datetime_string

from tslibs.timedeltas cimport cast_from_unit
from tslibs.timezones cimport is_utc, is_tzlocal, get_dst_info
from tslibs.timezones cimport (
is_utc, is_tzlocal, get_dst_info, tz_cache_key, get_utcoffset,
is_fixed_offset, tz_compare, get_timezone)
from tslibs.timezones import UTC
from tslibs.conversion cimport (tz_convert_single, _TSObject,
convert_datetime_to_tsobject,
Expand Down Expand Up @@ -459,6 +461,26 @@ def array_with_unit_to_datetime(ndarray values, object unit,
return oresult


cdef get_key(tz):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add type annotations and a docstring?

if tz is None:
return None
if is_fixed_offset(tz):
# TODO: these should all be mapped together
try:
# pytz
return str(tz._minutes) # pytz specific?
except AttributeError:
try:
# dateutil.tz.tzoffset
return str(tz._offset.total_seconds())
except AttributeError:
return str(tz)
if is_utc(tz):
return 'UTC'
return tz_cache_key(tz)



@cython.wraparound(False)
@cython.boundscheck(False)
cpdef array_to_datetime(ndarray[object] values, str errors='raise',
Expand Down Expand Up @@ -506,16 +528,14 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise',
bint seen_integer = 0
bint seen_string = 0
bint seen_datetime = 0
bint seen_datetime_offset = 0
bint is_raise = errors=='raise'
bint is_ignore = errors=='ignore'
bint is_coerce = errors=='coerce'
bint is_same_offsets
_TSObject _ts
int64_t value
int out_local=0, out_tzoffset=0
float offset_seconds, tz_offset
set out_tzoffset_vals = set()
dict out_tzinfos = {}

# specify error conditions
assert is_raise or is_ignore or is_coerce
Expand All @@ -533,27 +553,18 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise',

elif PyDateTime_Check(val):
seen_datetime = 1
if val.tzinfo is not None:
if utc_convert:
_ts = convert_datetime_to_tsobject(val, None)
iresult[i] = _ts.value
else:
raise ValueError('Tz-aware datetime.datetime '
'cannot be converted to '
'datetime64 unless utc=True')
else:
iresult[i] = pydatetime_to_dt64(val, &dts)
if not PyDateTime_CheckExact(val):
# i.e. a Timestamp object
iresult[i] += val.nanosecond
check_dts_bounds(&dts)
out_tzinfos[get_key(val.tzinfo)] = val.tzinfo
_ts = convert_datetime_to_tsobject(val, None)
iresult[i] = _ts.value

elif PyDate_Check(val):
# Treating as either naive or UTC
seen_datetime = 1
iresult[i] = pydate_to_dt64(val, &dts)
check_dts_bounds(&dts)

elif is_datetime64_object(val):
# Treating as either naive or UTC
seen_datetime = 1
iresult[i] = get_datetime64_nanos(val)

Expand Down Expand Up @@ -592,6 +603,8 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise',
# A ValueError at this point is a _parsing_ error
# specifically _not_ OutOfBoundsDatetime
if _parse_today_now(val, &iresult[i]):
# TODO: Do we treat this as local?
# "now" is UTC, "today" is local
continue
elif require_iso8601:
# if requiring iso8601 strings, skip trying
Expand Down Expand Up @@ -619,16 +632,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise',
# If the dateutil parser returned tzinfo, capture it
# to check if all arguments have the same tzinfo
tz = py_dt.utcoffset()
if tz is not None:
seen_datetime_offset = 1
# dateutil timezone objects cannot be hashed, so
# store the UTC offsets in seconds instead
out_tzoffset_vals.add(tz.total_seconds())
else:
# Add a marker for naive string, to track if we are
# parsing mixed naive and aware strings
out_tzoffset_vals.add('naive')

out_tzinfos[get_key(py_dt.tzinfo)] = py_dt.tzinfo
_ts = convert_datetime_to_tsobject(py_dt, None)
iresult[i] = _ts.value
except:
Expand All @@ -642,17 +646,17 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise',
# where we left off
value = dtstruct_to_dt64(&dts)
if out_local == 1:
seen_datetime_offset = 1
# Store the out_tzoffset in seconds
# since we store the total_seconds of
# dateutil.tz.tzoffset objects
out_tzoffset_vals.add(out_tzoffset * 60.)
tz = pytz.FixedOffset(out_tzoffset)
out_tzinfos[get_key(tz)] = tz
value = tz_convert_single(value, tz, UTC)
else:
# Add a marker for naive string, to track if we are
# parsing mixed naive and aware strings
out_tzoffset_vals.add('naive')
out_tzinfos[None] = None

iresult[i] = value
check_dts_bounds(&dts)

Expand Down Expand Up @@ -704,20 +708,25 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise',
return array_to_datetime_object(values, is_raise,
dayfirst, yearfirst)

if seen_datetime_offset and not utc_convert:
# TODO: File bug report with cython. it raises
# Closures Not Supported
# error when I tried to use
# `if any(key is not None for key in out_tzinfos)`
keys = out_tzinfos.keys()
nnkeys = [x for x in keys if x is not None]
if len(nnkeys) and not utc_convert:
# GH#17697
# 1) If all the offsets are equal, return one offset for
# the parsed dates to (maybe) pass to DatetimeIndex
# 2) If the offsets are different, then force the parsing down the
# object path where an array of datetimes
# (with individual dateutil.tzoffsets) are returned
is_same_offsets = len(out_tzoffset_vals) == 1
is_same_offsets = len(out_tzinfos) == 1
if not is_same_offsets:
return array_to_datetime_object(values, is_raise,
dayfirst, yearfirst)
else:
tz_offset = out_tzoffset_vals.pop()
tz_out = pytz.FixedOffset(tz_offset / 60.)
tz_out = list(out_tzinfos.values())[0]
return result, tz_out


Expand Down
57 changes: 4 additions & 53 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ from timezones import UTC
from parsing import parse_datetime_string

from nattype import nat_strings
from nattype cimport NPY_NAT, checknull_with_nat, c_NaT as NaT
from nattype cimport NPY_NAT, c_NaT as NaT

# ----------------------------------------------------------------------
# Constants
Expand Down Expand Up @@ -68,6 +68,9 @@ cdef inline int64_t get_datetime64_nanos(object val) except? -1:

unit = get_datetime64_unit(val)

if ival == NPY_NAT:
return ival

if unit != NPY_FR_ns:
pandas_datetime_to_datetimestruct(ival, unit, &dts)
check_dts_bounds(&dts)
Expand Down Expand Up @@ -146,58 +149,6 @@ def ensure_timedelta64ns(arr: ndarray, copy: bool=True):
# TODO: check for overflows when going from a lower-resolution to nanos


@cython.boundscheck(False)
@cython.wraparound(False)
def datetime_to_datetime64(values: object[:]):
"""
Convert ndarray of datetime-like objects to an array of nanosecond
timestamps, inferring a single shared timezone along the way.

Parameters
----------
values : ndarray[object]
Each element must be null-like (per ``checknull_with_nat``) or a
datetime instance (per ``PyDateTime_Check``).

Returns
-------
result : ndarray
Created with dtype 'M8[ns]'; filled through an 'i8' view of itself.
inferred_tz : tzinfo or None
``get_timezone`` of the first tz-aware value seen, or None if no
tz-aware value was encountered.

Raises
------
ValueError
If a tz-aware value does not match the already-inferred timezone
(per ``tz_compare``), or if a tz-naive value is encountered after a
timezone has been inferred.
TypeError
If a value is neither null-like nor a datetime.
"""
cdef:
Py_ssize_t i, n = len(values)
object val, inferred_tz = None
int64_t[:] iresult
npy_datetimestruct dts
_TSObject _ts

result = np.empty(n, dtype='M8[ns]')
# int64 view sharing memory with ``result``; writes below populate ``result``
iresult = result.view('i8')
for i in range(n):
val = values[i]
if checknull_with_nat(val):
# null-like entries are stored as the NaT sentinel
iresult[i] = NPY_NAT
elif PyDateTime_Check(val):
if val.tzinfo is not None:
if inferred_tz is not None:
# every later tz-aware value must match the first one seen
if not tz_compare(val.tzinfo, inferred_tz):
raise ValueError('Array must be all same time zone')
else:
# first tz-aware value fixes the timezone for the whole array
inferred_tz = get_timezone(val.tzinfo)

_ts = convert_datetime_to_tsobject(val, None)
iresult[i] = _ts.value
# presumably raises if the value is outside the datetime64[ns]
# range -- TODO confirm against check_dts_bounds
check_dts_bounds(&_ts.dts)
else:
# NOTE(review): this only rejects a naive value that comes AFTER
# a tz-aware one; naive values preceding the first tz-aware value
# pass through because inferred_tz is still None at that point.
if inferred_tz is not None:
raise ValueError('Cannot mix tz-aware with '
'tz-naive values')
iresult[i] = pydatetime_to_dt64(val, &dts)
check_dts_bounds(&dts)
else:
raise TypeError('Unrecognized value type: %s' % type(val))

return result, inferred_tz


cdef inline maybe_datetimelike_to_i8(object val):
"""
Try to convert to a nanosecond timestamp. Fall back to returning the
Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/timezones.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ cdef bint is_tzlocal(object tz)
cdef bint treat_tz_as_pytz(object tz)
cdef bint treat_tz_as_dateutil(object tz)

cpdef bint tz_compare(object start, object end)
cpdef bint tz_compare(object start, object end) except? -1
cpdef object get_timezone(object tz)
cpdef object maybe_get_tz(object tz)

cdef get_utcoffset(tzinfo, obj)
cdef bint is_fixed_offset(object tz)

cdef object get_dst_info(object tz)

cpdef object tz_cache_key(object tz)
28 changes: 24 additions & 4 deletions pandas/_libs/tslibs/timezones.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def _p_tz_cache_key(tz):
dst_cache = {}


cdef inline object tz_cache_key(object tz):
cpdef object tz_cache_key(object tz):
"""
Return the key in the cache for the timezone info object or None
if unknown.
Expand Down Expand Up @@ -158,8 +158,22 @@ cdef get_utcoffset(tzinfo, obj):
return tzinfo.utcoffset(obj)


cdef get_fixed_offset_total_seconds(tzinfo tz):
    """
    Return the UTC offset of a fixed-offset timezone in seconds.

    Gives pytz.FixedOffset, dateutil.tz.tzoffset and other fixed-offset
    tzinfo objects a common numeric representation, so that offsets coming
    from different libraries can be compared with ``==``.

    Parameters
    ----------
    tz : tzinfo
        Expected to be a fixed-offset timezone; this is not validated here.

    Returns
    -------
    float
        The result of ``total_seconds()`` on the timezone's offset.
    """
    if hasattr(tz, "_offset"):
        # dateutil, pytz
        return tz._offset.total_seconds()
    else:
        # tzinfo.utcoffset returns a timedelta; convert it to seconds so
        # both branches return the same type. Returning the raw timedelta
        # would never compare equal to the float produced above.
        # TODO: Will it ever want an actual datetime?
        return tz.utcoffset(None).total_seconds()


cdef inline bint is_fixed_offset(object tz):
if treat_tz_as_dateutil(tz):
if tz is None:
return 0
elif treat_tz_as_dateutil(tz):
if len(tz._trans_idx) == 0 and len(tz._trans_list) == 0:
return 1
else:
Expand All @@ -170,7 +184,9 @@ cdef inline bint is_fixed_offset(object tz):
return 1
else:
return 0
return 1
if not isinstance(tz, tzinfo):
return 0
return 1 # TODO: No!


cdef object get_utc_trans_times_from_dateutil_tz(object tz):
Expand Down Expand Up @@ -295,7 +311,7 @@ def infer_tzinfo(start, end):
return tz


cpdef bint tz_compare(object start, object end):
cpdef bint tz_compare(object start, object end) except? -1:
"""
Compare string representations of timezones

Expand All @@ -321,6 +337,10 @@ cpdef bint tz_compare(object start, object end):

"""
# GH 18523
if is_fixed_offset(start) and is_fixed_offset(end):
start_seconds = get_fixed_offset_total_seconds(start)
end_seconds = get_fixed_offset_total_seconds(end)
return start_seconds == end_seconds
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mroeschke @jreback are we in agreement that two FixedOffsets with matching UTC offsets should be considered equal?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems reasonable.

Can you just compare `start == end`?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No:

>>> off1 = pytz.FixedOffset(420)
>>> off2 = dateutil.tz.tzoffset(None, 420*60)
>>> off1 == off2
False

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok that makes sense

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds reasonable.

If a user passes both a pytz.FixedOffset and a dateutil.tz.tzoffset, will we be coercing to one of the tzinfos? One subtle point: if the dateutil.tz.tzoffset has a name but has the same offset as the pytz.FixedOffset, we should opt to keep the dateutil instance so we don't drop the name.

return get_timezone(start) == get_timezone(end)


Expand Down
30 changes: 11 additions & 19 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,7 +877,7 @@ def maybe_infer_to_datetimelike(value, convert_dates=False):
return value

shape = v.shape
if not v.ndim == 1:
if v.ndim != 1:
v = v.ravel()

if not len(v):
Expand All @@ -887,26 +887,18 @@ def try_datetime(v):
# safe coerce to datetime64
try:
# GH19671
v = tslib.array_to_datetime(v,
require_iso8601=True,
errors='raise')[0]
except ValueError:

# we might have a sequence of the same-datetimes with tz's
# if so coerce to a DatetimeIndex; if they are not the same,
# then these stay as object dtype, xref GH19671
try:
from pandas._libs.tslibs import conversion
from pandas import DatetimeIndex

values, tz = conversion.datetime_to_datetime64(v)
return DatetimeIndex(values).tz_localize(
'UTC').tz_convert(tz=tz)
except (ValueError, TypeError):
pass

v, inferred_tz = tslib.array_to_datetime(v,
require_iso8601=True,
errors='raise')
except Exception:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this still needed?

pass
else:
if inferred_tz is not None:
# TODO: de-duplicate with to_datetime
from pandas import DatetimeIndex
dti = DatetimeIndex(v).tz_localize('UTC')
return dti.tz_convert(tz=inferred_tz)
# TODO: possibly reshape?

return v.reshape(shape)

Expand Down
4 changes: 3 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,9 +364,11 @@ def _set_axis(self, axis, labels, fastpath=False):
# need to set here because we changed the index
if fastpath:
self._data.set_axis(axis, labels)
except (tslibs.OutOfBoundsDatetime, ValueError):
except (tslibs.OutOfBoundsDatetime, ValueError, TypeError):
# labels may exceed datetime bounds,
# or not be a DatetimeIndex
# GH#24006 TypeError can occur when all entries are
# datetimes but they do not have matching timezones
pass

self._set_subtyp(is_all_dates)
Expand Down
Loading