Skip to content

Commit 9a8cebc

Browse files
mroeschke and jreback
authored and committed
BUG/API: to_datetime preserves UTC offsets when parsing datetime strings (#21822)
1 parent cf14366 commit 9a8cebc

File tree

14 files changed

+369
-100
lines changed

14 files changed

+369
-100
lines changed

asv_bench/benchmarks/timeseries.py

+19
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,25 @@ def time_iso8601_tz_spaceformat(self):
343343
to_datetime(self.strings_tz_space)
344344

345345

346+
class ToDatetimeNONISO8601(object):
    """Benchmark ``to_datetime`` on non-ISO8601 strings carrying UTC offsets.

    ``setup`` builds two equally sized lists of datetime strings: one in
    which every string has the same offset, and one split evenly between
    two different offsets.
    """

    goal_time = 0.2

    def setup(self):
        """Build the string lists consumed by the timing methods."""
        count = 10000
        plus_four = 'March 1, 2018 12:00:00+0400'
        plus_five = 'March 1, 2018 12:00:00+0500'
        first_half = [plus_four] * (count // 2)
        second_half = [plus_five] * (count // 2)
        self.same_offset = [plus_four] * count
        self.diff_offset = first_half + second_half

    def time_same_offset(self):
        """Time parsing strings that all share a single UTC offset."""
        to_datetime(self.same_offset)

    def time_different_offset(self):
        """Time parsing strings that use two distinct UTC offsets."""
        to_datetime(self.diff_offset)
364+
346365
class ToDatetimeFormat(object):
347366

348367
goal_time = 0.2

doc/source/whatsnew/v0.24.0.txt

+57
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,62 @@ For situations where you need an ``ndarray`` of ``Interval`` objects, use
224224
np.asarray(idx)
225225
idx.values.astype(object)
226226

227+
.. _whatsnew_0240.api.timezone_offset_parsing:
228+
229+
Parsing Datetime Strings with Timezone Offsets
230+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
231+
232+
Previously, parsing datetime strings with UTC offsets with :func:`to_datetime`
233+
or :class:`DatetimeIndex` would automatically convert the datetime to UTC
234+
without timezone localization. This is inconsistent with parsing the same
235+
datetime string with :class:`Timestamp` which would preserve the UTC
236+
offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC
237+
offset in the ``tz`` attribute when all the datetime strings have the same
238+
UTC offset (:issue:`17697`, :issue:`11736`)
239+
240+
*Previous Behavior*:
241+
242+
.. code-block:: ipython
243+
244+
245+
In [2]: pd.to_datetime("2015-11-18 15:30:00+05:30")
246+
Out[2]: Timestamp('2015-11-18 10:00:00')
247+
248+
In [3]: pd.Timestamp("2015-11-18 15:30:00+05:30")
249+
Out[3]: Timestamp('2015-11-18 15:30:00+0530', tz='pytz.FixedOffset(330)')
250+
251+
# Different UTC offsets would automatically convert the datetimes to UTC (without a UTC timezone)
252+
In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"])
253+
Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None)
254+
255+
*Current Behavior*:
256+
257+
.. ipython:: python
258+
259+
pd.to_datetime("2015-11-18 15:30:00+05:30")
260+
pd.Timestamp("2015-11-18 15:30:00+05:30")
261+
262+
Parsing datetime strings with the same UTC offset will preserve the UTC offset in the ``tz`` attribute
263+
264+
.. ipython:: python
265+
266+
pd.to_datetime(["2015-11-18 15:30:00+05:30"] * 2)
267+
268+
Parsing datetime strings with different UTC offsets will now create an Index of
269+
``datetime.datetime`` objects with different UTC offsets
270+
271+
.. ipython:: python
272+
273+
idx = pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"])
274+
idx
275+
idx[0]
276+
idx[1]
277+
278+
Passing ``utc=True`` will mimic the previous behavior but will correctly indicate
279+
that the dates have been converted to UTC
280+
281+
.. ipython:: python
282+
pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True)
227283

228284
.. _whatsnew_0240.api.datetimelike.normalize:
229285

@@ -439,6 +495,7 @@ Datetimelike
439495

440496
- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`)
441497
- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`, :issue:`21365`)
498+
- Bug in :func:`to_datetime` that did not consistently return an :class:`Index` when ``box=True`` was specified (:issue:`21864`)
442499

443500
Timedelta
444501
^^^^^^^^^

pandas/_libs/tslib.pyx

+145-29
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import numpy as np
1919
cnp.import_array()
2020

2121
import pytz
22+
from dateutil.tz import tzlocal, tzutc as dateutil_utc
2223

2324

2425
from util cimport (is_integer_object, is_float_object, is_string_object,
@@ -328,7 +329,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'):
328329
if unit == 'ns':
329330
if issubclass(values.dtype.type, np.integer):
330331
return values.astype('M8[ns]')
331-
return array_to_datetime(values.astype(object), errors=errors)
332+
return array_to_datetime(values.astype(object), errors=errors)[0]
332333

333334
m = cast_from_unit(None, unit)
334335

@@ -457,21 +458,58 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
457458
dayfirst=False, yearfirst=False,
458459
format=None, utc=None,
459460
require_iso8601=False):
461+
"""
462+
Converts a 1D array of date-like values to a numpy array of either:
463+
1) datetime64[ns] data
464+
2) datetime.datetime objects, if OutOfBoundsDatetime or TypeError
465+
is encountered
466+
467+
Also returns a pytz.FixedOffset if an array of strings with the same
468+
timezone offset is passed and utc=True is not passed. Otherwise, None
469+
is returned
470+
471+
Handles datetime.date, datetime.datetime, np.datetime64 objects, numeric,
472+
strings
473+
474+
Parameters
475+
----------
476+
values : ndarray of object
477+
date-like objects to convert
478+
errors : str, default 'raise'
479+
error behavior when parsing
480+
dayfirst : bool, default False
481+
dayfirst parsing behavior when encountering datetime strings
482+
yearfirst : bool, default False
483+
yearfirst parsing behavior when encountering datetime strings
484+
format : str, default None
485+
format of the string to parse
486+
utc : bool, default None
487+
indicator whether the dates should be UTC
488+
require_iso8601 : bool, default False
489+
indicator whether the datetime string should be iso8601
490+
491+
Returns
492+
-------
493+
tuple (ndarray, tzoffset)
494+
"""
460495
cdef:
461496
Py_ssize_t i, n = len(values)
462-
object val, py_dt
497+
object val, py_dt, tz, tz_out = None
463498
ndarray[int64_t] iresult
464499
ndarray[object] oresult
465500
npy_datetimestruct dts
466501
bint utc_convert = bool(utc)
467502
bint seen_integer = 0
468503
bint seen_string = 0
469504
bint seen_datetime = 0
505+
bint seen_datetime_offset = 0
470506
bint is_raise = errors=='raise'
471507
bint is_ignore = errors=='ignore'
472508
bint is_coerce = errors=='coerce'
473509
_TSObject _ts
474510
int out_local=0, out_tzoffset=0
511+
float offset_seconds
512+
set out_tzoffset_vals = set()
475513

476514
# specify error conditions
477515
assert is_raise or is_ignore or is_coerce
@@ -584,7 +622,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
584622
raise ValueError("time data {val} doesn't match "
585623
"format specified"
586624
.format(val=val))
587-
return values
625+
return values, tz_out
588626

589627
try:
590628
py_dt = parse_datetime_string(val, dayfirst=dayfirst,
@@ -595,6 +633,30 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
595633
continue
596634
raise TypeError("invalid string coercion to datetime")
597635

636+
# If the dateutil parser returned tzinfo, capture it
637+
# to check if all arguments have the same tzinfo
638+
tz = py_dt.tzinfo
639+
if tz is not None:
640+
seen_datetime_offset = 1
641+
if tz == dateutil_utc():
642+
# dateutil.tz.tzutc has no offset-like attribute
643+
# Just add the 0 offset explicitly
644+
out_tzoffset_vals.add(0)
645+
elif tz == tzlocal():
646+
# is comparison fails unlike other dateutil.tz
647+
# objects. Also, dateutil.tz.tzlocal has no
648+
# _offset attribute like tzoffset
649+
offset_seconds = tz._dst_offset.total_seconds()
650+
out_tzoffset_vals.add(offset_seconds)
651+
else:
652+
# dateutil.tz.tzoffset objects cannot be hashed
653+
# store the total_seconds() instead
654+
offset_seconds = tz._offset.total_seconds()
655+
out_tzoffset_vals.add(offset_seconds)
656+
else:
657+
# Add a marker for naive string, to track if we are
658+
# parsing mixed naive and aware strings
659+
out_tzoffset_vals.add('naive')
598660
try:
599661
_ts = convert_datetime_to_tsobject(py_dt, None)
600662
iresult[i] = _ts.value
@@ -614,8 +676,17 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
614676
# where we left off
615677
value = dtstruct_to_dt64(&dts)
616678
if out_local == 1:
679+
seen_datetime_offset = 1
680+
# Store the out_tzoffset in seconds
681+
# since we store the total_seconds of
682+
# dateutil.tz.tzoffset objects
683+
out_tzoffset_vals.add(out_tzoffset * 60.)
617684
tz = pytz.FixedOffset(out_tzoffset)
618685
value = tz_convert_single(value, tz, 'UTC')
686+
else:
687+
# Add a marker for naive string, to track if we are
688+
# parsing mixed naive and aware strings
689+
out_tzoffset_vals.add('naive')
619690
iresult[i] = value
620691
try:
621692
check_dts_bounds(&dts)
@@ -631,7 +702,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
631702
raise ValueError("time data {val} doesn't "
632703
"match format specified"
633704
.format(val=val))
634-
return values
705+
return values, tz_out
635706
raise
636707

637708
else:
@@ -657,7 +728,21 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
657728
else:
658729
raise TypeError
659730

660-
return result
731+
if seen_datetime_offset and not utc_convert:
732+
# GH 17697
733+
# 1) If all the offsets are equal, return one offset for
734+
# the parsed dates to (maybe) pass to DatetimeIndex
735+
# 2) If the offsets are different, then force the parsing down the
736+
# object path where an array of datetimes
737+
# (with individual dateutil.tzoffsets) are returned
738+
is_same_offsets = len(out_tzoffset_vals) == 1
739+
if not is_same_offsets:
740+
return array_to_datetime_object(values, is_raise,
741+
dayfirst, yearfirst)
742+
else:
743+
tz_offset = out_tzoffset_vals.pop()
744+
tz_out = pytz.FixedOffset(tz_offset / 60.)
745+
return result, tz_out
661746
except OutOfBoundsDatetime:
662747
if is_raise:
663748
raise
@@ -679,36 +764,67 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
679764
oresult[i] = val.item()
680765
else:
681766
oresult[i] = val
682-
return oresult
767+
return oresult, tz_out
683768
except TypeError:
684-
oresult = np.empty(n, dtype=object)
769+
return array_to_datetime_object(values, is_raise, dayfirst, yearfirst)
685770

686-
for i in range(n):
687-
val = values[i]
688-
if checknull_with_nat(val):
689-
oresult[i] = val
690-
elif is_string_object(val):
691771

692-
if len(val) == 0 or val in nat_strings:
693-
oresult[i] = 'NaT'
694-
continue
772+
cdef array_to_datetime_object(ndarray[object] values, bint is_raise,
                              dayfirst=False, yearfirst=False):
    """
    Fall back function for array_to_datetime

    Attempts to parse datetime strings with dateutil to return an array
    of datetime objects

    Parameters
    ----------
    values : ndarray of object
         date-like objects to convert
    is_raise : bool
         error behavior when parsing; if True, failures raise, otherwise
         the input ``values`` is handed back unchanged
    dayfirst : bool, default False
         dayfirst parsing behavior when encountering datetime strings
    yearfirst : bool, default False
         yearfirst parsing behavior when encountering datetime strings

    Returns
    -------
    tuple (ndarray, None)
        The object array of parsed values, or the original ``values`` when
        an element could not be handled and ``is_raise`` is False. The
        second item is always None.

    Raises
    ------
    ValueError, OverflowError
        Propagated from string parsing or bounds checking when ``is_raise``
        is True.
    TypeError
        When ``is_raise`` is True and an element is neither null-like nor
        a string.
    """
    cdef:
        Py_ssize_t i, n = len(values)
        object val
        ndarray[object] oresult
        npy_datetimestruct dts

    oresult = np.empty(n, dtype=object)

    # We return an object array and only attempt to parse:
    # 1) NaT or NaT-like values
    # 2) datetime strings, which we return as datetime.datetime
    for i in range(n):
        val = values[i]
        if checknull_with_nat(val):
            oresult[i] = val
        elif is_string_object(val):
            if len(val) == 0 or val in nat_strings:
                oresult[i] = 'NaT'
                continue
            try:
                oresult[i] = parse_datetime_string(val, dayfirst=dayfirst,
                                                   yearfirst=yearfirst)
                # Conversion + bounds check are run only for validation;
                # the parsed datetime object itself is what gets stored
                pydatetime_to_dt64(oresult[i], &dts)
                check_dts_bounds(&dts)
            except (ValueError, OverflowError):
                if is_raise:
                    raise
                return values, None
        else:
            if is_raise:
                # This function is not always entered from inside an
                # ``except`` block, so a bare ``raise`` may have no active
                # exception to re-raise; raise explicitly instead
                raise TypeError("{typ} is not convertible to datetime"
                                .format(typ=type(val)))
            return values, None
    return oresult, None
712828

713829

714830
cdef inline bint _parse_today_now(str val, int64_t* iresult):

pandas/core/dtypes/cast.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -918,7 +918,7 @@ def try_datetime(v):
918918
# GH19671
919919
v = tslib.array_to_datetime(v,
920920
require_iso8601=True,
921-
errors='raise')
921+
errors='raise')[0]
922922
except ValueError:
923923

924924
# we might have a sequence of the same-datetimes with tz's

0 commit comments

Comments
 (0)