-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG/API: to_datetime preserves UTC offsets when parsing datetime strings #21822
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 15 commits
ac5a3d1
b81a8e9
6bf46a8
678b337
1917148
581a33e
a1bc8f9
80042e6
7c4135e
0651416
bacb6e3
a2f4aad
d48f341
7efb25c
1d527ff
f89d6b6
d91c63f
1054e8b
d9cdc91
7d04613
031284c
749e62e
23cbf75
1e6f87a
7539bcf
c1f51cd
db75a24
4733ac5
e3db735
2fa681f
5f36c98
d7ff275
4ff7cb3
8463d91
dddc6b3
cca3983
e441be0
a8a65f7
75f9fd9
6052475
f916c69
807a251
1cbd9b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -320,7 +320,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): | |
if unit == 'ns': | ||
if issubclass(values.dtype.type, np.integer): | ||
return values.astype('M8[ns]') | ||
return array_to_datetime(values.astype(object), errors=errors) | ||
return array_to_datetime(values.astype(object), errors=errors)[0] | ||
|
||
m = cast_from_unit(None, unit) | ||
|
||
|
@@ -449,26 +449,51 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
dayfirst=False, yearfirst=False, | ||
format=None, utc=None, | ||
require_iso8601=False): | ||
""" | ||
Converts a 1D array of date-like values to a numpy array of either: | ||
1) datetime64[ns] data | ||
2) datetime.datetime objects, if OutOfBoundsDatetime or TypeError | ||
is encountered | ||
|
||
Also returns a pytz.FixedOffset if an array of strings with the same | ||
timezone offset if passed and utc=True is not passed | ||
|
||
Handles datetime.date, datetime.datetime, np.datetime64 objects, numeric, | ||
strings | ||
|
||
Returns | ||
------- | ||
(ndarray, timezone offset) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. underscore between timezone and offset? |
||
""" | ||
cdef: | ||
Py_ssize_t i, n = len(values) | ||
object val, py_dt | ||
object val, py_dt, tz, tz_out = None | ||
ndarray[int64_t] iresult | ||
ndarray[object] oresult | ||
pandas_datetimestruct dts | ||
bint utc_convert = bool(utc) | ||
bint seen_integer = 0 | ||
bint seen_string = 0 | ||
bint seen_datetime = 0 | ||
bint seen_datetime_offset = 0 | ||
bint is_raise = errors=='raise' | ||
bint is_ignore = errors=='ignore' | ||
bint is_coerce = errors=='coerce' | ||
_TSObject _ts | ||
int out_local=0, out_tzoffset=0 | ||
# Can't directly create a ndarray[int] out_local, | ||
# since most np.array constructors expect a long dtype | ||
# while _string_to_dts expects purely int | ||
# maybe something I am missing? | ||
ndarray[int64_t] out_local_values | ||
ndarray[int64_t] out_tzoffset_vals | ||
|
||
# specify error conditions | ||
assert is_raise or is_ignore or is_coerce | ||
|
||
try: | ||
out_local_values = np.empty(n, dtype=np.int64) | ||
out_tzoffset_vals = np.empty(n, dtype=np.int64) | ||
result = np.empty(n, dtype='M8[ns]') | ||
iresult = result.view('i8') | ||
for i in range(n): | ||
|
@@ -576,7 +601,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
raise ValueError("time data {val} doesn't match " | ||
"format specified" | ||
.format(val=val)) | ||
return values | ||
return values, tz_out | ||
|
||
try: | ||
py_dt = parse_datetime_string(val, dayfirst=dayfirst, | ||
|
@@ -604,8 +629,11 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
else: | ||
# No error raised by string_to_dts, pick back up | ||
# where we left off | ||
out_tzoffset_vals[i] = out_tzoffset | ||
out_local_values[i] = out_local | ||
value = dtstruct_to_dt64(&dts) | ||
if out_local == 1: | ||
seen_datetime_offset = 1 | ||
tz = pytz.FixedOffset(out_tzoffset) | ||
value = tz_convert_single(value, tz, 'UTC') | ||
iresult[i] = value | ||
|
@@ -623,7 +651,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
raise ValueError("time data {val} doesn't " | ||
"match format specified" | ||
.format(val=val)) | ||
return values | ||
return values, tz_out | ||
raise | ||
|
||
else: | ||
|
@@ -649,7 +677,22 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
else: | ||
raise TypeError | ||
|
||
return result | ||
if seen_datetime_offset and not utc_convert: | ||
# GH 17697 | ||
# 1) If all the offsets are equal, return one pytz.FixedOffset for | ||
# the parsed dates to (maybe) pass to DatetimeIndex | ||
# 2) If the offsets are different, then force the parsing down the | ||
# object path where an array of datetimes | ||
# (with individual datutil.tzoffsets) are returned | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo datutil |
||
|
||
# Faster to compare integers than to compare objects | ||
is_same_offsets = (out_tzoffset_vals[0] == out_tzoffset_vals).all() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There may be a perf tradeoff here, specifically in the case where we have all-strings, all of which are ISO, but that don't have matching timezones. Going through the The various paths (including |
||
if not is_same_offsets: | ||
raise TypeError | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This (pre-existing) pattern is pretty opaque to a first-time reader. What if instead of raising |
||
else: | ||
tz_out = pytz.FixedOffset(out_tzoffset_vals[0]) | ||
|
||
return result, tz_out | ||
except OutOfBoundsDatetime: | ||
if is_raise: | ||
raise | ||
|
@@ -671,7 +714,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
oresult[i] = val.item() | ||
else: | ||
oresult[i] = val | ||
return oresult | ||
return oresult, tz_out | ||
except TypeError: | ||
oresult = np.empty(n, dtype=object) | ||
|
||
|
@@ -693,14 +736,13 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
except Exception: | ||
if is_raise: | ||
raise | ||
return values | ||
# oresult[i] = val | ||
return values, tz_out | ||
else: | ||
if is_raise: | ||
raise | ||
return values | ||
return values, tz_out | ||
|
||
return oresult | ||
return oresult, tz_out | ||
|
||
|
||
cdef inline bint _parse_today_now(str val, int64_t* iresult): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,8 @@ | |
is_float, | ||
is_list_like, | ||
is_scalar, | ||
is_numeric_dtype) | ||
is_numeric_dtype, | ||
is_object_dtype) | ||
from pandas.core.dtypes.generic import ( | ||
ABCIndexClass, ABCSeries, | ||
ABCDataFrame) | ||
|
@@ -266,17 +267,24 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, | |
result = arg | ||
|
||
if result is None and (format is None or infer_datetime_format): | ||
result = tslib.array_to_datetime( | ||
result, tz_parsed = tslib.array_to_datetime( | ||
arg, | ||
errors=errors, | ||
utc=tz == 'utc', | ||
dayfirst=dayfirst, | ||
yearfirst=yearfirst, | ||
require_iso8601=require_iso8601 | ||
) | ||
if tz_parsed is not None and box: | ||
return DatetimeIndex._simple_new(result, name=name, | ||
tz=tz_parsed) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. case with multiple tzs that has to get wrapped in object-dtype? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That case will result in |
||
|
||
if is_datetime64_dtype(result) and box: | ||
result = DatetimeIndex(result, tz=tz, name=name) | ||
if box: | ||
if is_datetime64_dtype(result): | ||
return DatetimeIndex(result, tz=tz, name=name) | ||
elif is_object_dtype(result): | ||
from pandas import Index | ||
return Index(result, name=name) | ||
return result | ||
|
||
except ValueError as e: | ||
|
@@ -404,7 +412,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, | |
datetime.datetime objects as well). | ||
box : boolean, default True | ||
|
||
- If True returns a DatetimeIndex | ||
- If True returns a DatetimeIndex or Index | ||
- If False returns ndarray of values. | ||
format : string, default None | ||
strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse | ||
|
@@ -696,7 +704,7 @@ def calc(carg): | |
parsed = parsing.try_parse_year_month_day(carg / 10000, | ||
carg / 100 % 100, | ||
carg % 100) | ||
return tslib.array_to_datetime(parsed, errors=errors) | ||
return tslib.array_to_datetime(parsed, errors=errors)[0] | ||
|
||
def calc_with_mask(carg, mask): | ||
result = np.empty(carg.shape, dtype='M8[ns]') | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In principle other tzinfos could be returned, specifically if it falls back to dateutil
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is specifying that
array_to_datetime
function itself can return apytz.FixedOffset
orNone
from the C parser output. If it goes through the dateutil parser in the non-ValueError
branch, I don't think there's a way to recover the timezone?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When
parse_datetime_string
is called if there's a timezone it should return a tz-awaredatetime
object, so the tzinfo can be pulled off that can't it?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh right that's true, good catch. Sure I will try to add some tests and functionality to hit the dateutil parser before the object branch.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So I am using a
set
to store the parsed timezone offsets (which should be more performant that using an array in theory / I was having some trouble using an array due to duplicates), howeverdateutil.tz.tzoffset
s cannot be hashed: dateutil/dateutil#792So instead, I am saving the
total_seconds()
of the dateutil tzoffset in the set instead and reconstructing the offsets aspytz.FixedOffset
sThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you add Parameters here