Skip to content

Commit 502919e

Browse files
authored
BUG refactor datetime parsing and fix 8 bugs (#50242)
1 parent 0720b03 commit 502919e

File tree

11 files changed

+258
-177
lines changed

11 files changed

+258
-177
lines changed

doc/source/whatsnew/v2.0.0.rst

+8
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,7 @@ Performance improvements
781781
- Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`)
782782
- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
783783
- Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`)
784+
- Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`)
784785

785786
.. ---------------------------------------------------------------------------
786787
.. _whatsnew_200.bug_fixes:
@@ -810,6 +811,13 @@ Datetimelike
810811
- Bug in :meth:`Timestamp.round` when the ``freq`` argument has zero-duration (e.g. "0ns") returning incorrect results instead of raising (:issue:`49737`)
811812
- Bug in :func:`to_datetime` was not raising ``ValueError`` when invalid format was passed and ``errors`` was ``'ignore'`` or ``'coerce'`` (:issue:`50266`)
812813
- Bug in :class:`DateOffset` was throwing ``TypeError`` when constructing with milliseconds and another super-daily argument (:issue:`49897`)
814+
- Bug in :func:`to_datetime` was not raising ``ValueError`` when parsing a string containing a decimal date with format ``'%Y%m%d'`` (:issue:`50051`)
815+
- Bug in :func:`to_datetime` was not converting ``None`` to ``NaT`` when parsing mixed-offset date strings with ISO8601 format (:issue:`50071`)
816+
- Bug in :func:`to_datetime` was not returning the input when parsing an out-of-bounds date string with ``errors='ignore'`` and ``format='%Y%m%d'`` (:issue:`14487`)
817+
- Bug in :func:`to_datetime` was converting timezone-naive ``datetime.datetime`` to timezone-aware when parsing with timezone-aware strings, ISO8601 format, and ``utc=False`` (:issue:`50254`)
818+
- Bug in :func:`to_datetime` was throwing ``ValueError`` when parsing dates with ISO8601 format where some values were not zero-padded (:issue:`21422`)
819+
- Bug in :func:`to_datetime` was giving incorrect results when using ``format='%Y%m%d'`` and ``errors='ignore'`` (:issue:`26493`)
820+
- Bug in :func:`to_datetime` was failing to parse date strings ``'today'`` and ``'now'`` if ``format`` was not ISO8601 (:issue:`50359`)
813821
-
814822

815823
Timedelta

pandas/_libs/tslib.pyi

-3
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,6 @@ def array_to_datetime(
2323
dayfirst: bool = ...,
2424
yearfirst: bool = ...,
2525
utc: bool = ...,
26-
require_iso8601: bool = ...,
27-
format: str | None = ...,
28-
exact: bool = ...,
2926
) -> tuple[np.ndarray, tzinfo | None]: ...
3027

3128
# returned ndarray may be object dtype or datetime64[ns]

pandas/_libs/tslib.pyx

+3-61
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ from pandas._libs.tslibs.np_datetime cimport (
3838
pydatetime_to_dt64,
3939
string_to_dts,
4040
)
41+
from pandas._libs.tslibs.strptime cimport parse_today_now
4142
from pandas._libs.util cimport (
4243
is_datetime64_object,
4344
is_float_object,
@@ -443,9 +444,6 @@ cpdef array_to_datetime(
443444
bint dayfirst=False,
444445
bint yearfirst=False,
445446
bint utc=False,
446-
bint require_iso8601=False,
447-
format: str | None=None,
448-
bint exact=True,
449447
):
450448
"""
451449
Converts a 1D array of date-like values to a numpy array of either:
@@ -472,8 +470,6 @@ cpdef array_to_datetime(
472470
yearfirst parsing behavior when encountering datetime strings
473471
utc : bool, default False
474472
indicator whether the dates should be UTC
475-
require_iso8601 : bool, default False
476-
indicator whether the datetime string should be iso8601
477473
478474
Returns
479475
-------
@@ -539,16 +535,6 @@ cpdef array_to_datetime(
539535
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
540536

541537
elif is_integer_object(val) or is_float_object(val):
542-
if require_iso8601:
543-
if is_coerce:
544-
iresult[i] = NPY_NAT
545-
continue
546-
elif is_raise:
547-
raise ValueError(
548-
f"time data \"{val}\" doesn't "
549-
f"match format \"{format}\", at position {i}"
550-
)
551-
return values, tz_out
552538
# these must be ns unit by-definition
553539

554540
if val != val or val == NPY_NAT:
@@ -578,25 +564,13 @@ cpdef array_to_datetime(
578564

579565
string_to_dts_failed = string_to_dts(
580566
val, &dts, &out_bestunit, &out_local,
581-
&out_tzoffset, False, format, exact
567+
&out_tzoffset, False, None, False
582568
)
583569
if string_to_dts_failed:
584570
# An error at this point is a _parsing_ error
585571
# specifically _not_ OutOfBoundsDatetime
586-
if _parse_today_now(val, &iresult[i], utc):
572+
if parse_today_now(val, &iresult[i], utc):
587573
continue
588-
elif require_iso8601:
589-
# if requiring iso8601 strings, skip trying
590-
# other formats
591-
if is_coerce:
592-
iresult[i] = NPY_NAT
593-
continue
594-
elif is_raise:
595-
raise ValueError(
596-
f"time data \"{val}\" doesn't "
597-
f"match format \"{format}\", at position {i}"
598-
)
599-
return values, tz_out
600574

601575
try:
602576
py_dt = parse_datetime_string(val,
@@ -659,18 +633,6 @@ cpdef array_to_datetime(
659633
if is_coerce:
660634
iresult[i] = NPY_NAT
661635
continue
662-
elif require_iso8601 and isinstance(val, str):
663-
# GH#19382 for just-barely-OutOfBounds falling back to
664-
# dateutil parser will return incorrect result because
665-
# it will ignore nanoseconds
666-
if is_raise:
667-
668-
# Still raise OutOfBoundsDatetime,
669-
# as error message is informative.
670-
raise
671-
672-
assert is_ignore
673-
return values, tz_out
674636
raise
675637

676638
except OutOfBoundsDatetime:
@@ -818,26 +780,6 @@ cdef _array_to_datetime_object(
818780
return oresult, None
819781

820782

821-
cdef bint _parse_today_now(str val, int64_t* iresult, bint utc):
822-
# We delay this check for as long as possible
823-
# because it catches relatively rare cases
824-
825-
# Multiply by 1000 to convert to nanos, since these methods naturally have
826-
# microsecond resolution
827-
if val == "now":
828-
if utc:
829-
iresult[0] = Timestamp.utcnow().value * 1000
830-
else:
831-
# GH#18705 make sure to_datetime("now") matches Timestamp("now")
832-
# Note using Timestamp.now() is faster than Timestamp("now")
833-
iresult[0] = Timestamp.now().value * 1000
834-
return True
835-
elif val == "today":
836-
iresult[0] = Timestamp.today().value * 1000
837-
return True
838-
return False
839-
840-
841783
def array_to_datetime_with_tz(ndarray values, tzinfo tz):
842784
"""
843785
Vectorized analogue to pd.Timestamp(value, tz=tz)

pandas/_libs/tslibs/parsing.pyi

-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ def try_parse_datetime_components(
4040
minutes: npt.NDArray[np.object_], # object[:]
4141
seconds: npt.NDArray[np.object_], # object[:]
4242
) -> npt.NDArray[np.object_]: ...
43-
def format_is_iso(f: str) -> bool: ...
4443
def guess_datetime_format(
4544
dt_str,
4645
dayfirst: bool | None = ...,

pandas/_libs/tslibs/parsing.pyx

-20
Original file line numberDiff line numberDiff line change
@@ -818,26 +818,6 @@ class _timelex:
818818
_DATEUTIL_LEXER_SPLIT = _timelex.split
819819

820820

821-
def format_is_iso(f: str) -> bint:
822-
"""
823-
Does format match the iso8601 set that can be handled by the C parser?
824-
Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
825-
but must be consistent. Leading 0s in dates and times are optional.
826-
"""
827-
iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format
828-
excluded_formats = ["%Y%m%d", "%Y%m", "%Y"]
829-
830-
for date_sep in [" ", "/", "\\", "-", ".", ""]:
831-
for time_sep in [" ", "T"]:
832-
for micro_or_tz in ["", "%z", ".%f", ".%f%z"]:
833-
if (iso_template(date_sep=date_sep,
834-
time_sep=time_sep,
835-
micro_or_tz=micro_or_tz,
836-
).startswith(f) and f not in excluded_formats):
837-
return True
838-
return False
839-
840-
841821
def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
842822
"""
843823
Guess the datetime format of a given datetime string.

pandas/_libs/tslibs/strptime.pxd

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from numpy cimport int64_t
2+
3+
4+
cdef bint parse_today_now(str val, int64_t* iresult, bint utc)

pandas/_libs/tslibs/strptime.pyx

+74
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,14 @@ from pandas._libs.tslibs.nattype cimport (
3434
c_nat_strings as nat_strings,
3535
)
3636
from pandas._libs.tslibs.np_datetime cimport (
37+
NPY_DATETIMEUNIT,
3738
NPY_FR_ns,
3839
check_dts_bounds,
3940
npy_datetimestruct,
4041
npy_datetimestruct_to_datetime,
4142
pydate_to_dt64,
4243
pydatetime_to_dt64,
44+
string_to_dts,
4345
)
4446
from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
4547
from pandas._libs.tslibs.timestamps cimport _Timestamp
@@ -48,9 +50,50 @@ from pandas._libs.util cimport (
4850
is_float_object,
4951
is_integer_object,
5052
)
53+
from pandas._libs.tslibs.timestamps import Timestamp
5154

5255
cnp.import_array()
5356

57+
cdef bint format_is_iso(f: str):
58+
"""
59+
Does format match the iso8601 set that can be handled by the C parser?
60+
Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
61+
but must be consistent. Leading 0s in dates and times are optional.
62+
"""
63+
excluded_formats = ["%Y%m"]
64+
65+
for date_sep in [" ", "/", "\\", "-", ".", ""]:
66+
for time_sep in [" ", "T"]:
67+
for micro_or_tz in ["", "%z", ".%f", ".%f%z"]:
68+
iso_fmt = f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}"
69+
if iso_fmt.startswith(f) and f not in excluded_formats:
70+
return True
71+
return False
72+
73+
74+
def _test_format_is_iso(f: str) -> bool:
75+
"""Only used in testing."""
76+
return format_is_iso(f)
77+
78+
79+
cdef bint parse_today_now(str val, int64_t* iresult, bint utc):
80+
# We delay this check for as long as possible
81+
# because it catches relatively rare cases
82+
83+
# Multiply by 1000 to convert to nanos, since these methods naturally have
84+
# microsecond resolution
85+
if val == "now":
86+
if utc:
87+
iresult[0] = Timestamp.utcnow().value * 1000
88+
else:
89+
# GH#18705 make sure to_datetime("now") matches Timestamp("now")
90+
# Note using Timestamp.now() is faster than Timestamp("now")
91+
iresult[0] = Timestamp.now().value * 1000
92+
return True
93+
elif val == "today":
94+
iresult[0] = Timestamp.today().value * 1000
95+
return True
96+
return False
5497

5598
cdef dict _parse_code_table = {"y": 0,
5699
"Y": 1,
@@ -111,6 +154,9 @@ def array_strptime(
111154
bint found_naive = False
112155
bint found_tz = False
113156
tzinfo tz_out = None
157+
bint iso_format = fmt is not None and format_is_iso(fmt)
158+
NPY_DATETIMEUNIT out_bestunit
159+
int out_local = 0, out_tzoffset = 0
114160

115161
assert is_raise or is_ignore or is_coerce
116162

@@ -232,6 +278,34 @@ def array_strptime(
232278
else:
233279
val = str(val)
234280

281+
if iso_format:
282+
string_to_dts_failed = string_to_dts(
283+
val, &dts, &out_bestunit, &out_local,
284+
&out_tzoffset, False, fmt, exact
285+
)
286+
if not string_to_dts_failed:
287+
# No error reported by string_to_dts, pick back up
288+
# where we left off
289+
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
290+
if out_local == 1:
291+
# out_tzoffset is used as an offset in minutes:
292+
# wrap it in a fixed-offset timezone and record
293+
# that timezone for this element in result_timezone
294+
tz = timezone(timedelta(minutes=out_tzoffset))
295+
result_timezone[i] = tz
296+
out_local = 0
297+
out_tzoffset = 0
298+
iresult[i] = value
299+
check_dts_bounds(&dts)
300+
continue
301+
302+
if parse_today_now(val, &iresult[i], utc):
303+
continue
304+
305+
# Some ISO formats can't be parsed by string_to_dts
306+
# For example, 6-digit YYYYMD. So, if there's an error,
307+
# try the string-matching code below.
308+
235309
# exact matching
236310
if exact:
237311
found = format_regex.match(val)

pandas/core/arrays/datetimes.py

-7
Original file line numberDiff line numberDiff line change
@@ -2118,10 +2118,7 @@ def objects_to_datetime64ns(
21182118
yearfirst,
21192119
utc: bool = False,
21202120
errors: DateTimeErrorChoices = "raise",
2121-
require_iso8601: bool = False,
21222121
allow_object: bool = False,
2123-
format: str | None = None,
2124-
exact: bool = True,
21252122
):
21262123
"""
21272124
Convert data to array of timestamps.
@@ -2134,7 +2131,6 @@ def objects_to_datetime64ns(
21342131
utc : bool, default False
21352132
Whether to convert/localize timestamps to UTC.
21362133
errors : {'raise', 'ignore', 'coerce'}
2137-
require_iso8601 : bool, default False
21382134
allow_object : bool
21392135
Whether to return an object-dtype ndarray instead of raising if the
21402136
data contains more than one timezone.
@@ -2165,9 +2161,6 @@ def objects_to_datetime64ns(
21652161
utc=utc,
21662162
dayfirst=dayfirst,
21672163
yearfirst=yearfirst,
2168-
require_iso8601=require_iso8601,
2169-
format=format,
2170-
exact=exact,
21712164
)
21722165
result = result.reshape(data.shape, order=order)
21732166
except OverflowError as err:

0 commit comments

Comments
 (0)