Skip to content

Commit d2630d8

Browse files
authored
BUG: pandas.to_datetime() does not respect exact format string with ISO8601 (#49333)
* initial format support
  Co-Authored-By: MarcoGorelli <>
  Co-Authored-By: FDRocha <>
* set exact=False default in objects_to_datetime
* 🏷️ typing
* simplify
* replace macro with function
* clean up
* 📝 restore docstring
* inline
* set format default to None
* clean up
* remove function, perform check inline
* only compare *format++ if format_len
* clean up
* typing
* split out branches
* use compare_format function
* remove tmp variable
* Add co-authors

Co-authored-by: fdrocha <>
Co-authored-by: nikitaved <>
Co-authored-by: Marco Gorelli <>
1 parent 4a5d77f commit d2630d8

File tree

10 files changed

+283
-11
lines changed

10 files changed

+283
-11
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,7 @@ Conversion
644644
- Bug in :meth:`Series.convert_dtypes` not converting dtype to nullable dtype when :class:`Series` contains ``NA`` and has dtype ``object`` (:issue:`48791`)
645645
- Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`)
646646
- Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`)
647+
- Bug in :func:`to_datetime` where the ``exact`` argument was not respected when ``format`` was an ISO8601 format (:issue:`12649`)
647648

648649
Strings
649650
^^^^^^^

pandas/_libs/tslib.pyi

+2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ def array_to_datetime(
2424
yearfirst: bool = ...,
2525
utc: bool = ...,
2626
require_iso8601: bool = ...,
27+
format: str | None = ...,
28+
exact: bool = ...,
2729
) -> tuple[np.ndarray, tzinfo | None]: ...
2830

2931
# returned ndarray may be object dtype or datetime64[ns]

pandas/_libs/tslib.pyx

+14-2
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,8 @@ cpdef array_to_datetime(
446446
bint yearfirst=False,
447447
bint utc=False,
448448
bint require_iso8601=False,
449+
format: str | None=None,
450+
bint exact=True,
449451
):
450452
"""
451453
Converts a 1D array of date-like values to a numpy array of either:
@@ -563,6 +565,16 @@ cpdef array_to_datetime(
563565
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
564566

565567
elif is_integer_object(val) or is_float_object(val):
568+
if require_iso8601:
569+
if is_coerce:
570+
iresult[i] = NPY_NAT
571+
continue
572+
elif is_raise:
573+
raise ValueError(
574+
f"time data \"{val}\" at position {i} doesn't "
575+
f"match format \"{format}\""
576+
)
577+
return values, tz_out
566578
# these must be ns unit by-definition
567579
seen_integer = True
568580

@@ -593,7 +605,7 @@ cpdef array_to_datetime(
593605

594606
string_to_dts_failed = string_to_dts(
595607
val, &dts, &out_bestunit, &out_local,
596-
&out_tzoffset, False
608+
&out_tzoffset, False, format, exact
597609
)
598610
if string_to_dts_failed:
599611
# An error at this point is a _parsing_ error
@@ -609,7 +621,7 @@ cpdef array_to_datetime(
609621
elif is_raise:
610622
raise ValueError(
611623
f"time data \"{val}\" at position {i} doesn't "
612-
"match format specified"
624+
f"match format \"{format}\""
613625
)
614626
return values, tz_out
615627

pandas/_libs/tslibs/np_datetime.pxd

+2
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ cdef int string_to_dts(
9595
int* out_local,
9696
int* out_tzoffset,
9797
bint want_exc,
98+
format: str | None = *,
99+
bint exact = *
98100
) except? -1
99101

100102
cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype)

pandas/_libs/tslibs/np_datetime.pyx

+14-2
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ cdef extern from "src/datetime/np_datetime_strings.h":
5252
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
5353
npy_datetimestruct *out,
5454
NPY_DATETIMEUNIT *out_bestunit,
55-
int *out_local, int *out_tzoffset)
55+
int *out_local, int *out_tzoffset,
56+
const char *format, int format_len, int exact)
5657

5758

5859
# ----------------------------------------------------------------------
@@ -277,14 +278,25 @@ cdef inline int string_to_dts(
277278
int* out_local,
278279
int* out_tzoffset,
279280
bint want_exc,
281+
format: str | None=None,
282+
bint exact=True,
280283
) except? -1:
281284
cdef:
282285
Py_ssize_t length
283286
const char* buf
287+
Py_ssize_t format_length
288+
const char* format_buf
284289

285290
buf = get_c_string_buf_and_size(val, &length)
291+
if format is None:
292+
format_buf = b''
293+
format_length = 0
294+
exact = False
295+
else:
296+
format_buf = get_c_string_buf_and_size(format, &format_length)
286297
return parse_iso_8601_datetime(buf, length, want_exc,
287-
dts, out_bestunit, out_local, out_tzoffset)
298+
dts, out_bestunit, out_local, out_tzoffset,
299+
format_buf, format_length, exact)
288300

289301

290302
cpdef ndarray astype_overflowsafe(

pandas/_libs/tslibs/src/datetime/np_datetime_strings.c

+114-2
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,45 @@ This file implements string parsing and creation for NumPy datetime.
6666
*
6767
* Returns 0 on success, -1 on failure.
6868
*/
69+
70+
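// On success (returns 0), advances *format by n and decrements *characters_remaining by n.
71+
// Also returns 0, without advancing, when exact is false and the format is already exhausted.
72+
// On failure returns -1 and leaves both *format and *characters_remaining unchanged.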
73+
static int compare_format(const char **format, int *characters_remaining,
74+
const char *compare_to, int n, const int exact) {
75+
if (*characters_remaining < n) {
76+
if (exact) {
77+
// TODO(pandas-dev): in the future we should set a PyErr here
78+
// to be very clear about what went wrong
79+
return -1;
80+
} else if (*characters_remaining) {
81+
// TODO(pandas-dev): same return value in this function as
82+
// above branch, but stub out a future where
83+
// we have a better error message
84+
return -1;
85+
} else {
86+
return 0;
87+
}
88+
} else {
89+
if (strncmp(*format, compare_to, n)) {
90+
// TODO(pandas-dev): PyErr to differentiate what went wrong
91+
return -1;
92+
} else {
93+
*format += n;
94+
*characters_remaining -= n;
95+
return 0;
96+
}
97+
}
98+
return 0;
99+
}
100+
69101
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
70102
npy_datetimestruct *out,
71103
NPY_DATETIMEUNIT *out_bestunit,
72-
int *out_local, int *out_tzoffset) {
104+
int *out_local, int *out_tzoffset,
105+
const char* format, int format_len, int exact) {
106+
if (len < 0 || format_len < 0)
107+
goto parse_error;
73108
int year_leap = 0;
74109
int i, numdigits;
75110
const char *substr;
@@ -104,6 +139,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
104139
while (sublen > 0 && isspace(*substr)) {
105140
++substr;
106141
--sublen;
142+
if (compare_format(&format, &format_len, " ", 1, exact)) {
143+
goto parse_error;
144+
}
107145
}
108146

109147
/* Leading '-' sign for negative year */
@@ -117,6 +155,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
117155
}
118156

119157
/* PARSE THE YEAR (4 digits) */
158+
if (compare_format(&format, &format_len, "%Y", 2, exact)) {
159+
goto parse_error;
160+
}
161+
120162
out->year = 0;
121163
if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) &&
122164
isdigit(substr[2]) && isdigit(substr[3])) {
@@ -139,6 +181,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
139181
if (out_local != NULL) {
140182
*out_local = 0;
141183
}
184+
if (format_len) {
185+
goto parse_error;
186+
}
142187
bestunit = NPY_FR_Y;
143188
goto finish;
144189
}
@@ -156,13 +201,20 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
156201
ymd_sep = valid_ymd_sep[i];
157202
++substr;
158203
--sublen;
204+
205+
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) {
206+
goto parse_error;
207+
}
159208
/* Cannot have trailing separator */
160209
if (sublen == 0 || !isdigit(*substr)) {
161210
goto parse_error;
162211
}
163212
}
164213

165214
/* PARSE THE MONTH */
215+
if (compare_format(&format, &format_len, "%m", 2, exact)) {
216+
goto parse_error;
217+
}
166218
/* First digit required */
167219
out->month = (*substr - '0');
168220
++substr;
@@ -190,6 +242,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
190242
if (!has_ymd_sep) {
191243
goto parse_error;
192244
}
245+
if (format_len) {
246+
goto parse_error;
247+
}
193248
if (out_local != NULL) {
194249
*out_local = 0;
195250
}
@@ -203,9 +258,15 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
203258
}
204259
++substr;
205260
--sublen;
261+
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) {
262+
goto parse_error;
263+
}
206264
}
207265

208266
/* PARSE THE DAY */
267+
if (compare_format(&format, &format_len, "%d", 2, exact)) {
268+
goto parse_error;
269+
}
209270
/* First digit required */
210271
if (!isdigit(*substr)) {
211272
goto parse_error;
@@ -235,17 +296,26 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
235296
if (out_local != NULL) {
236297
*out_local = 0;
237298
}
299+
if (format_len) {
300+
goto parse_error;
301+
}
238302
bestunit = NPY_FR_D;
239303
goto finish;
240304
}
241305

242306
if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
243307
goto parse_error;
244308
}
309+
if (compare_format(&format, &format_len, substr, 1, exact)) {
310+
goto parse_error;
311+
}
245312
++substr;
246313
--sublen;
247314

248315
/* PARSE THE HOURS */
316+
if (compare_format(&format, &format_len, "%H", 2, exact)) {
317+
goto parse_error;
318+
}
249319
/* First digit required */
250320
if (!isdigit(*substr)) {
251321
goto parse_error;
@@ -274,6 +344,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
274344
if (!hour_was_2_digits) {
275345
goto parse_error;
276346
}
347+
if (format_len) {
348+
goto parse_error;
349+
}
277350
bestunit = NPY_FR_h;
278351
goto finish;
279352
}
@@ -286,6 +359,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
286359
if (sublen == 0 || !isdigit(*substr)) {
287360
goto parse_error;
288361
}
362+
if (compare_format(&format, &format_len, ":", 1, exact)) {
363+
goto parse_error;
364+
}
289365
} else if (!isdigit(*substr)) {
290366
if (!hour_was_2_digits) {
291367
goto parse_error;
@@ -294,6 +370,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
294370
}
295371

296372
/* PARSE THE MINUTES */
373+
if (compare_format(&format, &format_len, "%M", 2, exact)) {
374+
goto parse_error;
375+
}
297376
/* First digit required */
298377
out->min = (*substr - '0');
299378
++substr;
@@ -317,12 +396,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
317396

318397
if (sublen == 0) {
319398
bestunit = NPY_FR_m;
399+
if (format_len) {
400+
goto parse_error;
401+
}
320402
goto finish;
321403
}
322404

323405
/* If we make it through this condition block, then the next
324406
* character is a digit. */
325407
if (has_hms_sep && *substr == ':') {
408+
if (compare_format(&format, &format_len, ":", 1, exact)) {
409+
goto parse_error;
410+
}
326411
++substr;
327412
--sublen;
328413
/* Cannot have a trailing ':' */
@@ -335,6 +420,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
335420
}
336421

337422
/* PARSE THE SECONDS */
423+
if (compare_format(&format, &format_len, "%S", 2, exact)) {
424+
goto parse_error;
425+
}
338426
/* First digit required */
339427
out->sec = (*substr - '0');
340428
++substr;
@@ -360,12 +448,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
360448
if (sublen > 0 && *substr == '.') {
361449
++substr;
362450
--sublen;
451+
if (compare_format(&format, &format_len, ".", 1, exact)) {
452+
goto parse_error;
453+
}
363454
} else {
364455
bestunit = NPY_FR_s;
365456
goto parse_timezone;
366457
}
367458

368459
/* PARSE THE MICROSECONDS (0 to 6 digits) */
460+
if (compare_format(&format, &format_len, "%f", 2, exact)) {
461+
goto parse_error;
462+
}
369463
numdigits = 0;
370464
for (i = 0; i < 6; ++i) {
371465
out->us *= 10;
@@ -430,15 +524,24 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
430524
while (sublen > 0 && isspace(*substr)) {
431525
++substr;
432526
--sublen;
527+
if (compare_format(&format, &format_len, " ", 1, exact)) {
528+
goto parse_error;
529+
}
433530
}
434531

435532
if (sublen == 0) {
436533
// Unlike NumPy, treating no time zone as naive
534+
if (format_len > 0) {
535+
goto parse_error;
536+
}
437537
goto finish;
438538
}
439539

440540
/* UTC specifier */
441541
if (*substr == 'Z') {
542+
if (compare_format(&format, &format_len, "%Z", 2, exact)) {
543+
goto parse_error;
544+
}
442545
/* "Z" should be equivalent to tz offset "+00:00" */
443546
if (out_local != NULL) {
444547
*out_local = 1;
@@ -449,12 +552,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
449552
}
450553

451554
if (sublen == 1) {
555+
if (format_len > 0) {
556+
goto parse_error;
557+
}
452558
goto finish;
453559
} else {
454560
++substr;
455561
--sublen;
456562
}
457563
} else if (*substr == '-' || *substr == '+') {
564+
if (compare_format(&format, &format_len, "%z", 2, exact)) {
565+
goto parse_error;
566+
}
458567
/* Time zone offset */
459568
int offset_neg = 0, offset_hour = 0, offset_minute = 0;
460569

@@ -538,9 +647,12 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
538647
while (sublen > 0 && isspace(*substr)) {
539648
++substr;
540649
--sublen;
650+
if (compare_format(&format, &format_len, " ", 1, exact)) {
651+
goto parse_error;
652+
}
541653
}
542654

543-
if (sublen != 0) {
655+
if ((sublen != 0) || (format_len != 0)) {
544656
goto parse_error;
545657
}
546658

0 commit comments

Comments
 (0)