Skip to content

Commit a28cadb

Browse files
authored
BUG: inconsistent handling of exact=False case in to_datetime parsing (#50435)
* fixup
* use enum
* more descriptive names
* renaming fixup
* cast
* clean up
* doc
* correct syntax
* use typedef
* check for negative characters remaining
* reduce diff

Co-authored-by: MarcoGorelli <>
1 parent d859ecc commit a28cadb

File tree

6 files changed

+170
-44
lines changed

6 files changed

+170
-44
lines changed

.pre-commit-config.yaml

+2-1
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,8 @@ repos:
6363
'--extensions=c,h',
6464
'--headers=h',
6565
--recursive,
66-
'--filter=-readability/casting,-runtime/int,-build/include_subdir'
66+
--linelength=88,
67+
'--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size'
6768
]
6869
- repo: https://github.com/PyCQA/flake8
6970
rev: 6.0.0

pandas/_libs/tslibs/np_datetime.pxd

+6
Original file line number | Diff line number | Diff line change
@@ -120,3 +120,9 @@ cdef int64_t convert_reso(
120120
NPY_DATETIMEUNIT to_reso,
121121
bint round_ok,
122122
) except? -1
123+
124+
cdef extern from "src/datetime/np_datetime_strings.h":
125+
ctypedef enum FormatRequirement:
126+
PARTIAL_MATCH
127+
EXACT_MATCH
128+
INFER_FORMAT

pandas/_libs/tslibs/np_datetime.pyx

+7-3
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,8 @@ cdef extern from "src/datetime/np_datetime_strings.h":
5353
npy_datetimestruct *out,
5454
NPY_DATETIMEUNIT *out_bestunit,
5555
int *out_local, int *out_tzoffset,
56-
const char *format, int format_len, int exact)
56+
const char *format, int format_len,
57+
FormatRequirement exact)
5758

5859

5960
# ----------------------------------------------------------------------
@@ -286,17 +287,20 @@ cdef int string_to_dts(
286287
const char* buf
287288
Py_ssize_t format_length
288289
const char* format_buf
290+
FormatRequirement format_requirement
289291

290292
buf = get_c_string_buf_and_size(val, &length)
291293
if format is None:
292294
format_buf = b""
293295
format_length = 0
294-
exact = False
296+
format_requirement = INFER_FORMAT
295297
else:
296298
format_buf = get_c_string_buf_and_size(format, &format_length)
299+
format_requirement = <FormatRequirement>exact
297300
return parse_iso_8601_datetime(buf, length, want_exc,
298301
dts, out_bestunit, out_local, out_tzoffset,
299-
format_buf, format_length, exact)
302+
format_buf, format_length,
303+
format_requirement)
300304

301305

302306
cpdef ndarray astype_overflowsafe(

pandas/_libs/tslibs/src/datetime/np_datetime_strings.c

+108-39
Original file line number | Diff line number | Diff line change
@@ -67,49 +67,62 @@ This file implements string parsing and creation for NumPy datetime.
6767
* Returns 0 on success, -1 on failure.
6868
*/
6969

70+
typedef enum {
71+
COMPARISON_SUCCESS,
72+
COMPLETED_PARTIAL_MATCH,
73+
COMPARISON_ERROR
74+
} DatetimePartParseResult;
7075
// This function will advance the pointer on format
7176
// and decrement characters_remaining by n on success
72-
// On failure will return -1 without incrementing
73-
static int compare_format(const char **format, int *characters_remaining,
74-
const char *compare_to, int n, const int exact) {
77+
// On failure will return COMPARISON_ERROR without incrementing
78+
// If `format_requirement` is PARTIAL_MATCH, and the `format` string has
79+
// been exhausted, then return COMPLETED_PARTIAL_MATCH.
80+
static DatetimePartParseResult compare_format(
81+
const char **format,
82+
int *characters_remaining,
83+
const char *compare_to,
84+
int n,
85+
const FormatRequirement format_requirement
86+
) {
87+
if (format_requirement == INFER_FORMAT) {
88+
return COMPARISON_SUCCESS;
89+
}
90+
if (*characters_remaining < 0) {
91+
return COMPARISON_ERROR;
92+
}
93+
if (format_requirement == PARTIAL_MATCH && *characters_remaining == 0) {
94+
return COMPLETED_PARTIAL_MATCH;
95+
}
7596
if (*characters_remaining < n) {
76-
if (exact) {
77-
// TODO(pandas-dev): in the future we should set a PyErr here
78-
// to be very clear about what went wrong
79-
return -1;
80-
} else if (*characters_remaining) {
81-
// TODO(pandas-dev): same return value in this function as
82-
// above branch, but stub out a future where
83-
// we have a better error message
84-
return -1;
85-
} else {
86-
return 0;
87-
}
97+
// TODO(pandas-dev): PyErr to differentiate what went wrong
98+
return COMPARISON_ERROR;
8899
} else {
89100
if (strncmp(*format, compare_to, n)) {
90101
// TODO(pandas-dev): PyErr to differentiate what went wrong
91-
return -1;
102+
return COMPARISON_ERROR;
92103
} else {
93104
*format += n;
94105
*characters_remaining -= n;
95-
return 0;
106+
return COMPARISON_SUCCESS;
96107
}
97108
}
98-
return 0;
109+
return COMPARISON_SUCCESS;
99110
}
100111

101112
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
102113
npy_datetimestruct *out,
103114
NPY_DATETIMEUNIT *out_bestunit,
104115
int *out_local, int *out_tzoffset,
105-
const char* format, int format_len, int exact) {
116+
const char* format, int format_len,
117+
FormatRequirement format_requirement) {
106118
if (len < 0 || format_len < 0)
107119
goto parse_error;
108120
int year_leap = 0;
109121
int i, numdigits;
110122
const char *substr;
111123
int sublen;
112124
NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC;
125+
DatetimePartParseResult comparison;
113126

114127
/* If year-month-day are separated by a valid separator,
115128
* months/days without leading zeroes will be parsed
@@ -139,8 +152,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
139152
while (sublen > 0 && isspace(*substr)) {
140153
++substr;
141154
--sublen;
142-
if (compare_format(&format, &format_len, " ", 1, exact)) {
155+
comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
156+
if (comparison == COMPARISON_ERROR) {
143157
goto parse_error;
158+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
159+
goto finish;
144160
}
145161
}
146162

@@ -155,8 +171,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
155171
}
156172

157173
/* PARSE THE YEAR (4 digits) */
158-
if (compare_format(&format, &format_len, "%Y", 2, exact)) {
174+
comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement);
175+
if (comparison == COMPARISON_ERROR) {
159176
goto parse_error;
177+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
178+
goto finish;
160179
}
161180

162181
out->year = 0;
@@ -202,8 +221,12 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
202221
++substr;
203222
--sublen;
204223

205-
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) {
224+
comparison = compare_format(&format, &format_len, &ymd_sep, 1,
225+
format_requirement);
226+
if (comparison == COMPARISON_ERROR) {
206227
goto parse_error;
228+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
229+
goto finish;
207230
}
208231
/* Cannot have trailing separator */
209232
if (sublen == 0 || !isdigit(*substr)) {
@@ -212,8 +235,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
212235
}
213236

214237
/* PARSE THE MONTH */
215-
if (compare_format(&format, &format_len, "%m", 2, exact)) {
238+
comparison = compare_format(&format, &format_len, "%m", 2, format_requirement);
239+
if (comparison == COMPARISON_ERROR) {
216240
goto parse_error;
241+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
242+
goto finish;
217243
}
218244
/* First digit required */
219245
out->month = (*substr - '0');
@@ -258,14 +284,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
258284
}
259285
++substr;
260286
--sublen;
261-
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) {
287+
comparison = compare_format(&format, &format_len, &ymd_sep, 1,
288+
format_requirement);
289+
if (comparison == COMPARISON_ERROR) {
262290
goto parse_error;
291+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
292+
goto finish;
263293
}
264294
}
265295

266296
/* PARSE THE DAY */
267-
if (compare_format(&format, &format_len, "%d", 2, exact)) {
297+
comparison = compare_format(&format, &format_len, "%d", 2, format_requirement);
298+
if (comparison == COMPARISON_ERROR) {
268299
goto parse_error;
300+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
301+
goto finish;
269302
}
270303
/* First digit required */
271304
if (!isdigit(*substr)) {
@@ -306,15 +339,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
306339
if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
307340
goto parse_error;
308341
}
309-
if (compare_format(&format, &format_len, substr, 1, exact)) {
310-
goto parse_error;
311-
}
342+
comparison = compare_format(&format, &format_len, substr, 1, format_requirement);
343+
if (comparison == COMPARISON_ERROR) {
344+
goto parse_error;
345+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
346+
goto finish;
347+
}
312348
++substr;
313349
--sublen;
314350

315351
/* PARSE THE HOURS */
316-
if (compare_format(&format, &format_len, "%H", 2, exact)) {
352+
comparison = compare_format(&format, &format_len, "%H", 2, format_requirement);
353+
if (comparison == COMPARISON_ERROR) {
317354
goto parse_error;
355+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
356+
goto finish;
318357
}
319358
/* First digit required */
320359
if (!isdigit(*substr)) {
@@ -359,8 +398,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
359398
if (sublen == 0 || !isdigit(*substr)) {
360399
goto parse_error;
361400
}
362-
if (compare_format(&format, &format_len, ":", 1, exact)) {
401+
comparison = compare_format(&format, &format_len, ":", 1, format_requirement);
402+
if (comparison == COMPARISON_ERROR) {
363403
goto parse_error;
404+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
405+
goto finish;
364406
}
365407
} else if (!isdigit(*substr)) {
366408
if (!hour_was_2_digits) {
@@ -370,8 +412,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
370412
}
371413

372414
/* PARSE THE MINUTES */
373-
if (compare_format(&format, &format_len, "%M", 2, exact)) {
415+
comparison = compare_format(&format, &format_len, "%M", 2, format_requirement);
416+
if (comparison == COMPARISON_ERROR) {
374417
goto parse_error;
418+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
419+
goto finish;
375420
}
376421
/* First digit required */
377422
out->min = (*substr - '0');
@@ -405,8 +450,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
405450
/* If we make it through this condition block, then the next
406451
* character is a digit. */
407452
if (has_hms_sep && *substr == ':') {
408-
if (compare_format(&format, &format_len, ":", 1, exact)) {
453+
comparison = compare_format(&format, &format_len, ":", 1, format_requirement);
454+
if (comparison == COMPARISON_ERROR) {
409455
goto parse_error;
456+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
457+
goto finish;
410458
}
411459
++substr;
412460
--sublen;
@@ -420,8 +468,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
420468
}
421469

422470
/* PARSE THE SECONDS */
423-
if (compare_format(&format, &format_len, "%S", 2, exact)) {
471+
comparison = compare_format(&format, &format_len, "%S", 2, format_requirement);
472+
if (comparison == COMPARISON_ERROR) {
424473
goto parse_error;
474+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
475+
goto finish;
425476
}
426477
/* First digit required */
427478
out->sec = (*substr - '0');
@@ -448,17 +499,23 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
448499
if (sublen > 0 && *substr == '.') {
449500
++substr;
450501
--sublen;
451-
if (compare_format(&format, &format_len, ".", 1, exact)) {
502+
comparison = compare_format(&format, &format_len, ".", 1, format_requirement);
503+
if (comparison == COMPARISON_ERROR) {
452504
goto parse_error;
505+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
506+
goto finish;
453507
}
454508
} else {
455509
bestunit = NPY_FR_s;
456510
goto parse_timezone;
457511
}
458512

459513
/* PARSE THE MICROSECONDS (0 to 6 digits) */
460-
if (compare_format(&format, &format_len, "%f", 2, exact)) {
514+
comparison = compare_format(&format, &format_len, "%f", 2, format_requirement);
515+
if (comparison == COMPARISON_ERROR) {
461516
goto parse_error;
517+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
518+
goto finish;
462519
}
463520
numdigits = 0;
464521
for (i = 0; i < 6; ++i) {
@@ -524,8 +581,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
524581
while (sublen > 0 && isspace(*substr)) {
525582
++substr;
526583
--sublen;
527-
if (compare_format(&format, &format_len, " ", 1, exact)) {
584+
comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
585+
if (comparison == COMPARISON_ERROR) {
528586
goto parse_error;
587+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
588+
goto finish;
529589
}
530590
}
531591

@@ -539,8 +599,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
539599

540600
/* UTC specifier */
541601
if (*substr == 'Z') {
542-
if (compare_format(&format, &format_len, "%z", 2, exact)) {
602+
comparison = compare_format(&format, &format_len, "%z", 2, format_requirement);
603+
if (comparison == COMPARISON_ERROR) {
543604
goto parse_error;
605+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
606+
goto finish;
544607
}
545608
/* "Z" should be equivalent to tz offset "+00:00" */
546609
if (out_local != NULL) {
@@ -561,8 +624,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
561624
--sublen;
562625
}
563626
} else if (*substr == '-' || *substr == '+') {
564-
if (compare_format(&format, &format_len, "%z", 2, exact)) {
627+
comparison = compare_format(&format, &format_len, "%z", 2, format_requirement);
628+
if (comparison == COMPARISON_ERROR) {
565629
goto parse_error;
630+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
631+
goto finish;
566632
}
567633
/* Time zone offset */
568634
int offset_neg = 0, offset_hour = 0, offset_minute = 0;
@@ -647,8 +713,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
647713
while (sublen > 0 && isspace(*substr)) {
648714
++substr;
649715
--sublen;
650-
if (compare_format(&format, &format_len, " ", 1, exact)) {
716+
comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
717+
if (comparison == COMPARISON_ERROR) {
651718
goto parse_error;
719+
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
720+
goto finish;
652721
}
653722
}
654723

pandas/_libs/tslibs/src/datetime/np_datetime_strings.h

+16-1
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,21 @@ This file implements string parsing and creation for NumPy datetime.
2626
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
2727
#endif // NPY_NO_DEPRECATED_API
2828

29+
/* 'format_requirement' can be one of three values:
30+
* * PARTIAL_MATCH : Only require a partial match with 'format'.
31+
* For example, if the string is '2020-01-01 05:00:00' and
32+
* 'format' is '%Y-%m-%d', then parse '2020-01-01';
33+
* * EXACT_MATCH : require an exact match with 'format'. If the
34+
* string is '2020-01-01', then the only format which will
35+
* be able to parse it without error is '%Y-%m-%d';
36+
* * INFER_FORMAT: parse without comparing 'format' (i.e. infer it).
37+
*/
38+
typedef enum {
39+
PARTIAL_MATCH,
40+
EXACT_MATCH,
41+
INFER_FORMAT
42+
} FormatRequirement;
43+
2944
/*
3045
* Parses (almost) standard ISO 8601 date strings. The differences are:
3146
*
@@ -61,7 +76,7 @@ parse_iso_8601_datetime(const char *str, int len, int want_exc,
6176
int *out_tzoffset,
6277
const char* format,
6378
int format_len,
64-
int exact);
79+
FormatRequirement format_requirement);
6580

6681
/*
6782
* Provides a string length to use for converting datetime

0 commit comments

Comments
 (0)