-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: inconsistent handling of exact=False case in to_datetime parsing #50435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
3de2331
947353d
e3fe55b
9e18d33
efeaf7a
6c51924
b96158f
0ebff5c
caa9c90
84eeb3d
f92ff7a
5c67ed3
bad704e
ec6591b
8d8f90e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -67,49 +67,59 @@ This file implements string parsing and creation for NumPy datetime. | |
* Returns 0 on success, -1 on failure. | ||
*/ | ||
|
||
typedef enum { | ||
COMPARISON_SUCCESS, | ||
COMPLETED_PARTIAL_MATCH, | ||
COMPARISON_ERROR | ||
} DatetimePartParseResult; | ||
// This function will advance the pointer on format | ||
// and decrement characters_remaining by n on success | ||
// On failure will return -1 without incrementing | ||
static int compare_format(const char **format, int *characters_remaining, | ||
const char *compare_to, int n, const int exact) { | ||
// On failure will return COMPARISON_ERROR without incrementing | ||
// If `format_requirement` is PARTIAL_MATCH, and the `format` string has | ||
// been exhausted, then return COMPLETED_PARTIAL_MATCH. | ||
static DatetimePartParseResult compare_format( | ||
const char **format, | ||
int *characters_remaining, | ||
const char *compare_to, | ||
int n, | ||
const FormatRequirement format_requirement | ||
) { | ||
if (format_requirement == PARTIAL_MATCH && !*characters_remaining) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also a nit but I think we need to handle characters_remaining being negative. It could just simply return a Understood it is impossible in the current state of things. However, if this gets refactored in the future and a negative number makes its way in here uncaught I think it would return a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yup, thanks for this (and other) thoughtful comments! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I presume you meant "less than 0" - that's what I've gone for anyway |
||
return COMPLETED_PARTIAL_MATCH; | ||
} | ||
if (format_requirement == INFER_FORMAT) { | ||
return COMPARISON_SUCCESS; | ||
} | ||
if (*characters_remaining < n) { | ||
if (exact) { | ||
// TODO(pandas-dev): in the future we should set a PyErr here | ||
// to be very clear about what went wrong | ||
return -1; | ||
} else if (*characters_remaining) { | ||
// TODO(pandas-dev): same return value in this function as | ||
// above branch, but stub out a future where | ||
// we have a better error message | ||
return -1; | ||
} else { | ||
return 0; | ||
} | ||
// TODO(pandas-dev): PyErr to differentiate what went wrong | ||
return COMPARISON_ERROR; | ||
} else { | ||
if (strncmp(*format, compare_to, n)) { | ||
// TODO(pandas-dev): PyErr to differentiate what went wrong | ||
return -1; | ||
return COMPARISON_ERROR; | ||
} else { | ||
*format += n; | ||
*characters_remaining -= n; | ||
return 0; | ||
return COMPARISON_SUCCESS; | ||
} | ||
} | ||
return 0; | ||
return COMPARISON_SUCCESS; | ||
} | ||
|
||
int parse_iso_8601_datetime(const char *str, int len, int want_exc, | ||
npy_datetimestruct *out, | ||
NPY_DATETIMEUNIT *out_bestunit, | ||
int *out_local, int *out_tzoffset, | ||
const char* format, int format_len, int exact) { | ||
const char* format, int format_len, | ||
FormatRequirement format_requirement) { | ||
if (len < 0 || format_len < 0) | ||
goto parse_error; | ||
int year_leap = 0; | ||
int i, numdigits; | ||
const char *substr; | ||
int sublen; | ||
NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC; | ||
DatetimePartParseResult comparison; | ||
|
||
/* If year-month-day are separated by a valid separator, | ||
* months/days without leading zeroes will be parsed | ||
|
@@ -139,8 +149,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
while (sublen > 0 && isspace(*substr)) { | ||
++substr; | ||
--sublen; | ||
if (compare_format(&format, &format_len, " ", 1, exact)) { | ||
comparison = compare_format(&format, &format_len, " ", 1, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
} | ||
|
||
|
@@ -155,8 +168,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
/* PARSE THE YEAR (4 digits) */ | ||
if (compare_format(&format, &format_len, "%Y", 2, exact)) { | ||
comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
|
||
out->year = 0; | ||
|
@@ -202,8 +218,12 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
++substr; | ||
--sublen; | ||
|
||
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { | ||
comparison = compare_format(&format, &format_len, &ymd_sep, 1, | ||
format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
/* Cannot have trailing separator */ | ||
if (sublen == 0 || !isdigit(*substr)) { | ||
|
@@ -212,8 +232,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
/* PARSE THE MONTH */ | ||
if (compare_format(&format, &format_len, "%m", 2, exact)) { | ||
comparison = compare_format(&format, &format_len, "%m", 2, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
/* First digit required */ | ||
out->month = (*substr - '0'); | ||
|
@@ -258,14 +281,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
++substr; | ||
--sublen; | ||
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { | ||
comparison = compare_format(&format, &format_len, &ymd_sep, 1, | ||
format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
} | ||
|
||
/* PARSE THE DAY */ | ||
if (compare_format(&format, &format_len, "%d", 2, exact)) { | ||
comparison = compare_format(&format, &format_len, "%d", 2, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
/* First digit required */ | ||
if (!isdigit(*substr)) { | ||
|
@@ -306,15 +336,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if ((*substr != 'T' && *substr != ' ') || sublen == 1) { | ||
goto parse_error; | ||
} | ||
if (compare_format(&format, &format_len, substr, 1, exact)) { | ||
goto parse_error; | ||
} | ||
comparison = compare_format(&format, &format_len, substr, 1, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
++substr; | ||
--sublen; | ||
|
||
/* PARSE THE HOURS */ | ||
if (compare_format(&format, &format_len, "%H", 2, exact)) { | ||
comparison = compare_format(&format, &format_len, "%H", 2, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
/* First digit required */ | ||
if (!isdigit(*substr)) { | ||
|
@@ -359,8 +395,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if (sublen == 0 || !isdigit(*substr)) { | ||
goto parse_error; | ||
} | ||
if (compare_format(&format, &format_len, ":", 1, exact)) { | ||
comparison = compare_format(&format, &format_len, ":", 1, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
} else if (!isdigit(*substr)) { | ||
if (!hour_was_2_digits) { | ||
|
@@ -370,8 +409,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
/* PARSE THE MINUTES */ | ||
if (compare_format(&format, &format_len, "%M", 2, exact)) { | ||
comparison = compare_format(&format, &format_len, "%M", 2, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
/* First digit required */ | ||
out->min = (*substr - '0'); | ||
|
@@ -405,8 +447,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
/* If we make it through this condition block, then the next | ||
* character is a digit. */ | ||
if (has_hms_sep && *substr == ':') { | ||
if (compare_format(&format, &format_len, ":", 1, exact)) { | ||
comparison = compare_format(&format, &format_len, ":", 1, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
++substr; | ||
--sublen; | ||
|
@@ -420,8 +465,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
/* PARSE THE SECONDS */ | ||
if (compare_format(&format, &format_len, "%S", 2, exact)) { | ||
comparison = compare_format(&format, &format_len, "%S", 2, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
/* First digit required */ | ||
out->sec = (*substr - '0'); | ||
|
@@ -448,17 +496,23 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if (sublen > 0 && *substr == '.') { | ||
++substr; | ||
--sublen; | ||
if (compare_format(&format, &format_len, ".", 1, exact)) { | ||
comparison = compare_format(&format, &format_len, ".", 1, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
} else { | ||
bestunit = NPY_FR_s; | ||
goto parse_timezone; | ||
} | ||
|
||
/* PARSE THE MICROSECONDS (0 to 6 digits) */ | ||
if (compare_format(&format, &format_len, "%f", 2, exact)) { | ||
comparison = compare_format(&format, &format_len, "%f", 2, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
numdigits = 0; | ||
for (i = 0; i < 6; ++i) { | ||
|
@@ -524,8 +578,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
while (sublen > 0 && isspace(*substr)) { | ||
++substr; | ||
--sublen; | ||
if (compare_format(&format, &format_len, " ", 1, exact)) { | ||
comparison = compare_format(&format, &format_len, " ", 1, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
} | ||
|
||
|
@@ -539,8 +596,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
|
||
/* UTC specifier */ | ||
if (*substr == 'Z') { | ||
if (compare_format(&format, &format_len, "%z", 2, exact)) { | ||
comparison = compare_format(&format, &format_len, "%z", 2, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
/* "Z" should be equivalent to tz offset "+00:00" */ | ||
if (out_local != NULL) { | ||
|
@@ -561,8 +621,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
--sublen; | ||
} | ||
} else if (*substr == '-' || *substr == '+') { | ||
if (compare_format(&format, &format_len, "%z", 2, exact)) { | ||
comparison = compare_format(&format, &format_len, "%z", 2, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
/* Time zone offset */ | ||
int offset_neg = 0, offset_hour = 0, offset_minute = 0; | ||
|
@@ -647,8 +710,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
while (sublen > 0 && isspace(*substr)) { | ||
++substr; | ||
--sublen; | ||
if (compare_format(&format, &format_len, " ", 1, exact)) { | ||
comparison = compare_format(&format, &format_len, " ", 1, format_requirement); | ||
if (comparison == COMPARISON_ERROR) { | ||
goto parse_error; | ||
} else if (comparison == COMPLETED_PARTIAL_MATCH) { | ||
goto finish; | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is a good check to keep in place - otherwise these functions get unwieldy
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately the function is now 522 lines long, whereas the limit for this check is 500
Is it OK to turn it off now, or would you prefer a precursor PR to split up this function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm not a great solution here. I think OK for now but something we should take care of in a follow up.
Ideally you could change numpy upstream to split the function (maybe split into a date / time parsing functions?). That way we wouldn't diverge too far from them when we bring that downstream
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK I'll see if I can upstream something, thanks!