-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: inconsistent handling of exact=False case in to_datetime parsing #50435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
3de2331
947353d
e3fe55b
9e18d33
efeaf7a
6c51924
b96158f
0ebff5c
caa9c90
84eeb3d
f92ff7a
5c67ed3
bad704e
ec6591b
8d8f90e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -96,7 +96,7 @@ cdef int string_to_dts( | |
int* out_tzoffset, | ||
bint want_exc, | ||
format: str | None = *, | ||
bint exact = * | ||
Exact exact = * | ||
) except? -1 | ||
|
||
cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) | ||
|
@@ -120,3 +120,9 @@ cdef int64_t convert_reso( | |
NPY_DATETIMEUNIT to_reso, | ||
bint round_ok, | ||
) except? -1 | ||
|
||
cdef extern from "src/datetime/np_datetime_strings.h": | ||
cdef enum Exact: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes good call, I've gone with |
||
PARTIAL_MATCH | ||
EXACT_MATCH | ||
NO_MATCH | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right, I've renamed to |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -67,49 +67,59 @@ This file implements string parsing and creation for NumPy datetime. | |||||
* Returns 0 on success, -1 on failure. | ||||||
*/ | ||||||
|
||||||
enum Comparison { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Design wise this assumes that the callee knows what the caller is doing and can instruct it on actions to take. I think it would be better to separate those entities and just have the callee report back what it knows. With that in mind, maybe call rename this to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As in, to name the values the same way as those from The issue is that different format requirements can result in the same result from this function - for example, both I've renamed the values to
, is that clearer? |
||||||
ADVANCE, | ||||||
RETURN, | ||||||
ERROR | ||||||
}; | ||||||
// This function will advance the pointer on format | ||||||
// and decrement characters_remaining by n on success | ||||||
// On failure will return -1 without incrementing | ||||||
static int compare_format(const char **format, int *characters_remaining, | ||||||
const char *compare_to, int n, const int exact) { | ||||||
// On failure will return ERROR without incrementing | ||||||
// If `exact` is PARTIAL_MATCH, and the `format` string has | ||||||
// been exhausted, then signal to the caller to finish parsing. | ||||||
static enum Comparison compare_format( | ||||||
const char **format, | ||||||
int *characters_remaining, | ||||||
const char *compare_to, | ||||||
int n, | ||||||
const enum Exact exact | ||||||
) { | ||||||
if (exact == PARTIAL_MATCH && !*characters_remaining) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Nit but would be good to explicitly compare to 0. Depending on code structure we may also want to be careful what happens if |
||||||
return RETURN; | ||||||
} | ||||||
if (exact == NO_MATCH) { | ||||||
return ADVANCE; | ||||||
} | ||||||
if (*characters_remaining < n) { | ||||||
if (exact) { | ||||||
// TODO(pandas-dev): in the future we should set a PyErr here | ||||||
// to be very clear about what went wrong | ||||||
return -1; | ||||||
} else if (*characters_remaining) { | ||||||
// TODO(pandas-dev): same return value in this function as | ||||||
// above branch, but stub out a future where | ||||||
// we have a better error message | ||||||
return -1; | ||||||
} else { | ||||||
return 0; | ||||||
} | ||||||
// TODO(pandas-dev): PyErr to differentiate what went wrong | ||||||
return ERROR; | ||||||
} else { | ||||||
if (strncmp(*format, compare_to, n)) { | ||||||
// TODO(pandas-dev): PyErr to differentiate what went wrong | ||||||
return -1; | ||||||
return ERROR; | ||||||
} else { | ||||||
*format += n; | ||||||
*characters_remaining -= n; | ||||||
return 0; | ||||||
return ADVANCE; | ||||||
} | ||||||
} | ||||||
return 0; | ||||||
return ADVANCE; | ||||||
} | ||||||
|
||||||
int parse_iso_8601_datetime(const char *str, int len, int want_exc, | ||||||
npy_datetimestruct *out, | ||||||
NPY_DATETIMEUNIT *out_bestunit, | ||||||
int *out_local, int *out_tzoffset, | ||||||
const char* format, int format_len, int exact) { | ||||||
const char* format, int format_len, | ||||||
enum Exact exact) { | ||||||
if (len < 0 || format_len < 0) | ||||||
goto parse_error; | ||||||
int year_leap = 0; | ||||||
int i, numdigits; | ||||||
const char *substr; | ||||||
int sublen; | ||||||
NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC; | ||||||
enum Comparison comparison; | ||||||
|
||||||
/* If year-month-day are separated by a valid separator, | ||||||
* months/days without leading zeroes will be parsed | ||||||
|
@@ -139,8 +149,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
while (sublen > 0 && isspace(*substr)) { | ||||||
++substr; | ||||||
--sublen; | ||||||
if (compare_format(&format, &format_len, " ", 1, exact)) { | ||||||
comparison = compare_format(&format, &format_len, " ", 1, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
} | ||||||
|
||||||
|
@@ -155,8 +168,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
} | ||||||
|
||||||
/* PARSE THE YEAR (4 digits) */ | ||||||
if (compare_format(&format, &format_len, "%Y", 2, exact)) { | ||||||
if (exact == PARTIAL_MATCH && !format_len) { | ||||||
goto finish; | ||||||
} | ||||||
comparison = compare_format(&format, &format_len, "%Y", 2, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
|
||||||
out->year = 0; | ||||||
|
@@ -202,8 +221,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
++substr; | ||||||
--sublen; | ||||||
|
||||||
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { | ||||||
comparison = compare_format(&format, &format_len, &ymd_sep, 1, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
/* Cannot have trailing separator */ | ||||||
if (sublen == 0 || !isdigit(*substr)) { | ||||||
|
@@ -212,8 +234,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
} | ||||||
|
||||||
/* PARSE THE MONTH */ | ||||||
if (compare_format(&format, &format_len, "%m", 2, exact)) { | ||||||
comparison = compare_format(&format, &format_len, "%m", 2, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
/* First digit required */ | ||||||
out->month = (*substr - '0'); | ||||||
|
@@ -258,14 +283,20 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
} | ||||||
++substr; | ||||||
--sublen; | ||||||
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { | ||||||
comparison = compare_format(&format, &format_len, &ymd_sep, 1, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
} | ||||||
|
||||||
/* PARSE THE DAY */ | ||||||
if (compare_format(&format, &format_len, "%d", 2, exact)) { | ||||||
comparison = compare_format(&format, &format_len, "%d", 2, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
/* First digit required */ | ||||||
if (!isdigit(*substr)) { | ||||||
|
@@ -306,15 +337,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
if ((*substr != 'T' && *substr != ' ') || sublen == 1) { | ||||||
goto parse_error; | ||||||
} | ||||||
if (compare_format(&format, &format_len, substr, 1, exact)) { | ||||||
goto parse_error; | ||||||
} | ||||||
comparison = compare_format(&format, &format_len, substr, 1, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
++substr; | ||||||
--sublen; | ||||||
|
||||||
/* PARSE THE HOURS */ | ||||||
if (compare_format(&format, &format_len, "%H", 2, exact)) { | ||||||
comparison = compare_format(&format, &format_len, "%H", 2, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
/* First digit required */ | ||||||
if (!isdigit(*substr)) { | ||||||
|
@@ -359,8 +396,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
if (sublen == 0 || !isdigit(*substr)) { | ||||||
goto parse_error; | ||||||
} | ||||||
if (compare_format(&format, &format_len, ":", 1, exact)) { | ||||||
comparison = compare_format(&format, &format_len, ":", 1, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
} else if (!isdigit(*substr)) { | ||||||
if (!hour_was_2_digits) { | ||||||
|
@@ -370,8 +410,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
} | ||||||
|
||||||
/* PARSE THE MINUTES */ | ||||||
if (compare_format(&format, &format_len, "%M", 2, exact)) { | ||||||
comparison = compare_format(&format, &format_len, "%M", 2, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
/* First digit required */ | ||||||
out->min = (*substr - '0'); | ||||||
|
@@ -405,8 +448,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
/* If we make it through this condition block, then the next | ||||||
* character is a digit. */ | ||||||
if (has_hms_sep && *substr == ':') { | ||||||
if (compare_format(&format, &format_len, ":", 1, exact)) { | ||||||
comparison = compare_format(&format, &format_len, ":", 1, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
++substr; | ||||||
--sublen; | ||||||
|
@@ -420,8 +466,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
} | ||||||
|
||||||
/* PARSE THE SECONDS */ | ||||||
if (compare_format(&format, &format_len, "%S", 2, exact)) { | ||||||
comparison = compare_format(&format, &format_len, "%S", 2, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
/* First digit required */ | ||||||
out->sec = (*substr - '0'); | ||||||
|
@@ -448,17 +497,23 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
if (sublen > 0 && *substr == '.') { | ||||||
++substr; | ||||||
--sublen; | ||||||
if (compare_format(&format, &format_len, ".", 1, exact)) { | ||||||
comparison = compare_format(&format, &format_len, ".", 1, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
} else { | ||||||
bestunit = NPY_FR_s; | ||||||
goto parse_timezone; | ||||||
} | ||||||
|
||||||
/* PARSE THE MICROSECONDS (0 to 6 digits) */ | ||||||
if (compare_format(&format, &format_len, "%f", 2, exact)) { | ||||||
comparison = compare_format(&format, &format_len, "%f", 2, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
numdigits = 0; | ||||||
for (i = 0; i < 6; ++i) { | ||||||
|
@@ -524,8 +579,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
while (sublen > 0 && isspace(*substr)) { | ||||||
++substr; | ||||||
--sublen; | ||||||
if (compare_format(&format, &format_len, " ", 1, exact)) { | ||||||
comparison = compare_format(&format, &format_len, " ", 1, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
} | ||||||
|
||||||
|
@@ -539,8 +597,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
|
||||||
/* UTC specifier */ | ||||||
if (*substr == 'Z') { | ||||||
if (compare_format(&format, &format_len, "%z", 2, exact)) { | ||||||
comparison = compare_format(&format, &format_len, "%z", 2, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
/* "Z" should be equivalent to tz offset "+00:00" */ | ||||||
if (out_local != NULL) { | ||||||
|
@@ -561,8 +622,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
--sublen; | ||||||
} | ||||||
} else if (*substr == '-' || *substr == '+') { | ||||||
if (compare_format(&format, &format_len, "%z", 2, exact)) { | ||||||
comparison = compare_format(&format, &format_len, "%z", 2, exact); | ||||||
if (comparison == ERROR) { | ||||||
goto parse_error; | ||||||
} else if (comparison == RETURN) { | ||||||
goto finish; | ||||||
} | ||||||
/* Time zone offset */ | ||||||
int offset_neg = 0, offset_hour = 0, offset_minute = 0; | ||||||
|
@@ -647,6 +711,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |||||
while (sublen > 0 && isspace(*substr)) { | ||||||
++substr; | ||||||
--sublen; | ||||||
if (exact == PARTIAL_MATCH && !format_len) { | ||||||
goto finish; | ||||||
} | ||||||
if (compare_format(&format, &format_len, " ", 1, exact)) { | ||||||
goto parse_error; | ||||||
} | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is a good check to keep in place - otherwise these functions get unwieldy
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately the function is now 522 lines long, whereas the limit for this check is 500
Is it OK to turn it off now, or would you prefer a precursor PR to split up this function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm not a great solution here. I think OK for now but something we should take care of in a follow up.
Ideally you could change numpy upstream to split the function (maybe split into a date / time parsing functions?). That way we wouldn't diverge too far from them when we bring that downstream
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK I'll see if I can upstream something, thanks!