-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: inconsistent handling of exact=False case in to_datetime parsing #50435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
3de2331
947353d
e3fe55b
9e18d33
efeaf7a
6c51924
b96158f
0ebff5c
caa9c90
84eeb3d
f92ff7a
5c67ed3
bad704e
ec6591b
8d8f90e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -96,7 +96,7 @@ cdef int string_to_dts( | |
int* out_tzoffset, | ||
bint want_exc, | ||
format: str | None = *, | ||
bint exact = * | ||
Exact exact = * | ||
) except? -1 | ||
|
||
cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) | ||
|
@@ -120,3 +120,9 @@ cdef int64_t convert_reso( | |
NPY_DATETIMEUNIT to_reso, | ||
bint round_ok, | ||
) except? -1 | ||
|
||
cdef extern from "src/datetime/np_datetime_strings.h": | ||
cdef enum Exact: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes good call, I've gone with |
||
PARTIAL_MATCH | ||
EXACT_MATCH | ||
NO_MATCH | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right, I've renamed to |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,20 +71,13 @@ This file implements string parsing and creation for NumPy datetime. | |
// and decrement characters_remaining by n on success | ||
// On failure will return -1 without incrementing | ||
static int compare_format(const char **format, int *characters_remaining, | ||
const char *compare_to, int n, const int exact) { | ||
const char *compare_to, int n, const enum Exact exact) { | ||
if (exact == NO_MATCH) { | ||
return 0; | ||
} | ||
if (*characters_remaining < n) { | ||
if (exact) { | ||
// TODO(pandas-dev): in the future we should set a PyErr here | ||
// to be very clear about what went wrong | ||
return -1; | ||
} else if (*characters_remaining) { | ||
// TODO(pandas-dev): same return value in this function as | ||
// above branch, but stub out a future where | ||
// we have a better error message | ||
return -1; | ||
} else { | ||
return 0; | ||
} | ||
// TODO(pandas-dev): PyErr to differentiate what went wrong | ||
return -1; | ||
} else { | ||
if (strncmp(*format, compare_to, n)) { | ||
// TODO(pandas-dev): PyErr to differentiate what went wrong | ||
|
@@ -102,7 +95,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
npy_datetimestruct *out, | ||
NPY_DATETIMEUNIT *out_bestunit, | ||
int *out_local, int *out_tzoffset, | ||
const char* format, int format_len, int exact) { | ||
const char* format, int format_len, | ||
enum Exact exact) { | ||
if (len < 0 || format_len < 0) | ||
goto parse_error; | ||
int year_leap = 0; | ||
|
@@ -139,6 +133,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
while (sublen > 0 && isspace(*substr)) { | ||
++substr; | ||
--sublen; | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we not just make There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To clarify, I think you can return an enum from OK_EXACT
OK_PARTIAL etc... describing the different states, then branch in the caller appropriately |
||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, " ", 1, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -155,6 +152,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
/* PARSE THE YEAR (4 digits) */ | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, "%Y", 2, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -202,6 +202,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
++substr; | ||
--sublen; | ||
|
||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -212,6 +215,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
/* PARSE THE MONTH */ | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, "%m", 2, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -258,12 +264,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
++substr; | ||
--sublen; | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { | ||
goto parse_error; | ||
} | ||
} | ||
|
||
/* PARSE THE DAY */ | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, "%d", 2, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -306,13 +318,19 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if ((*substr != 'T' && *substr != ' ') || sublen == 1) { | ||
goto parse_error; | ||
} | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, substr, 1, exact)) { | ||
goto parse_error; | ||
} | ||
++substr; | ||
--sublen; | ||
|
||
/* PARSE THE HOURS */ | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, "%H", 2, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -359,6 +377,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if (sublen == 0 || !isdigit(*substr)) { | ||
goto parse_error; | ||
} | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, ":", 1, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -370,6 +391,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
/* PARSE THE MINUTES */ | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, "%M", 2, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -405,6 +429,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
/* If we make it through this condition block, then the next | ||
* character is a digit. */ | ||
if (has_hms_sep && *substr == ':') { | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, ":", 1, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -420,6 +447,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
/* PARSE THE SECONDS */ | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, "%S", 2, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -448,6 +478,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
if (sublen > 0 && *substr == '.') { | ||
++substr; | ||
--sublen; | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, ".", 1, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -457,6 +490,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
} | ||
|
||
/* PARSE THE MICROSECONDS (0 to 6 digits) */ | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, "%f", 2, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -524,6 +560,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
while (sublen > 0 && isspace(*substr)) { | ||
++substr; | ||
--sublen; | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, " ", 1, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -539,6 +578,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
|
||
/* UTC specifier */ | ||
if (*substr == 'Z') { | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, "%z", 2, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -561,6 +603,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
--sublen; | ||
} | ||
} else if (*substr == '-' || *substr == '+') { | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, "%z", 2, exact)) { | ||
goto parse_error; | ||
} | ||
|
@@ -647,6 +692,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
while (sublen > 0 && isspace(*substr)) { | ||
++substr; | ||
--sublen; | ||
if (exact == PARTIAL_MATCH && !format_len) { | ||
goto finish; | ||
} | ||
if (compare_format(&format, &format_len, " ", 1, exact)) { | ||
goto parse_error; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,22 @@ This file implements string parsing and creation for NumPy datetime. | |
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION | ||
#endif // NPY_NO_DEPRECATED_API | ||
|
||
/* 'exact' can be one of three values: | ||
* * PARTIAL_MATCH : Only require a partial match with 'format'. | ||
* For example, if the string is '2020-01-01 05:00:00' and | ||
* 'format' is '%Y-%m-%d', then parse '2020-01-01'; | ||
* * EXACT_MATCH : require an exact match with 'format'. If the | ||
* string is '2020-01-01', then the only format which will | ||
* be able to parse it without error is '%Y-%m-%d'; | ||
* * NO_MATCH: don't require any match - parse without comparing | ||
* with 'format'. | ||
*/ | ||
enum Exact { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI this file is vendored from numpy. The ship has sailed a bit in terms of editing directly, but when we move to Meson and abandon setuptools its worth considering a split to put all of our custom logic into a separate library and leaving the vendored code in place (or upstreaming changes if they make sense for numpy) |
||
PARTIAL_MATCH, | ||
EXACT_MATCH, | ||
NO_MATCH | ||
}; | ||
|
||
/* | ||
* Parses (almost) standard ISO 8601 date strings. The differences are: | ||
* | ||
|
@@ -61,7 +77,7 @@ parse_iso_8601_datetime(const char *str, int len, int want_exc, | |
int *out_tzoffset, | ||
const char* format, | ||
int format_len, | ||
int exact); | ||
enum Exact exact); | ||
|
||
/* | ||
* Provides a string length to use for converting datetime | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is a good check to keep in place - otherwise these functions get unwieldy
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately the function is now 522 lines long, whereas the limit for this check is 500
Is it OK to turn it off now, or would you prefer a precursor PR to split up this function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm not a great solution here. I think OK for now but something we should take care of in a follow up.
Ideally you could change numpy upstream to split the function (maybe split into a date / time parsing functions?). That way we wouldn't diverge too far from them when we bring that downstream
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK I'll see if I can upstream something, thanks!