Skip to content

BUG: inconsistent handling of exact=False case in to_datetime parsing #50435

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Dec 31, 2022
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ repos:
'--extensions=c,h',
'--headers=h',
--recursive,
'--filter=-readability/casting,-runtime/int,-build/include_subdir'
--linelength=88,
'--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a good check to keep in place - otherwise these functions get unwieldy

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately the function is now 522 lines long, whereas the limit for this check is 500

Is it OK to turn it off now, or would you prefer a precursor PR to split up this function?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm not a great solution here. I think OK for now but something we should take care of in a follow up.

Ideally you could change numpy upstream to split the function (maybe split into a date / time parsing functions?). That way we wouldn't diverge too far from them when we bring that downstream

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK I'll see if I can upstream something, thanks!

]
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
Expand Down
6 changes: 6 additions & 0 deletions pandas/_libs/tslibs/np_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,9 @@ cdef int64_t convert_reso(
NPY_DATETIMEUNIT to_reso,
bint round_ok,
) except? -1

cdef extern from "src/datetime/np_datetime_strings.h":
ctypedef enum FormatRequirement:
PARTIAL_MATCH
EXACT_MATCH
INFER_FORMAT
12 changes: 8 additions & 4 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ cdef extern from "src/datetime/np_datetime_strings.h":
npy_datetimestruct *out,
NPY_DATETIMEUNIT *out_bestunit,
int *out_local, int *out_tzoffset,
const char *format, int format_len, int exact)
const char *format, int format_len,
FormatRequirement exact)


# ----------------------------------------------------------------------
Expand Down Expand Up @@ -279,24 +280,27 @@ cdef int string_to_dts(
int* out_tzoffset,
bint want_exc,
format: str | None=None,
bint exact=True,
bint exact=EXACT_MATCH,
) except? -1:
cdef:
Py_ssize_t length
const char* buf
Py_ssize_t format_length
const char* format_buf
FormatRequirement format_requirement

buf = get_c_string_buf_and_size(val, &length)
if format is None:
format_buf = b""
format_length = 0
exact = False
format_requirement = INFER_FORMAT
else:
format_buf = get_c_string_buf_and_size(format, &format_length)
format_requirement = <FormatRequirement>exact
return parse_iso_8601_datetime(buf, length, want_exc,
dts, out_bestunit, out_local, out_tzoffset,
format_buf, format_length, exact)
format_buf, format_length,
format_requirement)


cpdef ndarray astype_overflowsafe(
Expand Down
144 changes: 105 additions & 39 deletions pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,49 +67,59 @@ This file implements string parsing and creation for NumPy datetime.
* Returns 0 on success, -1 on failure.
*/

typedef enum {
COMPARISON_SUCCESS,
COMPLETED_PARTIAL_MATCH,
COMPARISON_ERROR
} DatetimePartParseResult;
// This function will advance the pointer on format
// and decrement characters_remaining by n on success
// On failure will return -1 without incrementing
static int compare_format(const char **format, int *characters_remaining,
const char *compare_to, int n, const int exact) {
// On failure will return COMPARISON_ERROR without incrementing
// If `format_requirement` is PARTIAL_MATCH, and the `format` string has
// been exhausted, then return COMPLETED_PARTIAL_MATCH.
static DatetimePartParseResult compare_format(
const char **format,
int *characters_remaining,
const char *compare_to,
int n,
const FormatRequirement format_requirement
) {
if (format_requirement == PARTIAL_MATCH && !*characters_remaining) {
Copy link
Member

@WillAyd WillAyd Dec 29, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also a nit but I think we need to handle characters_remaining being negative. It could just simply return a COMPARISON_ERROR right?

Understood it is impossible in the current state of things. However, if this gets refactored in the future and a negative number makes its way in here uncaught I think it would return a COMPARISON_SUCCCESS and be very difficult to troubleshoot without intimate knowledge of this function

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yup, thanks for this (and other) thoughtful comments!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also a nit but I think we need to handle characters_remaining being 0

I presume you meant "less than 0" - that's what I've gone for anyway

return COMPLETED_PARTIAL_MATCH;
}
if (format_requirement == INFER_FORMAT) {
return COMPARISON_SUCCESS;
}
if (*characters_remaining < n) {
if (exact) {
// TODO(pandas-dev): in the future we should set a PyErr here
// to be very clear about what went wrong
return -1;
} else if (*characters_remaining) {
// TODO(pandas-dev): same return value in this function as
// above branch, but stub out a future where
// we have a better error message
return -1;
} else {
return 0;
}
// TODO(pandas-dev): PyErr to differentiate what went wrong
return COMPARISON_ERROR;
} else {
if (strncmp(*format, compare_to, n)) {
// TODO(pandas-dev): PyErr to differentiate what went wrong
return -1;
return COMPARISON_ERROR;
} else {
*format += n;
*characters_remaining -= n;
return 0;
return COMPARISON_SUCCESS;
}
}
return 0;
return COMPARISON_SUCCESS;
}

int parse_iso_8601_datetime(const char *str, int len, int want_exc,
npy_datetimestruct *out,
NPY_DATETIMEUNIT *out_bestunit,
int *out_local, int *out_tzoffset,
const char* format, int format_len, int exact) {
const char* format, int format_len,
FormatRequirement format_requirement) {
if (len < 0 || format_len < 0)
goto parse_error;
int year_leap = 0;
int i, numdigits;
const char *substr;
int sublen;
NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC;
DatetimePartParseResult comparison;

/* If year-month-day are separated by a valid separator,
* months/days without leading zeroes will be parsed
Expand Down Expand Up @@ -139,8 +149,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
if (compare_format(&format, &format_len, " ", 1, exact)) {
comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
}

Expand All @@ -155,8 +168,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}

/* PARSE THE YEAR (4 digits) */
if (compare_format(&format, &format_len, "%Y", 2, exact)) {
comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}

out->year = 0;
Expand Down Expand Up @@ -202,8 +218,12 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
++substr;
--sublen;

if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) {
comparison = compare_format(&format, &format_len, &ymd_sep, 1,
format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
/* Cannot have trailing separator */
if (sublen == 0 || !isdigit(*substr)) {
Expand All @@ -212,8 +232,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}

/* PARSE THE MONTH */
if (compare_format(&format, &format_len, "%m", 2, exact)) {
comparison = compare_format(&format, &format_len, "%m", 2, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
/* First digit required */
out->month = (*substr - '0');
Expand Down Expand Up @@ -258,14 +281,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}
++substr;
--sublen;
if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) {
comparison = compare_format(&format, &format_len, &ymd_sep, 1,
format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
}

/* PARSE THE DAY */
if (compare_format(&format, &format_len, "%d", 2, exact)) {
comparison = compare_format(&format, &format_len, "%d", 2, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
/* First digit required */
if (!isdigit(*substr)) {
Expand Down Expand Up @@ -306,15 +336,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
goto parse_error;
}
if (compare_format(&format, &format_len, substr, 1, exact)) {
goto parse_error;
}
comparison = compare_format(&format, &format_len, substr, 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
++substr;
--sublen;

/* PARSE THE HOURS */
if (compare_format(&format, &format_len, "%H", 2, exact)) {
comparison = compare_format(&format, &format_len, "%H", 2, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
/* First digit required */
if (!isdigit(*substr)) {
Expand Down Expand Up @@ -359,8 +395,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (sublen == 0 || !isdigit(*substr)) {
goto parse_error;
}
if (compare_format(&format, &format_len, ":", 1, exact)) {
comparison = compare_format(&format, &format_len, ":", 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
} else if (!isdigit(*substr)) {
if (!hour_was_2_digits) {
Expand All @@ -370,8 +409,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}

/* PARSE THE MINUTES */
if (compare_format(&format, &format_len, "%M", 2, exact)) {
comparison = compare_format(&format, &format_len, "%M", 2, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
/* First digit required */
out->min = (*substr - '0');
Expand Down Expand Up @@ -405,8 +447,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
/* If we make it through this condition block, then the next
* character is a digit. */
if (has_hms_sep && *substr == ':') {
if (compare_format(&format, &format_len, ":", 1, exact)) {
comparison = compare_format(&format, &format_len, ":", 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
++substr;
--sublen;
Expand All @@ -420,8 +465,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}

/* PARSE THE SECONDS */
if (compare_format(&format, &format_len, "%S", 2, exact)) {
comparison = compare_format(&format, &format_len, "%S", 2, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
/* First digit required */
out->sec = (*substr - '0');
Expand All @@ -448,17 +496,23 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (sublen > 0 && *substr == '.') {
++substr;
--sublen;
if (compare_format(&format, &format_len, ".", 1, exact)) {
comparison = compare_format(&format, &format_len, ".", 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
} else {
bestunit = NPY_FR_s;
goto parse_timezone;
}

/* PARSE THE MICROSECONDS (0 to 6 digits) */
if (compare_format(&format, &format_len, "%f", 2, exact)) {
comparison = compare_format(&format, &format_len, "%f", 2, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
numdigits = 0;
for (i = 0; i < 6; ++i) {
Expand Down Expand Up @@ -524,8 +578,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
if (compare_format(&format, &format_len, " ", 1, exact)) {
comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
}

Expand All @@ -539,8 +596,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,

/* UTC specifier */
if (*substr == 'Z') {
if (compare_format(&format, &format_len, "%z", 2, exact)) {
comparison = compare_format(&format, &format_len, "%z", 2, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
/* "Z" should be equivalent to tz offset "+00:00" */
if (out_local != NULL) {
Expand All @@ -561,8 +621,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
--sublen;
}
} else if (*substr == '-' || *substr == '+') {
if (compare_format(&format, &format_len, "%z", 2, exact)) {
comparison = compare_format(&format, &format_len, "%z", 2, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
/* Time zone offset */
int offset_neg = 0, offset_hour = 0, offset_minute = 0;
Expand Down Expand Up @@ -647,8 +710,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
if (compare_format(&format, &format_len, " ", 1, exact)) {
comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
if (comparison == COMPARISON_ERROR) {
goto parse_error;
} else if (comparison == COMPLETED_PARTIAL_MATCH) {
goto finish;
}
}

Expand Down
Loading