Skip to content

BUG: pandas.to_datetime() does not respect exact format string with ISO8601 #49333

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Nov 17, 2022
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
45f82c3
initial format support
nikitaved Oct 26, 2022
e473adb
set exact=False default in objects_to_datetime
Oct 28, 2022
a6ea6d0
:label: typing
Oct 28, 2022
8fede1f
simplify
Oct 28, 2022
2e21e71
Merge remote-tracking branch 'upstream/main' into pr/nikitaved-qssumm…
Oct 28, 2022
afc4d96
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Oct 30, 2022
12721b0
replace macro with function
Oct 30, 2022
0531967
clean up
Oct 30, 2022
a571753
:memo: restore docstring
Oct 30, 2022
e814a2e
inline
Oct 30, 2022
19c34f8
set format default to None
Oct 31, 2022
70fb820
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Oct 31, 2022
eb50dfb
clean up
Oct 31, 2022
ac61ac5
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Nov 6, 2022
0dd7407
remove function, perform check inline
Nov 6, 2022
4d35ea7
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Nov 7, 2022
3acfdf6
only compare *format++ if format_len
Nov 7, 2022
f3060c9
clean up
Nov 7, 2022
7310e13
typing
Nov 7, 2022
b18ade7
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Nov 9, 2022
3ceb1ee
split out branches
Nov 9, 2022
bde5ef9
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Nov 13, 2022
080f018
use compare_format function
Nov 13, 2022
031e0e3
remove tmp variable
Nov 13, 2022
01e8bd1
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Nov 17, 2022
c1e6bc2
Add co-authors
Nov 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,7 @@ Conversion
- Bug in :meth:`Series.convert_dtypes` not converting dtype to nullable dtype when :class:`Series` contains ``NA`` and has dtype ``object`` (:issue:`48791`)
- Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`)
- Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`)
- Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)

Strings
^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/tslib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def array_to_datetime(
utc: bool = ...,
require_iso8601: bool = ...,
allow_mixed: bool = ...,
format: str | None = ...,
exact: bool = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...

# returned ndarray may be object dtype or datetime64[ns]
Expand Down
16 changes: 14 additions & 2 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,8 @@ cpdef array_to_datetime(
bint utc=False,
bint require_iso8601=False,
bint allow_mixed=False,
format: str | None=None,
bint exact=True,
):
"""
Converts a 1D array of date-like values to a numpy array of either:
Expand Down Expand Up @@ -566,6 +568,16 @@ cpdef array_to_datetime(
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)

elif is_integer_object(val) or is_float_object(val):
if require_iso8601:
if is_coerce:
iresult[i] = NPY_NAT
continue
elif is_raise:
raise ValueError(
f"time data \"{val}\" at position {i} doesn't "
f"match format \"{format}\""
)
return values, tz_out
# these must be ns unit by-definition
seen_integer = True

Expand Down Expand Up @@ -596,7 +608,7 @@ cpdef array_to_datetime(

string_to_dts_failed = string_to_dts(
val, &dts, &out_bestunit, &out_local,
&out_tzoffset, False
&out_tzoffset, False, format, exact
)
if string_to_dts_failed:
# An error at this point is a _parsing_ error
Expand All @@ -612,7 +624,7 @@ cpdef array_to_datetime(
elif is_raise:
raise ValueError(
f"time data \"{val}\" at position {i} doesn't "
"match format specified"
f"match format \"{format}\""
)
return values, tz_out

Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/tslibs/np_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ cdef int string_to_dts(
int* out_local,
int* out_tzoffset,
bint want_exc,
format: str | None = *,
bint exact = *
) except? -1

cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype)
Expand Down
16 changes: 14 additions & 2 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ cdef extern from "src/datetime/np_datetime_strings.h":
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
npy_datetimestruct *out,
NPY_DATETIMEUNIT *out_bestunit,
int *out_local, int *out_tzoffset)
int *out_local, int *out_tzoffset,
const char *format, int format_len, int exact)


# ----------------------------------------------------------------------
Expand Down Expand Up @@ -277,14 +278,25 @@ cdef inline int string_to_dts(
int* out_local,
int* out_tzoffset,
bint want_exc,
format: str | None=None,
bint exact=True,
) except? -1:
cdef:
Py_ssize_t length
const char* buf
Py_ssize_t format_length
const char* format_buf

buf = get_c_string_buf_and_size(val, &length)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Orthogonal to this but if you want to work more with the C codebase I think we should just replace get_c_string_buf_and_size with PyUnicode_AsUTF8AndSize directly; the former might have served a purpose with the Py2/3 transition but is just an unnecessary layer at this point

if format is None:
format_buf = b''
format_length = 0
exact = False
else:
format_buf = get_c_string_buf_and_size(format, &format_length)
return parse_iso_8601_datetime(buf, length, want_exc,
dts, out_bestunit, out_local, out_tzoffset)
dts, out_bestunit, out_local, out_tzoffset,
format_buf, format_length, exact)


cpdef ndarray astype_overflowsafe(
Expand Down
93 changes: 91 additions & 2 deletions pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ This file implements string parsing and creation for NumPy datetime.
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
npy_datetimestruct *out,
NPY_DATETIMEUNIT *out_bestunit,
int *out_local, int *out_tzoffset) {
int *out_local, int *out_tzoffset,
const char* format, int format_len, int exact) {
int year_leap = 0;
int i, numdigits;
const char *substr;
Expand Down Expand Up @@ -104,19 +105,30 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
if (format_len < 1 && exact) goto parse_error;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (format_len < 1 && exact) goto parse_error;
if (format_len < 1 && exact) {
goto parse_error;
}

I think clang-format will automatically enforce this style for you

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried clang-format and it formatted it without the braces, i.e.

        if (format_len < 1 && exact)
          goto parse_error;

so I've gone with that for now

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (format_len < 1 && exact) goto parse_error;
if (format_len < 1) {
if (exact) {
goto parse_error;
}
// Do something?
} else {
if (*format++ != ' ') {
goto parse_error;
}
format_len--;
}

I think this control flow would work more universally and avoid logic pitfalls / segfaults (see other comment)

if (format_len && *format++ != ' ') goto parse_error;
if (format_len) --format_len;
}

/* Leading '-' sign for negative year */
if (*substr == '-') {
++substr;
--sublen;
if (format_len < 1 && exact) goto parse_error;
if (format_len && *format++ != '-') goto parse_error;
if (format_len) --format_len;
}

if (sublen == 0) {
goto parse_error;
}

/* PARSE THE YEAR (4 digits) */
if (format_len < 2 && exact) goto parse_error;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (format_len < 2 && exact) goto parse_error;
if (format_len < 2) {
if (exact) {
goto parse_error;
}
// What happens here?
} else {
if (*format++ != '%' && *format++ != 'Y') {
goto parse_error;
}
format_len -= 2;
}

Current code has undefined behavior and a possible segfault if format_len is 1 while exact is False. Need to think through what action should be taken there

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In practice that shouldn't be possible - this function's only used internally, and

def format_is_iso(f: str) -> bint:
"""
Does format match the iso8601 set that can be handled by the C parser?
Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
but must be consistent. Leading 0s in dates and times are optional.
"""
iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}'.format
excluded_formats = ['%Y%m%d', '%Y%m', '%Y']
for date_sep in [' ', '/', '\\', '-', '.', '']:
for time_sep in [' ', 'T']:
for micro_or_tz in ['', '%z', '%Z', '.%f', '.%f%z', '.%f%Z']:
if (iso_template(date_sep=date_sep,
time_sep=time_sep,
micro_or_tz=micro_or_tz,
).startswith(f) and f not in excluded_formats):
return True
return False

would return false if the format were for example %Y-%. I've changed it to raise if format_len isn't 0 anyway, thanks

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea what you will find with C is that you want to be extremely explicit. Even if it's not possible today, if someone refactors the code and it does happen things become very difficult to debug.

Our error checking in some of our extensions is really loose, which is why our internal code is relatively difficult to refactor

if (format_len && *format++ != '%') goto parse_error;
if (format_len && *format++ != 'Y') goto parse_error;
if (format_len) format_len -= 2;

out->year = 0;
if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) &&
isdigit(substr[2]) && isdigit(substr[3])) {
Expand All @@ -139,6 +151,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (out_local != NULL) {
*out_local = 0;
}
if (format_len) {
goto parse_error;
}
bestunit = NPY_FR_Y;
goto finish;
}
Expand All @@ -156,13 +171,20 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
ymd_sep = valid_ymd_sep[i];
++substr;
--sublen;
if (format_len < 1 && exact) goto parse_error;
if (format_len && *format++ != ymd_sep) goto parse_error;
if (format_len) --format_len;
/* Cannot have trailing separator */
if (sublen == 0 || !isdigit(*substr)) {
goto parse_error;
}
}

/* PARSE THE MONTH */
if (format_len < 2 && exact) goto parse_error;
if (format_len && *format++ != '%') goto parse_error;
if (format_len && *format++ != 'm') goto parse_error;
if (format_len) format_len -= 2;
/* First digit required */
out->month = (*substr - '0');
++substr;
Expand Down Expand Up @@ -190,6 +212,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (!has_ymd_sep) {
goto parse_error;
}
if (format_len) {
goto parse_error;
}
if (out_local != NULL) {
*out_local = 0;
}
Expand All @@ -203,9 +228,16 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}
++substr;
--sublen;
if (format_len < 1 && exact) goto parse_error;
if (format_len && *format++ != ymd_sep) goto parse_error;
if (format_len) --format_len;
}

/* PARSE THE DAY */
if (format_len < 2 && exact) goto parse_error;
if (format_len && *format++ != '%') goto parse_error;
if (format_len && *format++ != 'd') goto parse_error;
if (format_len) format_len -= 2;
/* First digit required */
if (!isdigit(*substr)) {
goto parse_error;
Expand Down Expand Up @@ -235,17 +267,27 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (out_local != NULL) {
*out_local = 0;
}
if (format_len) {
goto parse_error;
}
bestunit = NPY_FR_D;
goto finish;
}

if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
goto parse_error;
}
if (format_len < 1 && exact) goto parse_error;
if (format_len && *format++ != *substr) goto parse_error;
if (format_len) --format_len;
++substr;
--sublen;

/* PARSE THE HOURS */
if (format_len < 2 && exact) goto parse_error;
if (format_len && *format++ != '%') goto parse_error;
if (format_len && *format++ != 'H') goto parse_error;
if (format_len) format_len -= 2;
/* First digit required */
if (!isdigit(*substr)) {
goto parse_error;
Expand Down Expand Up @@ -274,6 +316,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (!hour_was_2_digits) {
goto parse_error;
}
if (format_len) {
goto parse_error;
}
bestunit = NPY_FR_h;
goto finish;
}
Expand All @@ -286,6 +331,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (sublen == 0 || !isdigit(*substr)) {
goto parse_error;
}
if (format_len < 1 && exact) goto parse_error;
if (format_len && *format++ != ':') goto parse_error;
if (format_len) --format_len;
} else if (!isdigit(*substr)) {
if (!hour_was_2_digits) {
goto parse_error;
Expand All @@ -294,6 +342,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}

/* PARSE THE MINUTES */
if (format_len < 2 && exact) goto parse_error;
if (format_len && *format++ != '%') goto parse_error;
if (format_len && *format++ != 'M') goto parse_error;
if (format_len) format_len -= 2;
/* First digit required */
out->min = (*substr - '0');
++substr;
Expand All @@ -317,12 +369,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,

if (sublen == 0) {
bestunit = NPY_FR_m;
if (format_len) {
goto parse_error;
}
goto finish;
}

/* If we make it through this condition block, then the next
* character is a digit. */
if (has_hms_sep && *substr == ':') {
if (format_len < 1 && exact) goto parse_error;
if (format_len && *format++ != ':') goto parse_error;
if (format_len) --format_len;
++substr;
--sublen;
/* Cannot have a trailing ':' */
Expand All @@ -335,6 +393,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}

/* PARSE THE SECONDS */
if (format_len < 2 && exact) goto parse_error;
if (format_len && *format++ != '%') goto parse_error;
if (format_len && *format++ != 'S') goto parse_error;
if (format_len) format_len -= 2;
/* First digit required */
out->sec = (*substr - '0');
++substr;
Expand All @@ -360,12 +422,19 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (sublen > 0 && *substr == '.') {
++substr;
--sublen;
if (format_len < 1 && exact) goto parse_error;
if (format_len && *format++ != '.') goto parse_error;
if (format_len) --format_len;
} else {
bestunit = NPY_FR_s;
goto parse_timezone;
}

/* PARSE THE MICROSECONDS (0 to 6 digits) */
if (format_len < 2 && exact) goto parse_error;
if (format_len && *format++ != '%') goto parse_error;
if (format_len && *format++ != 'f') goto parse_error;
if (format_len) format_len -= 2;
numdigits = 0;
for (i = 0; i < 6; ++i) {
out->us *= 10;
Expand Down Expand Up @@ -430,15 +499,25 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
if (format_len < 1 && exact) goto parse_error;
if (format_len && *format++ != ' ') goto parse_error;
if (format_len) --format_len;
}

if (sublen == 0) {
// Unlike NumPy, treating no time zone as naive
if (format_len > 0) {
goto parse_error;
}
goto finish;
}

/* UTC specifier */
if (*substr == 'Z') {
if (format_len < 2 && exact) goto parse_error;
if (format_len && *format++ != '%') goto parse_error;
if (format_len && *format++ != 'Z') goto parse_error;
if (format_len) format_len -= 2;
/* "Z" should be equivalent to tz offset "+00:00" */
if (out_local != NULL) {
*out_local = 1;
Expand All @@ -449,12 +528,19 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}

if (sublen == 1) {
if (format_len > 0) {
goto parse_error;
}
goto finish;
} else {
++substr;
--sublen;
}
} else if (*substr == '-' || *substr == '+') {
if (format_len < 2 && exact) goto parse_error;
if (format_len && *format++ != '%') goto parse_error;
if (format_len && *format++ != 'z') goto parse_error;
if (format_len) format_len -= 2;
/* Time zone offset */
int offset_neg = 0, offset_hour = 0, offset_minute = 0;

Expand Down Expand Up @@ -538,9 +624,12 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
if (format_len < 1 && exact) goto parse_error;
if (format_len && *format++ != ' ') goto parse_error;
if (format_len) --format_len;
}

if (sublen != 0) {
if ((sublen != 0) || (format_len != 0)) {
goto parse_error;
}

Expand Down
5 changes: 4 additions & 1 deletion pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,10 @@ parse_iso_8601_datetime(const char *str, int len, int want_exc,
npy_datetimestruct *out,
NPY_DATETIMEUNIT *out_bestunit,
int *out_local,
int *out_tzoffset);
int *out_tzoffset,
const char* format,
int format_len,
int exact);

/*
* Provides a string length to use for converting datetime
Expand Down
Loading