Skip to content

BUG: pandas.to_datetime() does not respect exact format string with ISO8601 #49333

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Nov 17, 2022
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
45f82c3
initial format support
nikitaved Oct 26, 2022
e473adb
set exact=False default in objects_to_datetime
Oct 28, 2022
a6ea6d0
:label: typing
Oct 28, 2022
8fede1f
simplify
Oct 28, 2022
2e21e71
Merge remote-tracking branch 'upstream/main' into pr/nikitaved-qssumm…
Oct 28, 2022
afc4d96
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Oct 30, 2022
12721b0
replace macro with function
Oct 30, 2022
0531967
clean up
Oct 30, 2022
a571753
:memo: restore docstring
Oct 30, 2022
e814a2e
inline
Oct 30, 2022
19c34f8
set format default to None
Oct 31, 2022
70fb820
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Oct 31, 2022
eb50dfb
clean up
Oct 31, 2022
ac61ac5
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Nov 6, 2022
0dd7407
remove function, perform check inline
Nov 6, 2022
4d35ea7
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Nov 7, 2022
3acfdf6
only compare *format++ if format_len
Nov 7, 2022
f3060c9
clean up
Nov 7, 2022
7310e13
typing
Nov 7, 2022
b18ade7
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Nov 9, 2022
3ceb1ee
split out branches
Nov 9, 2022
bde5ef9
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Nov 13, 2022
080f018
use compare_format function
Nov 13, 2022
031e0e3
remove tmp variable
Nov 13, 2022
01e8bd1
Merge remote-tracking branch 'upstream/main' into pr/nikitaved/qssumm…
Nov 17, 2022
c1e6bc2
Add co-authors
Nov 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,14 @@ repos:
# this particular codebase (e.g. src/headers, src/klib). However,
# we can lint all header files since they aren't "generated" like C files are.
exclude: ^pandas/_libs/src/(klib|headers)/
args: [--quiet, '--extensions=c,h', '--headers=h', --recursive, '--filter=-readability/casting,-runtime/int,-build/include_subdir']
args: [
--quiet,
'--extensions=c,h',
'--headers=h',
--recursive,
'--filter=-readability/casting,-runtime/int,-build/include_subdir',
'--linelength=88'
]
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,7 @@ Conversion
- Bug in :meth:`Series.convert_dtypes` not converting dtype to nullable dtype when :class:`Series` contains ``NA`` and has dtype ``object`` (:issue:`48791`)
- Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`)
- Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`)
- Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)

Strings
^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/tslib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def array_to_datetime(
utc: bool = ...,
require_iso8601: bool = ...,
allow_mixed: bool = ...,
format: str = ...,
exact: bool = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...

# returned ndarray may be object dtype or datetime64[ns]
15 changes: 13 additions & 2 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,8 @@ cpdef array_to_datetime(
bint utc=False,
bint require_iso8601=False,
bint allow_mixed=False,
format: str | None=None,
bint exact=True,
):
"""
Converts a 1D array of date-like values to a numpy array of either:
Expand Down Expand Up @@ -564,6 +566,15 @@ cpdef array_to_datetime(
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)

elif is_integer_object(val) or is_float_object(val):
if require_iso8601:
if is_coerce:
iresult[i] = NPY_NAT
continue
elif is_raise:
raise ValueError(
f"time data \"{val}\" at position {i} doesn't match format \"{format}\""
)
return values, tz_out
# these must be ns unit by-definition
seen_integer = True

Expand Down Expand Up @@ -594,7 +605,7 @@ cpdef array_to_datetime(

string_to_dts_failed = string_to_dts(
val, &dts, &out_bestunit, &out_local,
&out_tzoffset, False
&out_tzoffset, False, format, exact
)
if string_to_dts_failed:
# An error at this point is a _parsing_ error
Expand All @@ -609,7 +620,7 @@ cpdef array_to_datetime(
continue
elif is_raise:
raise ValueError(
f"time data \"{val}\" at position {i} doesn't match format specified"
f"time data \"{val}\" at position {i} doesn't match format \"{format}\""
)
return values, tz_out

Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/tslibs/np_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ cdef int string_to_dts(
int* out_local,
int* out_tzoffset,
bint want_exc,
format: str | None = *,
bint exact = *
) except? -1

cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype)
Expand Down
16 changes: 14 additions & 2 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ cdef extern from "src/datetime/np_datetime_strings.h":
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
npy_datetimestruct *out,
NPY_DATETIMEUNIT *out_bestunit,
int *out_local, int *out_tzoffset)
int *out_local, int *out_tzoffset,
const char *format, int format_len, int exact)


# ----------------------------------------------------------------------
Expand Down Expand Up @@ -273,14 +274,25 @@ cdef inline int string_to_dts(
int* out_local,
int* out_tzoffset,
bint want_exc,
format: str | None = None,
bint exact = True,
) except? -1:
cdef:
Py_ssize_t length
const char* buf
Py_ssize_t format_length
const char* format_buf

buf = get_c_string_buf_and_size(val, &length)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Orthogonal to this but if you want to work more with the C codebase I think we should just replace get_c_string_buf_and_size with PyUnicode_AsUTF8AndSize directly; the former might have served a purpose with the Py2/3 transition but is just an unnecessary layer at this point

if format is None:
format_buf = b''
format_length = 0
exact = False
else:
format_buf = get_c_string_buf_and_size(format, &format_length)
return parse_iso_8601_datetime(buf, length, want_exc,
dts, out_bestunit, out_local, out_tzoffset)
dts, out_bestunit, out_local, out_tzoffset,
format_buf, format_length, exact)


cpdef ndarray astype_overflowsafe(
Expand Down
97 changes: 95 additions & 2 deletions pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,25 @@ This file implements string parsing and creation for NumPy datetime.
*
* Returns 0 on success, -1 on failure.
*/

inline int format_startswith(char ch, int format_len, char format, int exact) {
Copy link
Member

@WillAyd WillAyd Oct 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still not sure about this function. What is the advantage of this over using strncmp? Seems like it would be cleaner to just use that plus some combination of exact outside of this

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK I think I understand a bit more now what you are trying to do. I think it would be best if you just kept a local variable for characters consumed, which you can increment every time you increment the format pointer (you are kind of doing this anyway). You don't really need a function but can rather just do if (format++ != '%') { // handle error }

The order of operations will do the postfix increment after assignment, so this consolidates what you are trying to do

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just checking I've understood, is the suggestion to get rid of the function and each time do something like

if (((format_len > 0) && (format++ != '%')) || (exact && (format_len<=0))){
    if (format_len > 0) format_len--;
    goto parse_error;
}

?

I presume I've misunderstood because to me this looks more complicated and loses the docstring 😄

Copy link
Contributor Author

@nikitaved nikitaved Nov 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is possible to even do everything in one single if statement if ((exact && !format_len) || (format_len && format_len-- && *format++ != ch)) goto ... , but it then starts to look as one of these tricky C language questions... This explicit function has comments, it is more clear imho, and is very likely to get inlined.

Copy link
Member

@WillAyd WillAyd Nov 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its not a matter of inlining as much as code organization. For instance, this block:

    if (!format_startswith('%', format_len, *format, exact)) goto parse_error;
    if (format_len) {++format; --format_len;}
    if (!format_startswith('Y', format_len, *format, exact)) goto parse_error;
    if (format_len) {++format; --format_len;}

Would seem more naturally expressed as:

if (format_len < 2 && exact) {
  goto parse_error;
} 

if (*format++ != '%') {
  goto parse_error;
} 

if (*format++ != 'Y') {
  goto parse_error;
}

There's some code golf you can do therein to shorten it but that's not really what I'm after. Moreso we should refactor this to avoid trying to cram a lot into a function with side effects, as we don't use that pattern much if at all in our code base, and a lot of the character by character checks being done are overkill

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The other advantage of this is you could provide an actual error message as to what went wrong at what position of the format, though not critical to scope for this PR

Copy link
Contributor Author

@nikitaved nikitaved Nov 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. Can't we just refactor everything and replace it with something like

  1. match given format with the full format and return position of mismatch/success that depends on lengths and exactness.
  2. Parse the string with sprintf or something like this, with additional validity checks. Provided that 1 is a success

?

Unless doing everything in one go is a performance decision...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you see a better way to refactor I am definitely open to it. I don't think the performance here will be that critical.
Probably better served as a pre cursor before adding functionality here, or alternately as a follow up to the "side-effect-less" design mentioned already

/* Check if the current character in `format` is `ch`.

Always error on character mismatch conditioned on non-exhausted format,
or when format is exhausted in the exact case.
Note that if `format` hasn't been exhausted, it should be advanced
outside of this function. */
if ((format_len && format != ch) || (exact && !format_len)) {
return 0;
}
return 1;
}

int parse_iso_8601_datetime(const char *str, int len, int want_exc,
npy_datetimestruct *out,
NPY_DATETIMEUNIT *out_bestunit,
int *out_local, int *out_tzoffset) {
int *out_local, int *out_tzoffset,
const char* format, int format_len, int exact) {
int year_leap = 0;
int i, numdigits;
const char *substr;
Expand Down Expand Up @@ -104,19 +119,28 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
if (!format_startswith(' ', format_len, *format, exact)) goto parse_error;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor nit but can you run clang-format against this file? I think it would format this differently

if (format_len) {++format; --format_len;}
}

/* Leading '-' sign for negative year */
if (*substr == '-') {
++substr;
--sublen;
if (!format_startswith('-', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
}

if (sublen == 0) {
goto parse_error;
}

/* PARSE THE YEAR (4 digits) */
if (!format_startswith('%', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
if (!format_startswith('Y', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}

out->year = 0;
if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) &&
isdigit(substr[2]) && isdigit(substr[3])) {
Expand All @@ -139,6 +163,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (out_local != NULL) {
*out_local = 0;
}
if (format_len) {
goto parse_error;
}
bestunit = NPY_FR_Y;
goto finish;
}
Expand All @@ -156,13 +183,19 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
ymd_sep = valid_ymd_sep[i];
++substr;
--sublen;
if (!format_startswith(ymd_sep, format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
/* Cannot have trailing separator */
if (sublen == 0 || !isdigit(*substr)) {
goto parse_error;
}
}

/* PARSE THE MONTH */
if (!format_startswith('%', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
if (!format_startswith('m', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
/* First digit required */
out->month = (*substr - '0');
++substr;
Expand Down Expand Up @@ -190,6 +223,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (!has_ymd_sep) {
goto parse_error;
}
if (format_len) {
goto parse_error;
}
if (out_local != NULL) {
*out_local = 0;
}
Expand All @@ -203,9 +239,15 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}
++substr;
--sublen;
if (!format_startswith(ymd_sep, format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
}

/* PARSE THE DAY */
if (!format_startswith('%', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
if (!format_startswith('d', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
/* First digit required */
if (!isdigit(*substr)) {
goto parse_error;
Expand Down Expand Up @@ -235,17 +277,26 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (out_local != NULL) {
*out_local = 0;
}
if (format_len) {
goto parse_error;
}
bestunit = NPY_FR_D;
goto finish;
}

if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
goto parse_error;
}
if (!format_startswith(*substr, format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
++substr;
--sublen;

/* PARSE THE HOURS */
if (!format_startswith('%', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
if (!format_startswith('H', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
/* First digit required */
if (!isdigit(*substr)) {
goto parse_error;
Expand Down Expand Up @@ -274,6 +325,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (!hour_was_2_digits) {
goto parse_error;
}
if (format_len) {
goto parse_error;
}
bestunit = NPY_FR_h;
goto finish;
}
Expand All @@ -286,6 +340,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (sublen == 0 || !isdigit(*substr)) {
goto parse_error;
}
if (!format_startswith(':', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
} else if (!isdigit(*substr)) {
if (!hour_was_2_digits) {
goto parse_error;
Expand All @@ -294,6 +350,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}

/* PARSE THE MINUTES */
if (!format_startswith('%', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
if (!format_startswith('M', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
/* First digit required */
out->min = (*substr - '0');
++substr;
Expand All @@ -317,12 +377,17 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,

if (sublen == 0) {
bestunit = NPY_FR_m;
if (format_len) {
goto parse_error;
}
goto finish;
}

/* If we make it through this condition block, then the next
* character is a digit. */
if (has_hms_sep && *substr == ':') {
if (!format_startswith(':', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
++substr;
--sublen;
/* Cannot have a trailing ':' */
Expand All @@ -335,6 +400,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}

/* PARSE THE SECONDS */
if (!format_startswith('%', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
if (!format_startswith('S', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
/* First digit required */
out->sec = (*substr - '0');
++substr;
Expand All @@ -360,12 +429,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (sublen > 0 && *substr == '.') {
++substr;
--sublen;
if (!format_startswith('.', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
} else {
bestunit = NPY_FR_s;
goto parse_timezone;
}

/* PARSE THE MICROSECONDS (0 to 6 digits) */
if (!format_startswith('%', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
if (!format_startswith('f', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
numdigits = 0;
for (i = 0; i < 6; ++i) {
out->us *= 10;
Expand Down Expand Up @@ -430,15 +505,24 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
if (!format_startswith(' ', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
}

if (sublen == 0) {
// Unlike NumPy, treating no time zone as naive
if (format_len > 0) {
goto parse_error;
}
goto finish;
}

/* UTC specifier */
if (*substr == 'Z') {
if (!format_startswith('%', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
if (!format_startswith('Z', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
/* "Z" should be equivalent to tz offset "+00:00" */
if (out_local != NULL) {
*out_local = 1;
Expand All @@ -449,12 +533,19 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}

if (sublen == 1) {
if (format_len > 0) {
goto parse_error;
}
goto finish;
} else {
++substr;
--sublen;
}
} else if (*substr == '-' || *substr == '+') {
if (!format_startswith('%', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
if (!format_startswith('z', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
/* Time zone offset */
int offset_neg = 0, offset_hour = 0, offset_minute = 0;

Expand Down Expand Up @@ -538,9 +629,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
if (!format_startswith(' ', format_len, *format, exact)) goto parse_error;
if (format_len) {++format; --format_len;}
}

if (sublen != 0) {
if ((sublen != 0) || (format_len != 0)) {
goto parse_error;
}

Expand Down
Loading