Skip to content

BUG: pandas.to_datetime() does not respect exact format string with ISO8601 #49232

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pandas/_libs/tslib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ def array_to_datetime(
utc: bool = ...,
require_iso8601: bool = ...,
allow_mixed: bool = ...,
format: str | None = ...,
exact: bool = ...,
) -> tuple[np.ndarray, tzinfo | None]: ...

# returned ndarray may be object dtype or datetime64[ns]
38 changes: 35 additions & 3 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import warnings

cimport cython
Expand Down Expand Up @@ -85,6 +86,8 @@ def _test_parse_iso8601(ts: str):
_TSObject obj
int out_local = 0, out_tzoffset = 0
NPY_DATETIMEUNIT out_bestunit
char inferred_format
int inferred_format_len

obj = _TSObject()

Expand All @@ -93,7 +96,7 @@ def _test_parse_iso8601(ts: str):
elif ts == 'today':
return Timestamp.now().normalize()

string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True)
string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, &inferred_format, &inferred_format_len)
obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts)
check_dts_bounds(&obj.dts)
if out_local == 1:
Expand Down Expand Up @@ -449,6 +452,8 @@ cpdef array_to_datetime(
bint utc=False,
bint require_iso8601=False,
bint allow_mixed=False,
str format=None,
bint exact=False,
):
"""
Converts a 1D array of date-like values to a numpy array of either:
Expand Down Expand Up @@ -509,6 +514,8 @@ cpdef array_to_datetime(
datetime py_dt
tzinfo tz_out = None
bint found_tz = False, found_naive = False
char inferred_format[100]
int inferred_format_len
Copy link
Member Author

@MarcoGorelli MarcoGorelli Oct 21, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @WillAyd - I don't have much experience with C. What would be a better way to do this?

What I'm trying to do is:

  • make a character array inferred_format in which to record the format which is guessed within parse_iso_8601_datetime
  • record the length of the inferred format into inferred_format_len

Then I compare format with inferred_format[:inferred_format_len].

I'll try to figure this out, but I'm aware that a fixed-size buffer like char inferred_format[100] is not the solution; I'm just seeing if I can get something working first.

EDIT: looks like strcat from string.h could work for this

EDIT2: passing a string buffer for parse_iso_8601_datetime to write into is proving too hard. I think I'll try the simpler approach of passing format to parse_iso_8601_datetime and checking, as the string is parsed, that the inferred format matches it.


# specify error conditions
assert is_raise or is_ignore or is_coerce
Expand Down Expand Up @@ -568,6 +575,15 @@ cpdef array_to_datetime(
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)

elif is_integer_object(val) or is_float_object(val):
if require_iso8601:
if is_coerce:
iresult[i] = NPY_NAT
continue
elif is_raise:
raise ValueError(
f"time data \"{val}\" at position {i} doesn't match format {format}"
)
return values, tz_out
# these must be ns unit by-definition
seen_integer = True

Expand Down Expand Up @@ -598,7 +614,8 @@ cpdef array_to_datetime(

string_to_dts_failed = string_to_dts(
val, &dts, &out_bestunit, &out_local,
&out_tzoffset, False
&out_tzoffset, False, inferred_format,
&inferred_format_len,
)
if string_to_dts_failed:
# An error at this point is a _parsing_ error
Expand All @@ -613,7 +630,7 @@ cpdef array_to_datetime(
continue
elif is_raise:
raise ValueError(
f"time data \"{val}\" at position {i} doesn't match format specified"
f"time data \"{val}\" at position {i} doesn't match {format}"
)
return values, tz_out

Expand Down Expand Up @@ -644,6 +661,21 @@ cpdef array_to_datetime(
_ts = convert_datetime_to_tsobject(py_dt, None)
iresult[i] = _ts.value
if not string_to_dts_failed:
if require_iso8601:
guess = inferred_format[:inferred_format_len].decode('utf-8')
if (
(exact and format != guess)
or (not exact and re.search(format, guess) is None)
):
if is_coerce:
iresult[i] = NPY_NAT
continue
elif is_raise:
raise ValueError(
f"time data \"{val}\" at position {i} doesn't "
f"match format {format}"
)
return values, tz_out
# No error reported by string_to_dts, pick back up
# where we left off
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,8 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit,
datetime dt
int64_t ival
NPY_DATETIMEUNIT out_bestunit
char inferred_format
int inferred_format_len

if len(ts) == 0 or ts in nat_strings:
ts = NaT
Expand All @@ -488,7 +490,7 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit,
else:
string_to_dts_failed = string_to_dts(
ts, &dts, &out_bestunit, &out_local,
&out_tzoffset, False
&out_tzoffset, False, &inferred_format, &inferred_format_len
)
if not string_to_dts_failed:
try:
Expand Down
2 changes: 2 additions & 0 deletions pandas/_libs/tslibs/np_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ cdef int string_to_dts(
int* out_local,
int* out_tzoffset,
bint want_exc,
char *inferred_format,
int *inferred_format_len,
) except? -1

cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype)
Expand Down
8 changes: 6 additions & 2 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ cdef extern from "src/datetime/np_datetime_strings.h":
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
npy_datetimestruct *out,
NPY_DATETIMEUNIT *out_bestunit,
int *out_local, int *out_tzoffset)
int *out_local, int *out_tzoffset,
char *inferred_format, int *inferred_format_len)


# ----------------------------------------------------------------------
Expand Down Expand Up @@ -273,14 +274,17 @@ cdef inline int string_to_dts(
int* out_local,
int* out_tzoffset,
bint want_exc,
char *inferred_format,
int *inferred_format_len
) except? -1:
cdef:
Py_ssize_t length
const char* buf

buf = get_c_string_buf_and_size(val, &length)
return parse_iso_8601_datetime(buf, length, want_exc,
dts, out_bestunit, out_local, out_tzoffset)
dts, out_bestunit, out_local, out_tzoffset,
inferred_format, inferred_format_len)


cpdef ndarray astype_overflowsafe(
Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,8 @@ cdef parse_datetime_string_with_reso(
NPY_DATETIMEUNIT out_bestunit
int out_local
int out_tzoffset
char inferred_format
int inferred_format_len

if not _does_string_look_like_datetime(date_string):
raise ValueError(f'Given date string {date_string} not likely a datetime')
Expand All @@ -409,7 +411,7 @@ cdef parse_datetime_string_with_reso(
# TODO: does this render some/all of parse_delimited_date redundant?
string_to_dts_failed = string_to_dts(
date_string, &dts, &out_bestunit, &out_local,
&out_tzoffset, False
&out_tzoffset, False, &inferred_format, &inferred_format_len
)
if not string_to_dts_failed:
if dts.ps != 0 or out_local:
Expand Down
65 changes: 64 additions & 1 deletion pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ This file implements string parsing and creation for NumPy datetime.
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
npy_datetimestruct *out,
NPY_DATETIMEUNIT *out_bestunit,
int *out_local, int *out_tzoffset) {
int *out_local, int *out_tzoffset,
char *inferred_format, int *inferred_format_len) {
int inferred_format_idx = 0;
int year_leap = 0;
int i, numdigits;
const char *substr;
Expand Down Expand Up @@ -104,6 +106,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
inferred_format[inferred_format_idx] = ' ';
++inferred_format_idx;
}

/* Leading '-' sign for negative year */
Expand All @@ -125,6 +129,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,

substr += 4;
sublen -= 4;
inferred_format[inferred_format_idx] = '%';
++inferred_format_idx;
inferred_format[inferred_format_idx] = 'Y';
++inferred_format_idx;
}

/* Negate the year if necessary */
Expand Down Expand Up @@ -156,6 +164,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
ymd_sep = valid_ymd_sep[i];
++substr;
--sublen;
inferred_format[inferred_format_idx] = ymd_sep;
++inferred_format_idx;
/* Cannot have trailing separator */
if (sublen == 0 || !isdigit(*substr)) {
goto parse_error;
Expand Down Expand Up @@ -183,6 +193,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
goto error;
}

inferred_format[inferred_format_idx] = '%';
++inferred_format_idx;
inferred_format[inferred_format_idx] = 'm';
++inferred_format_idx;

/* Next character must be the separator, start of day, or end of string */
if (sublen == 0) {
bestunit = NPY_FR_M;
Expand All @@ -201,6 +216,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (*substr != ymd_sep || sublen == 1) {
goto parse_error;
}
inferred_format[inferred_format_idx] = *substr;
++inferred_format_idx;
++substr;
--sublen;
}
Expand Down Expand Up @@ -230,6 +247,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
goto error;
}

inferred_format[inferred_format_idx] = '%';
++inferred_format_idx;
inferred_format[inferred_format_idx] = 'd';
++inferred_format_idx;

/* Next character must be a 'T', ' ', or end of string */
if (sublen == 0) {
if (out_local != NULL) {
Expand All @@ -242,6 +264,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
goto parse_error;
}
inferred_format[inferred_format_idx] = *substr;
++inferred_format_idx;
++substr;
--sublen;

Expand Down Expand Up @@ -269,6 +293,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}
}

inferred_format[inferred_format_idx] = '%';
++inferred_format_idx;
inferred_format[inferred_format_idx] = 'H';
++inferred_format_idx;

/* Next character must be a ':' or the end of the string */
if (sublen == 0) {
if (!hour_was_2_digits) {
Expand All @@ -279,6 +308,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
}

if (*substr == ':') {
inferred_format[inferred_format_idx] = ':';
++inferred_format_idx;
has_hms_sep = 1;
++substr;
--sublen;
Expand Down Expand Up @@ -315,6 +346,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
goto parse_error;
}

inferred_format[inferred_format_idx] = '%';
++inferred_format_idx;
inferred_format[inferred_format_idx] = 'M';
++inferred_format_idx;

if (sublen == 0) {
bestunit = NPY_FR_m;
goto finish;
Expand All @@ -323,6 +359,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
/* If we make it through this condition block, then the next
* character is a digit. */
if (has_hms_sep && *substr == ':') {
inferred_format[inferred_format_idx] = ':';
++inferred_format_idx;
++substr;
--sublen;
/* Cannot have a trailing ':' */
Expand Down Expand Up @@ -356,15 +394,27 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
goto parse_error;
}

inferred_format[inferred_format_idx] = '%';
++inferred_format_idx;
inferred_format[inferred_format_idx] = 'S';
++inferred_format_idx;

/* Next character may be a '.' indicating fractional seconds */
if (sublen > 0 && *substr == '.') {
++substr;
--sublen;
inferred_format[inferred_format_idx] = '.';
++inferred_format_idx;
} else {
bestunit = NPY_FR_s;
goto parse_timezone;
}

inferred_format[inferred_format_idx] = '%';
++inferred_format_idx;
inferred_format[inferred_format_idx] = 'f';
++inferred_format_idx;

/* PARSE THE MICROSECONDS (0 to 6 digits) */
numdigits = 0;
for (i = 0; i < 6; ++i) {
Expand Down Expand Up @@ -430,6 +480,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
inferred_format[inferred_format_idx] = ' ';
++inferred_format_idx;
}

if (sublen == 0) {
Expand All @@ -439,6 +491,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,

/* UTC specifier */
if (*substr == 'Z') {
inferred_format[inferred_format_idx] = '%';
++inferred_format_idx;
inferred_format[inferred_format_idx] = 'Z';
++inferred_format_idx;
/* "Z" should be equivalent to tz offset "+00:00" */
if (out_local != NULL) {
*out_local = 1;
Expand All @@ -455,6 +511,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
--sublen;
}
} else if (*substr == '-' || *substr == '+') {
inferred_format[inferred_format_idx] = '%';
++inferred_format_idx;
inferred_format[inferred_format_idx] = 'z';
++inferred_format_idx;
/* Time zone offset */
int offset_neg = 0, offset_hour = 0, offset_minute = 0;

Expand Down Expand Up @@ -538,6 +598,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
while (sublen > 0 && isspace(*substr)) {
++substr;
--sublen;
inferred_format[inferred_format_idx] = ' ';
++inferred_format_idx;
}

if (sublen != 0) {
Expand All @@ -548,6 +610,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc,
if (out_bestunit != NULL) {
*out_bestunit = bestunit;
}
*inferred_format_len = inferred_format_idx;
return 0;

parse_error:
Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ parse_iso_8601_datetime(const char *str, int len, int want_exc,
npy_datetimestruct *out,
NPY_DATETIMEUNIT *out_bestunit,
int *out_local,
int *out_tzoffset);
int *out_tzoffset,
char *inferred_format,
int *inferred_format_len);

/*
* Provides a string length to use for converting datetime
Expand Down
Loading