Skip to content

ENH: optional ':' separator in ISO8601 strings #12483

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1200,3 +1200,5 @@ Bug Fixes
- Bug when initializing categorical series with a scalar value. (:issue:`12336`)
- Bug when specifying a UTC ``DatetimeIndex`` by setting ``utc=True`` in ``.to_datetime`` (:issue:`11934`)
- Bug when increasing the buffer size of CSV reader in ``read_csv`` (:issue:`12494`)

- Bug in ``Timestamp`` constructor where microsecond resolution was lost if HHMMSS were not separated with ':' (:issue:`10041`)
279 changes: 143 additions & 136 deletions pandas/src/datetime/np_datetime_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,8 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc,
* + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow
* + Accepts special values "NaT" (not a time), "Today", (current
* day according to local time) and "Now" (current time in UTC).
* + ':' separator between hours, minutes, and seconds is optional. When
* omitted, each component must be 2 digits if it appears. (GH-10041)
*
* 'str' must be a NULL-terminated string, and 'len' must be its length.
* 'unit' should contain -1 if the unit is unknown, or the unit
Expand Down Expand Up @@ -394,15 +396,21 @@ parse_iso_8601_datetime(char *str, int len,
char *substr, sublen;
PANDAS_DATETIMEUNIT bestunit;

/* if date components are separated by one of the valid separators,
* months/days without leading 0s will be parsed
/* If year-month-day are separated by a valid separator,
* months/days without leading zeroes will be parsed
* (though not iso8601). If the components aren't separated,
* an error code will be returned because the date is ambiguous
* 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are
* forbidden here (but parsed as YYMMDD elsewhere).
*/
int has_sep = 0;
char sep = '\0';
char valid_sep[] = {'-', '.', '/', '\\', ' '};
int valid_sep_len = 5;
int has_ymd_sep = 0;
char ymd_sep = '\0';
char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '};
int valid_ymd_sep_len = sizeof(valid_ymd_sep);

/* hour-minute-second may or may not be separated by ':'. If not, then
* each component must be 2 digits. */
int has_hms_sep = 0;
int hour_was_2_digits = 0;

/* Initialize the output to all zeros */
memset(out, 0, sizeof(pandas_datetimestruct));
Expand Down Expand Up @@ -550,67 +558,58 @@ parse_iso_8601_datetime(char *str, int len,
/* Check whether it's a leap-year */
year_leap = is_leapyear(out->year);

/* Next character must be a separator, start of month or end */
/* Next character must be a separator, start of month, or end of string */
if (sublen == 0) {
if (out_local != NULL) {
*out_local = 0;
}
bestunit = PANDAS_FR_Y;
goto finish;
}
else if (!isdigit(*substr)) {
for (i = 0; i < valid_sep_len; ++i) {
if (*substr == valid_sep[i]) {
has_sep = 1;
sep = valid_sep[i];
++substr;
--sublen;

if (!isdigit(*substr)) {
for (i = 0; i < valid_ymd_sep_len; ++i) {
if (*substr == valid_ymd_sep[i]) {
break;
}
}
if (i == valid_sep_len) {
if (i == valid_ymd_sep_len) {
goto parse_error;
}
}

/* Can't have a trailing sep */
if (sublen == 0) {
goto parse_error;
}


/* PARSE THE MONTH (2 digits) */
if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1]))
|| (sublen == 1 && isdigit(substr[0])))) {
out->month = (substr[0] - '0');

if (out->month < 1) {
PyErr_Format(PyExc_ValueError,
"Month out of range in datetime string \"%s\"", str);
goto error;
}
has_ymd_sep = 1;
ymd_sep = valid_ymd_sep[i];
++substr;
--sublen;
/* Cannot have trailing separator */
if (sublen == 0 || !isdigit(*substr)) {
goto parse_error;
}
}
else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->month = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->month < 1 || out->month > 12) {
PyErr_Format(PyExc_ValueError,
"Month out of range in datetime string \"%s\"", str);
goto error;
}
substr += 2;
sublen -= 2;
/* PARSE THE MONTH */
/* First digit required */
out->month = (*substr - '0');
++substr;
--sublen;
/* Second digit optional if there was a separator */
if (isdigit(*substr)) {
out->month = 10 * out->month + (*substr - '0');
++substr;
--sublen;
}
else {
else if (!has_ymd_sep) {
goto parse_error;
}
if (out->month < 1 || out->month > 12) {
PyErr_Format(PyExc_ValueError,
"Month out of range in datetime string \"%s\"", str);
goto error;
}

/* Next character must be a '-' or the end of the string */
/* Next character must be the separator, start of day, or end of string */
if (sublen == 0) {
/* dates of form YYYYMM are not valid */
if (!has_sep) {
/* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */
if (!has_ymd_sep) {
goto parse_error;
}
if (out_local != NULL) {
Expand All @@ -619,47 +618,40 @@ parse_iso_8601_datetime(char *str, int len,
bestunit = PANDAS_FR_M;
goto finish;
}
else if (has_sep && *substr == sep) {

if (has_ymd_sep) {
/* Must have separator, but cannot be trailing */
if (*substr != ymd_sep || sublen == 1) {
goto parse_error;
}
++substr;
--sublen;
}
else if (!isdigit(*substr)) {
goto parse_error;
}

/* Can't have a trailing '-' */
if (sublen == 0) {
goto parse_error;
/* PARSE THE DAY */
/* First digit required */
if (!isdigit(*substr)) {
goto parse_error;
}

/* PARSE THE DAY (2 digits) */
if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1]))
|| (sublen == 1 && isdigit(substr[0])))) {
out->day = (substr[0] - '0');

if (out->day < 1) {
PyErr_Format(PyExc_ValueError,
"Day out of range in datetime string \"%s\"", str);
goto error;
}
out->day = (*substr - '0');
++substr;
--sublen;
/* Second digit optional if there was a separator */
if (isdigit(*substr)) {
out->day = 10 * out->day + (*substr - '0');
++substr;
--sublen;
}
else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->day = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->day < 1 ||
out->day > days_per_month_table[year_leap][out->month-1]) {
PyErr_Format(PyExc_ValueError,
"Day out of range in datetime string \"%s\"", str);
goto error;
}
substr += 2;
sublen -= 2;
}
else {
else if (!has_ymd_sep) {
goto parse_error;
}
if (out->day < 1 ||
out->day > days_per_month_table[year_leap][out->month-1])
{
PyErr_Format(PyExc_ValueError,
"Day out of range in datetime string \"%s\"", str);
goto error;
}

/* Next character must be a 'T', ' ', or end of string */
if (sublen == 0) {
Expand All @@ -669,104 +661,119 @@ parse_iso_8601_datetime(char *str, int len,
bestunit = PANDAS_FR_D;
goto finish;
}
else if (*substr != 'T' && *substr != ' ') {

if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
goto parse_error;
}
else {
++substr;
--sublen;

/* PARSE THE HOURS */
/* First digit required */
if (!isdigit(*substr)) {
goto parse_error;
}
out->hour = (*substr - '0');
++substr;
--sublen;
/* Second digit optional */
if (isdigit(*substr)) {
hour_was_2_digits = 1;
out->hour = 10 * out->hour + (*substr - '0');
++substr;
--sublen;
}

/* PARSE THE HOURS (2 digits) */
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->hour = 10 * (substr[0] - '0') + (substr[1] - '0');

if (out->hour >= 24) {
PyErr_Format(PyExc_ValueError,
"Hours out of range in datetime string \"%s\"", str);
goto error;
}
substr += 2;
sublen -= 2;
}
else if (sublen >= 1 && isdigit(substr[0])) {
out->hour = substr[0] - '0';
++substr;
--sublen;
}
else {
goto parse_error;
}

/* Next character must be a ':' or the end of the string */
if (sublen > 0 && *substr == ':') {
if (sublen == 0) {
if (!hour_was_2_digits) {
goto parse_error;
}
bestunit = PANDAS_FR_h;
goto finish;
}

if (*substr == ':') {
has_hms_sep = 1;
++substr;
--sublen;
/* Cannot have a trailing separator */
if (sublen == 0 || !isdigit(*substr)) {
goto parse_error;
}
}
else {
else if (!isdigit(*substr)) {
if (!hour_was_2_digits) {
goto parse_error;
}
bestunit = PANDAS_FR_h;
goto parse_timezone;
}

/* Can't have a trailing ':' */
if (sublen == 0) {
goto parse_error;
}

/* PARSE THE MINUTES (2 digits) */
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->min = 10 * (substr[0] - '0') + (substr[1] - '0');

/* PARSE THE MINUTES */
/* First digit required */
out->min = (*substr - '0');
++substr;
--sublen;
/* Second digit optional if there was a separator */
if (isdigit(*substr)) {
out->min = 10 * out->min + (*substr - '0');
++substr;
--sublen;
if (out->min >= 60) {
PyErr_Format(PyExc_ValueError,
"Minutes out of range in datetime string \"%s\"", str);
"Minutes out of range in datetime string \"%s\"", str);
goto error;
}
substr += 2;
sublen -= 2;
}
else if (sublen >= 1 && isdigit(substr[0])) {
out->min = substr[0] - '0';
++substr;
--sublen;
}
else {
else if (!has_hms_sep) {
goto parse_error;
}

/* Next character must be a ':' or the end of the string */
if (sublen > 0 && *substr == ':') {
if (sublen == 0) {
bestunit = PANDAS_FR_m;
goto finish;
}

/* If we make it through this condition block, then the next
* character is a digit. */
if (has_hms_sep && *substr == ':') {
++substr;
--sublen;
/* Cannot have a trailing ':' */
if (sublen == 0 || !isdigit(*substr)) {
goto parse_error;
}
}
else if (!has_hms_sep && isdigit(*substr)) {
}
else {
bestunit = PANDAS_FR_m;
goto parse_timezone;
}

/* Can't have a trailing ':' */
if (sublen == 0) {
goto parse_error;
}

/* PARSE THE SECONDS (2 digits) */
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
out->sec = 10 * (substr[0] - '0') + (substr[1] - '0');

/* PARSE THE SECONDS */
/* First digit required */
out->sec = (*substr - '0');
++substr;
--sublen;
/* Second digit optional if there was a separator */
if (isdigit(*substr)) {
out->sec = 10 * out->sec + (*substr - '0');
++substr;
--sublen;
if (out->sec >= 60) {
PyErr_Format(PyExc_ValueError,
"Seconds out of range in datetime string \"%s\"", str);
"Seconds out of range in datetime string \"%s\"", str);
goto error;
}
substr += 2;
sublen -= 2;
}
else if (sublen >= 1 && isdigit(substr[0])) {
out->sec = substr[0] - '0';
++substr;
--sublen;
}
else {
else if (!has_hms_sep) {
goto parse_error;
}

Expand Down
Loading