Skip to content

Commit 60b307f

Browse files
thejohnfreeman authored and jreback committed
ENH: optional ':' separator in ISO8601 strings
closes #10041

Author: John Freeman <[email protected]>

Closes #12483 from thejohnfreeman/GH-10041 and squashes the following commits:

720b6a7 [John Freeman] ENH: Optional ':' HHMMSS separator in ISO8601 strings
1 parent 3928102 commit 60b307f

File tree

3 files changed

+168
-140
lines changed

3 files changed

+168
-140
lines changed

doc/source/whatsnew/v0.18.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1169,7 +1169,7 @@ Bug Fixes
11691169
- Bug in ``.to_csv`` ignoring formatting parameters ``decimal``, ``na_rep``, ``float_format`` for float indexes (:issue:`11553`)
11701170
- Bug in ``Int64Index`` and ``Float64Index`` preventing the use of the modulo operator (:issue:`9244`)
11711171
- Bug in ``MultiIndex.drop`` for not lexsorted multi-indexes (:issue:`12078`)
1172-
1172+
- Bug in ``Timestamp`` constructor where microsecond resolution was lost if HHMMSS were not separated with ':' (:issue:`10041`)
11731173
- Bug in ``DataFrame`` when masking an empty ``DataFrame`` (:issue:`11859`)
11741174

11751175

pandas/src/datetime/np_datetime_strings.c

+143-136
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,8 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc,
355355
* + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow
356356
* + Accepts special values "NaT" (not a time), "Today", (current
357357
* day according to local time) and "Now" (current time in UTC).
358+
* + ':' separator between hours, minutes, and seconds is optional. When
359+
* omitted, each component must be 2 digits if it appears. (GH-10041)
358360
*
359361
* 'str' must be a NULL-terminated string, and 'len' must be its length.
360362
* 'unit' should contain -1 if the unit is unknown, or the unit
@@ -394,15 +396,21 @@ parse_iso_8601_datetime(char *str, int len,
394396
char *substr, sublen;
395397
PANDAS_DATETIMEUNIT bestunit;
396398

397-
/* if date components in are separated by one of valid separators
398-
* months/days without leadings 0s will be parsed
399+
/* If year-month-day are separated by a valid separator,
400+
* months/days without leading zeroes will be parsed
399401
* (though not iso8601). If the components aren't separated,
400-
* an error code will be retuned because the date is ambigous
402+
* 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are
403+
* forbidden here (but parsed as YYMMDD elsewhere).
401404
*/
402-
int has_sep = 0;
403-
char sep = '\0';
404-
char valid_sep[] = {'-', '.', '/', '\\', ' '};
405-
int valid_sep_len = 5;
405+
int has_ymd_sep = 0;
406+
char ymd_sep = '\0';
407+
char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '};
408+
int valid_ymd_sep_len = sizeof(valid_ymd_sep);
409+
410+
/* hour-minute-second may or may not be separated by ':'. If not, then
411+
* each component must be 2 digits. */
412+
int has_hms_sep = 0;
413+
int hour_was_2_digits = 0;
406414

407415
/* Initialize the output to all zeros */
408416
memset(out, 0, sizeof(pandas_datetimestruct));
@@ -550,67 +558,58 @@ parse_iso_8601_datetime(char *str, int len,
550558
/* Check whether it's a leap-year */
551559
year_leap = is_leapyear(out->year);
552560

553-
/* Next character must be a separator, start of month or end */
561+
/* Next character must be a separator, start of month, or end of string */
554562
if (sublen == 0) {
555563
if (out_local != NULL) {
556564
*out_local = 0;
557565
}
558566
bestunit = PANDAS_FR_Y;
559567
goto finish;
560568
}
561-
else if (!isdigit(*substr)) {
562-
for (i = 0; i < valid_sep_len; ++i) {
563-
if (*substr == valid_sep[i]) {
564-
has_sep = 1;
565-
sep = valid_sep[i];
566-
++substr;
567-
--sublen;
569+
570+
if (!isdigit(*substr)) {
571+
for (i = 0; i < valid_ymd_sep_len; ++i) {
572+
if (*substr == valid_ymd_sep[i]) {
568573
break;
569574
}
570575
}
571-
if (i == valid_sep_len) {
576+
if (i == valid_ymd_sep_len) {
572577
goto parse_error;
573578
}
574-
}
575-
576-
/* Can't have a trailing sep */
577-
if (sublen == 0) {
578-
goto parse_error;
579-
}
580-
581-
582-
/* PARSE THE MONTH (2 digits) */
583-
if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1]))
584-
|| (sublen == 1 && isdigit(substr[0])))) {
585-
out->month = (substr[0] - '0');
586-
587-
if (out->month < 1) {
588-
PyErr_Format(PyExc_ValueError,
589-
"Month out of range in datetime string \"%s\"", str);
590-
goto error;
591-
}
579+
has_ymd_sep = 1;
580+
ymd_sep = valid_ymd_sep[i];
592581
++substr;
593582
--sublen;
583+
/* Cannot have trailing separator */
584+
if (sublen == 0 || !isdigit(*substr)) {
585+
goto parse_error;
586+
}
594587
}
595-
else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
596-
out->month = 10 * (substr[0] - '0') + (substr[1] - '0');
597588

598-
if (out->month < 1 || out->month > 12) {
599-
PyErr_Format(PyExc_ValueError,
600-
"Month out of range in datetime string \"%s\"", str);
601-
goto error;
602-
}
603-
substr += 2;
604-
sublen -= 2;
589+
/* PARSE THE MONTH */
590+
/* First digit required */
591+
out->month = (*substr - '0');
592+
++substr;
593+
--sublen;
594+
/* Second digit optional if there was a separator */
595+
if (isdigit(*substr)) {
596+
out->month = 10 * out->month + (*substr - '0');
597+
++substr;
598+
--sublen;
605599
}
606-
else {
600+
else if (!has_ymd_sep) {
607601
goto parse_error;
608602
}
603+
if (out->month < 1 || out->month > 12) {
604+
PyErr_Format(PyExc_ValueError,
605+
"Month out of range in datetime string \"%s\"", str);
606+
goto error;
607+
}
609608

610-
/* Next character must be a '-' or the end of the string */
609+
/* Next character must be the separator, start of day, or end of string */
611610
if (sublen == 0) {
612-
/* dates of form YYYYMM are not valid */
613-
if (!has_sep) {
611+
/* Forbid YYYYMM. Six-digit strings are parsed as YYMMDD elsewhere. */
612+
if (!has_ymd_sep) {
614613
goto parse_error;
615614
}
616615
if (out_local != NULL) {
@@ -619,47 +618,40 @@ parse_iso_8601_datetime(char *str, int len,
619618
bestunit = PANDAS_FR_M;
620619
goto finish;
621620
}
622-
else if (has_sep && *substr == sep) {
621+
622+
if (has_ymd_sep) {
623+
/* Must have separator, but cannot be trailing */
624+
if (*substr != ymd_sep || sublen == 1) {
625+
goto parse_error;
626+
}
623627
++substr;
624628
--sublen;
625629
}
626-
else if (!isdigit(*substr)) {
627-
goto parse_error;
628-
}
629630

630-
/* Can't have a trailing '-' */
631-
if (sublen == 0) {
632-
goto parse_error;
631+
/* PARSE THE DAY */
632+
/* First digit required */
633+
if (!isdigit(*substr)) {
634+
goto parse_error;
633635
}
634-
635-
/* PARSE THE DAY (2 digits) */
636-
if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1]))
637-
|| (sublen == 1 && isdigit(substr[0])))) {
638-
out->day = (substr[0] - '0');
639-
640-
if (out->day < 1) {
641-
PyErr_Format(PyExc_ValueError,
642-
"Day out of range in datetime string \"%s\"", str);
643-
goto error;
644-
}
636+
out->day = (*substr - '0');
637+
++substr;
638+
--sublen;
639+
/* Second digit optional if there was a separator */
640+
if (isdigit(*substr)) {
641+
out->day = 10 * out->day + (*substr - '0');
645642
++substr;
646643
--sublen;
647644
}
648-
else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
649-
out->day = 10 * (substr[0] - '0') + (substr[1] - '0');
650-
651-
if (out->day < 1 ||
652-
out->day > days_per_month_table[year_leap][out->month-1]) {
653-
PyErr_Format(PyExc_ValueError,
654-
"Day out of range in datetime string \"%s\"", str);
655-
goto error;
656-
}
657-
substr += 2;
658-
sublen -= 2;
659-
}
660-
else {
645+
else if (!has_ymd_sep) {
661646
goto parse_error;
662647
}
648+
if (out->day < 1 ||
649+
out->day > days_per_month_table[year_leap][out->month-1])
650+
{
651+
PyErr_Format(PyExc_ValueError,
652+
"Day out of range in datetime string \"%s\"", str);
653+
goto error;
654+
}
663655

664656
/* Next character must be a 'T', ' ', or end of string */
665657
if (sublen == 0) {
@@ -669,104 +661,119 @@ parse_iso_8601_datetime(char *str, int len,
669661
bestunit = PANDAS_FR_D;
670662
goto finish;
671663
}
672-
else if (*substr != 'T' && *substr != ' ') {
664+
665+
if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
673666
goto parse_error;
674667
}
675-
else {
668+
++substr;
669+
--sublen;
670+
671+
/* PARSE THE HOURS */
672+
/* First digit required */
673+
if (!isdigit(*substr)) {
674+
goto parse_error;
675+
}
676+
out->hour = (*substr - '0');
677+
++substr;
678+
--sublen;
679+
/* Second digit optional */
680+
if (isdigit(*substr)) {
681+
hour_was_2_digits = 1;
682+
out->hour = 10 * out->hour + (*substr - '0');
676683
++substr;
677684
--sublen;
678-
}
679-
680-
/* PARSE THE HOURS (2 digits) */
681-
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
682-
out->hour = 10 * (substr[0] - '0') + (substr[1] - '0');
683-
684685
if (out->hour >= 24) {
685686
PyErr_Format(PyExc_ValueError,
686687
"Hours out of range in datetime string \"%s\"", str);
687688
goto error;
688689
}
689-
substr += 2;
690-
sublen -= 2;
691-
}
692-
else if (sublen >= 1 && isdigit(substr[0])) {
693-
out->hour = substr[0] - '0';
694-
++substr;
695-
--sublen;
696-
}
697-
else {
698-
goto parse_error;
699690
}
700691

701692
/* Next character must be a ':', the start of the minutes, a timezone, or the end of the string */
702-
if (sublen > 0 && *substr == ':') {
693+
if (sublen == 0) {
694+
if (!hour_was_2_digits) {
695+
goto parse_error;
696+
}
697+
bestunit = PANDAS_FR_h;
698+
goto finish;
699+
}
700+
701+
if (*substr == ':') {
702+
has_hms_sep = 1;
703703
++substr;
704704
--sublen;
705+
/* Cannot have a trailing separator */
706+
if (sublen == 0 || !isdigit(*substr)) {
707+
goto parse_error;
708+
}
705709
}
706-
else {
710+
else if (!isdigit(*substr)) {
711+
if (!hour_was_2_digits) {
712+
goto parse_error;
713+
}
707714
bestunit = PANDAS_FR_h;
708715
goto parse_timezone;
709716
}
710717

711-
/* Can't have a trailing ':' */
712-
if (sublen == 0) {
713-
goto parse_error;
714-
}
715-
716-
/* PARSE THE MINUTES (2 digits) */
717-
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
718-
out->min = 10 * (substr[0] - '0') + (substr[1] - '0');
719-
718+
/* PARSE THE MINUTES */
719+
/* First digit required */
720+
out->min = (*substr - '0');
721+
++substr;
722+
--sublen;
723+
/* Second digit optional if there was a separator */
724+
if (isdigit(*substr)) {
725+
out->min = 10 * out->min + (*substr - '0');
726+
++substr;
727+
--sublen;
720728
if (out->min >= 60) {
721729
PyErr_Format(PyExc_ValueError,
722-
"Minutes out of range in datetime string \"%s\"", str);
730+
"Minutes out of range in datetime string \"%s\"", str);
723731
goto error;
724732
}
725-
substr += 2;
726-
sublen -= 2;
727-
}
728-
else if (sublen >= 1 && isdigit(substr[0])) {
729-
out->min = substr[0] - '0';
730-
++substr;
731-
--sublen;
732733
}
733-
else {
734+
else if (!has_hms_sep) {
734735
goto parse_error;
735736
}
736737

737-
/* Next character must be a ':' or the end of the string */
738-
if (sublen > 0 && *substr == ':') {
738+
if (sublen == 0) {
739+
bestunit = PANDAS_FR_m;
740+
goto finish;
741+
}
742+
743+
/* If we make it through this condition block, then the next
744+
* character is a digit. */
745+
if (has_hms_sep && *substr == ':') {
739746
++substr;
740747
--sublen;
748+
/* Cannot have a trailing ':' */
749+
if (sublen == 0 || !isdigit(*substr)) {
750+
goto parse_error;
751+
}
752+
}
753+
else if (!has_hms_sep && isdigit(*substr)) {
741754
}
742755
else {
743756
bestunit = PANDAS_FR_m;
744757
goto parse_timezone;
745758
}
746759

747-
/* Can't have a trailing ':' */
748-
if (sublen == 0) {
749-
goto parse_error;
750-
}
751-
752-
/* PARSE THE SECONDS (2 digits) */
753-
if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
754-
out->sec = 10 * (substr[0] - '0') + (substr[1] - '0');
755-
760+
/* PARSE THE SECONDS */
761+
/* First digit required */
762+
out->sec = (*substr - '0');
763+
++substr;
764+
--sublen;
765+
/* Second digit optional if there was a separator */
766+
if (isdigit(*substr)) {
767+
out->sec = 10 * out->sec + (*substr - '0');
768+
++substr;
769+
--sublen;
756770
if (out->sec >= 60) {
757771
PyErr_Format(PyExc_ValueError,
758-
"Seconds out of range in datetime string \"%s\"", str);
772+
"Seconds out of range in datetime string \"%s\"", str);
759773
goto error;
760774
}
761-
substr += 2;
762-
sublen -= 2;
763-
}
764-
else if (sublen >= 1 && isdigit(substr[0])) {
765-
out->sec = substr[0] - '0';
766-
++substr;
767-
--sublen;
768775
}
769-
else {
776+
else if (!has_hms_sep) {
770777
goto parse_error;
771778
}
772779

0 commit comments

Comments
 (0)