From 720b6a73f86fb5fec5c4a1e7c30dd4d93c5d6523 Mon Sep 17 00:00:00 2001 From: John Freeman Date: Fri, 26 Feb 2016 20:22:52 -0500 Subject: [PATCH] ENH: Optional ':' HHMMSS separator in ISO8601 strings Allows Timestamps constructed from strings without the ':' separator in HHMMSS to preserve microsecond resolution. --- doc/source/whatsnew/v0.18.0.txt | 2 + pandas/src/datetime/np_datetime_strings.c | 279 +++++++++++----------- pandas/tseries/tests/test_tslib.py | 27 ++- 3 files changed, 169 insertions(+), 139 deletions(-) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 7f253ae437d9f..dc74791e916cc 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -1200,3 +1200,5 @@ Bug Fixes - Bug when initializing categorical series with a scalar value. (:issue:`12336`) - Bug when specifying a UTC ``DatetimeIndex`` by setting ``utc=True`` in ``.to_datetime`` (:issue:`11934`) - Bug when increasing the buffer size of CSV reader in ``read_csv`` (:issue:`12494`) + +- Bug in ``Timestamp`` constructor where microsecond resolution was lost if HHMMSS were not separated with ':' (:issue:`10041`) diff --git a/pandas/src/datetime/np_datetime_strings.c b/pandas/src/datetime/np_datetime_strings.c index 33ddc6c6e1f27..3a1d37f86cc28 100644 --- a/pandas/src/datetime/np_datetime_strings.c +++ b/pandas/src/datetime/np_datetime_strings.c @@ -355,6 +355,8 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, * + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow * + Accepts special values "NaT" (not a time), "Today", (current * day according to local time) and "Now" (current time in UTC). + * + ':' separator between hours, minutes, and seconds is optional. When + * omitted, each component must be 2 digits if it appears. (GH-10041) * * 'str' must be a NULL-terminated string, and 'len' must be its length. 
* 'unit' should contain -1 if the unit is unknown, or the unit @@ -394,15 +396,21 @@ parse_iso_8601_datetime(char *str, int len, char *substr, sublen; PANDAS_DATETIMEUNIT bestunit; - /* if date components in are separated by one of valid separators - * months/days without leadings 0s will be parsed + /* If year-month-day are separated by a valid separator, + * months/days without leading zeroes will be parsed * (though not iso8601). If the components aren't separated, - * an error code will be retuned because the date is ambigous + * 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are + * forbidden here (but parsed as YYMMDD elsewhere). */ - int has_sep = 0; - char sep = '\0'; - char valid_sep[] = {'-', '.', '/', '\\', ' '}; - int valid_sep_len = 5; + int has_ymd_sep = 0; + char ymd_sep = '\0'; + char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '}; + int valid_ymd_sep_len = sizeof(valid_ymd_sep); + + /* hour-minute-second may or may not be separated by ':'. If not, then + * each component must be 2 digits. 
*/ + int has_hms_sep = 0; + int hour_was_2_digits = 0; /* Initialize the output to all zeros */ memset(out, 0, sizeof(pandas_datetimestruct)); @@ -550,7 +558,7 @@ parse_iso_8601_datetime(char *str, int len, /* Check whether it's a leap-year */ year_leap = is_leapyear(out->year); - /* Next character must be a separator, start of month or end */ + /* Next character must be a separator, start of month, or end of string */ if (sublen == 0) { if (out_local != NULL) { *out_local = 0; @@ -558,59 +566,50 @@ parse_iso_8601_datetime(char *str, int len, bestunit = PANDAS_FR_Y; goto finish; } - else if (!isdigit(*substr)) { - for (i = 0; i < valid_sep_len; ++i) { - if (*substr == valid_sep[i]) { - has_sep = 1; - sep = valid_sep[i]; - ++substr; - --sublen; + + if (!isdigit(*substr)) { + for (i = 0; i < valid_ymd_sep_len; ++i) { + if (*substr == valid_ymd_sep[i]) { break; } } - if (i == valid_sep_len) { + if (i == valid_ymd_sep_len) { goto parse_error; } - } - - /* Can't have a trailing sep */ - if (sublen == 0) { - goto parse_error; - } - - - /* PARSE THE MONTH (2 digits) */ - if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1])) - || (sublen == 1 && isdigit(substr[0])))) { - out->month = (substr[0] - '0'); - - if (out->month < 1) { - PyErr_Format(PyExc_ValueError, - "Month out of range in datetime string \"%s\"", str); - goto error; - } + has_ymd_sep = 1; + ymd_sep = valid_ymd_sep[i]; ++substr; --sublen; + /* Cannot have trailing separator */ + if (sublen == 0 || !isdigit(*substr)) { + goto parse_error; + } } - else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { - out->month = 10 * (substr[0] - '0') + (substr[1] - '0'); - if (out->month < 1 || out->month > 12) { - PyErr_Format(PyExc_ValueError, - "Month out of range in datetime string \"%s\"", str); - goto error; - } - substr += 2; - sublen -= 2; + /* PARSE THE MONTH */ + /* First digit required */ + out->month = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional if there 
was a separator */ + if (isdigit(*substr)) { + out->month = 10 * out->month + (*substr - '0'); + ++substr; + --sublen; } - else { + else if (!has_ymd_sep) { goto parse_error; } + if (out->month < 1 || out->month > 12) { + PyErr_Format(PyExc_ValueError, + "Month out of range in datetime string \"%s\"", str); + goto error; + } - /* Next character must be a '-' or the end of the string */ + /* Next character must be the separator, start of day, or end of string */ if (sublen == 0) { - /* dates of form YYYYMM are not valid */ - if (!has_sep) { + /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */ + if (!has_ymd_sep) { goto parse_error; } if (out_local != NULL) { @@ -619,47 +618,40 @@ parse_iso_8601_datetime(char *str, int len, bestunit = PANDAS_FR_M; goto finish; } - else if (has_sep && *substr == sep) { + + if (has_ymd_sep) { + /* Must have separator, but cannot be trailing */ + if (*substr != ymd_sep || sublen == 1) { + goto parse_error; + } ++substr; --sublen; } - else if (!isdigit(*substr)) { - goto parse_error; - } - /* Can't have a trailing '-' */ - if (sublen == 0) { - goto parse_error; + /* PARSE THE DAY */ + /* First digit required */ + if (!isdigit(*substr)) { + goto parse_error; } - - /* PARSE THE DAY (2 digits) */ - if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1])) - || (sublen == 1 && isdigit(substr[0])))) { - out->day = (substr[0] - '0'); - - if (out->day < 1) { - PyErr_Format(PyExc_ValueError, - "Day out of range in datetime string \"%s\"", str); - goto error; - } + out->day = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional if there was a separator */ + if (isdigit(*substr)) { + out->day = 10 * out->day + (*substr - '0'); ++substr; --sublen; } - else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { - out->day = 10 * (substr[0] - '0') + (substr[1] - '0'); - - if (out->day < 1 || - out->day > days_per_month_table[year_leap][out->month-1]) { - PyErr_Format(PyExc_ValueError, - "Day out of 
range in datetime string \"%s\"", str); - goto error; - } - substr += 2; - sublen -= 2; - } - else { + else if (!has_ymd_sep) { goto parse_error; } + if (out->day < 1 || + out->day > days_per_month_table[year_leap][out->month-1]) + { + PyErr_Format(PyExc_ValueError, + "Day out of range in datetime string \"%s\"", str); + goto error; + } /* Next character must be a 'T', ' ', or end of string */ if (sublen == 0) { @@ -669,104 +661,119 @@ parse_iso_8601_datetime(char *str, int len, bestunit = PANDAS_FR_D; goto finish; } - else if (*substr != 'T' && *substr != ' ') { + + if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } - else { + ++substr; + --sublen; + + /* PARSE THE HOURS */ + /* First digit required */ + if (!isdigit(*substr)) { + goto parse_error; + } + out->hour = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional */ + if (isdigit(*substr)) { + hour_was_2_digits = 1; + out->hour = 10 * out->hour + (*substr - '0'); ++substr; --sublen; - } - - /* PARSE THE HOURS (2 digits) */ - if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { - out->hour = 10 * (substr[0] - '0') + (substr[1] - '0'); - if (out->hour >= 24) { PyErr_Format(PyExc_ValueError, "Hours out of range in datetime string \"%s\"", str); goto error; } - substr += 2; - sublen -= 2; - } - else if (sublen >= 1 && isdigit(substr[0])) { - out->hour = substr[0] - '0'; - ++substr; - --sublen; - } - else { - goto parse_error; } /* Next character must be a ':' or the end of the string */ - if (sublen > 0 && *substr == ':') { + if (sublen == 0) { + if (!hour_was_2_digits) { + goto parse_error; + } + bestunit = PANDAS_FR_h; + goto finish; + } + + if (*substr == ':') { + has_hms_sep = 1; ++substr; --sublen; + /* Cannot have a trailing separator */ + if (sublen == 0 || !isdigit(*substr)) { + goto parse_error; + } } - else { + else if (!isdigit(*substr)) { + if (!hour_was_2_digits) { + goto parse_error; + } bestunit = PANDAS_FR_h; goto parse_timezone; } - /* Can't have 
a trailing ':' */ - if (sublen == 0) { - goto parse_error; - } - - /* PARSE THE MINUTES (2 digits) */ - if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { - out->min = 10 * (substr[0] - '0') + (substr[1] - '0'); - + /* PARSE THE MINUTES */ + /* First digit required */ + out->min = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional if there was a separator */ + if (isdigit(*substr)) { + out->min = 10 * out->min + (*substr - '0'); + ++substr; + --sublen; if (out->min >= 60) { PyErr_Format(PyExc_ValueError, - "Minutes out of range in datetime string \"%s\"", str); + "Minutes out of range in datetime string \"%s\"", str); goto error; } - substr += 2; - sublen -= 2; - } - else if (sublen >= 1 && isdigit(substr[0])) { - out->min = substr[0] - '0'; - ++substr; - --sublen; } - else { + else if (!has_hms_sep) { goto parse_error; } - /* Next character must be a ':' or the end of the string */ - if (sublen > 0 && *substr == ':') { + if (sublen == 0) { + bestunit = PANDAS_FR_m; + goto finish; + } + + /* If we make it through this condition block, then the next + * character is a digit. 
*/ + if (has_hms_sep && *substr == ':') { ++substr; --sublen; + /* Cannot have a trailing ':' */ + if (sublen == 0 || !isdigit(*substr)) { + goto parse_error; + } + } + else if (!has_hms_sep && isdigit(*substr)) { } else { bestunit = PANDAS_FR_m; goto parse_timezone; } - /* Can't have a trailing ':' */ - if (sublen == 0) { - goto parse_error; - } - - /* PARSE THE SECONDS (2 digits) */ - if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { - out->sec = 10 * (substr[0] - '0') + (substr[1] - '0'); - + /* PARSE THE SECONDS */ + /* First digit required */ + out->sec = (*substr - '0'); + ++substr; + --sublen; + /* Second digit optional if there was a separator */ + if (isdigit(*substr)) { + out->sec = 10 * out->sec + (*substr - '0'); + ++substr; + --sublen; if (out->sec >= 60) { PyErr_Format(PyExc_ValueError, - "Seconds out of range in datetime string \"%s\"", str); + "Seconds out of range in datetime string \"%s\"", str); goto error; } - substr += 2; - sublen -= 2; - } - else if (sublen >= 1 && isdigit(substr[0])) { - out->sec = substr[0] - '0'; - ++substr; - --sublen; } - else { + else if (!has_hms_sep) { goto parse_error; } diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 381b106b17eb0..937a8fa340348 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -519,7 +519,12 @@ def test_parsers(self): '2014-06': datetime.datetime(2014, 6, 1), '06-2014': datetime.datetime(2014, 6, 1), '2014-6': datetime.datetime(2014, 6, 1), - '6-2014': datetime.datetime(2014, 6, 1), } + '6-2014': datetime.datetime(2014, 6, 1), + + '20010101 12': datetime.datetime(2001, 1, 1, 12), + '20010101 1234': datetime.datetime(2001, 1, 1, 12, 34), + '20010101 123456': datetime.datetime(2001, 1, 1, 12, 34, 56), + } for date_str, expected in compat.iteritems(cases): result1, _, _ = tools.parse_time_string(date_str) @@ -713,11 +718,22 @@ def test_parsers_iso8601(self): self.assertEqual(actual, exp) # seperators must 
all match - YYYYMM not valid - invalid_cases = ['2011-01/02', '2011^11^11', '201401', - '201111', '200101'] + invalid_cases = ['2011-01/02', '2011^11^11', + '201401', '201111', '200101', + # mixed separated and unseparated + '2005-0101', '200501-01', + '20010101 12:3456', '20010101 1234:56', + # HHMMSS must have two digits in each component + # if unseparated + '20010101 1', '20010101 123', '20010101 12345', + '20010101 12345Z', + # wrong separator for HHMMSS + '2001-01-01 12-34-56'] for date_str in invalid_cases: with tm.assertRaises(ValueError): tslib._test_parse_iso8601(date_str) + # If no ValueError raised, let me know which case failed. + raise Exception(date_str) class TestArrayToDatetime(tm.TestCase): @@ -881,6 +897,11 @@ def test_nanosecond_string_parsing(self): self.assertEqual(ts.value, expected_value + 4 * 3600 * 1000000000) self.assertIn(expected_repr, repr(ts)) + # GH 10041 + ts = Timestamp('20130501T071545.123456789') + self.assertEqual(ts.value, expected_value) + self.assertIn(expected_repr, repr(ts)) + def test_nanosecond_timestamp(self): # GH 7610 expected = 1293840000000000005