From 6b32cb37c9bf5b56a3d97fdd96c032c5a076adf1 Mon Sep 17 00:00:00 2001 From: Chris Date: Fri, 15 Jan 2016 19:25:21 -0600 Subject: [PATCH] PERF: more flexible iso8601 parsing --- asv_bench/benchmarks/timeseries.py | 30 +++--- doc/source/whatsnew/v0.18.0.txt | 2 +- pandas/src/datetime/np_datetime_strings.c | 125 ++++++++++++++++++---- pandas/tseries/tests/test_timeseries.py | 13 ++- pandas/tseries/tests/test_tslib.py | 26 +++++ pandas/tseries/tools.py | 20 +++- pandas/tslib.pyx | 20 ++++ 7 files changed, 188 insertions(+), 48 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index db0c526f25c7b..bdf193cd1f3d3 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -1059,33 +1059,27 @@ class timeseries_to_datetime_iso8601(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) self.rng = date_range(start='1/1/2000', periods=20000, freq='H') self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng] + self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng] + self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800' + for x in self.rng] def time_timeseries_to_datetime_iso8601(self): to_datetime(self.strings) - -class timeseries_to_datetime_iso8601_format(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = date_range(start='1/1/2000', periods=20000, freq='H') - self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng] + def time_timeseries_to_datetime_iso8601_nosep(self): + to_datetime(self.strings_nosep) def 
time_timeseries_to_datetime_iso8601_format(self): to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S') + def time_timeseries_to_datetime_iso8601_format_no_sep(self): + to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S') + + def time_timeseries_to_datetime_iso8601_tz_spaceformat(self): + to_datetime(self.strings_tz_space) + class timeseries_with_format_no_exact(object): goal_time = 0.2 @@ -1160,4 +1154,4 @@ def setup(self): self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) def time_timeseries_year_incr(self): - (self.date + self.year) \ No newline at end of file + (self.date + self.year) diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index b2eb7d9d97d58..f9ae5e1245551 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -461,7 +461,7 @@ Performance Improvements - +- Improved performance of ISO 8601 date parsing for dates without separators (:issue:`11899`), without leading zeros (:issue:`11871`), and with whitespace preceding the time zone (:issue:`9714`) diff --git a/pandas/src/datetime/np_datetime_strings.c b/pandas/src/datetime/np_datetime_strings.c index bc4ef1b3c8184..1e59b31da1e65 100644 --- a/pandas/src/datetime/np_datetime_strings.c +++ b/pandas/src/datetime/np_datetime_strings.c @@ -346,8 +346,6 @@ convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, /* * Parses (almost) standard ISO 8601 date strings. The differences are: * - * + The date "20100312" is parsed as the year 20100312, not as - * equivalent to "2010-03-12". The '-' in the dates are not optional. * + Only seconds may have a decimal point, with up to 18 digits after it * (maximum attoseconds precision). 
* + Either a 'T' as in ISO 8601 or a ' ' may be used to separate @@ -396,6 +394,16 @@ parse_iso_8601_datetime(char *str, int len, char *substr, sublen; PANDAS_DATETIMEUNIT bestunit; + /* if date components are separated by one of the valid separators, + * months/days without leading 0s will be parsed + * (though not iso8601). If the components aren't separated, + * an error code will be returned because the date is ambiguous + */ + int has_sep = 0; + char sep; + char valid_sep[] = {'-', '.', '/', '\\', ' '}; + int valid_sep_len = 5; + /* Initialize the output to all zeros */ memset(out, 0, sizeof(pandas_datetimestruct)); out->month = 1; @@ -523,12 +531,16 @@ parse_iso_8601_datetime(char *str, int len, goto parse_error; } - /* PARSE THE YEAR (digits until the '-' character) */ + /* PARSE THE YEAR (4 digits) */ out->year = 0; - while (sublen > 0 && isdigit(*substr)) { - out->year = 10 * out->year + (*substr - '0'); - ++substr; - --sublen; + if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && + isdigit(substr[2]) && isdigit(substr[3])) { + + out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') + + 10 * (substr[2] - '0') + (substr[3] - '0'); + + substr += 4; + sublen -= 4;; } /* Negate the year if necessary */ @@ -538,7 +550,7 @@ parse_iso_8601_datetime(char *str, int len, /* Check whether it's a leap-year */ year_leap = is_leapyear(out->year); - /* Next character must be a '-' or the end of the string */ + /* Next character must be a separator, start of month or end */ if (sublen == 0) { if (out_local != NULL) { *out_local = 0; @@ -546,21 +558,41 @@ parse_iso_8601_datetime(char *str, int len, bestunit = PANDAS_FR_Y; goto finish; } - else if (*substr == '-') { - ++substr; - --sublen; - } - else { - goto parse_error; + else if (!isdigit(*substr)) { + for (i = 0; i < valid_sep_len; ++i) { + if (*substr == valid_sep[i]) { + has_sep = 1; + sep = valid_sep[i]; + ++substr; + --sublen; + break; + } + } + if (i == valid_sep_len) { + goto parse_error; + } } - 
/* Can't have a trailing '-' */ + /* Can't have a trailing sep */ if (sublen == 0) { goto parse_error; } + /* PARSE THE MONTH (2 digits) */ - if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1])) + || (sublen == 1 && isdigit(substr[0])))) { + out->month = (substr[0] - '0'); + + if (out->month < 1) { + PyErr_Format(PyExc_ValueError, + "Month out of range in datetime string \"%s\"", str); + goto error; + } + ++substr; + --sublen; + } + else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { out->month = 10 * (substr[0] - '0') + (substr[1] - '0'); if (out->month < 1 || out->month > 12) { @@ -577,18 +609,22 @@ parse_iso_8601_datetime(char *str, int len, /* Next character must be a '-' or the end of the string */ if (sublen == 0) { + /* dates of form YYYYMM are not valid */ + if (!has_sep) { + goto parse_error; + } if (out_local != NULL) { *out_local = 0; } bestunit = PANDAS_FR_M; goto finish; } - else if (*substr == '-') { + else if (has_sep && *substr == sep) { ++substr; --sublen; } - else { - goto parse_error; + else if (!isdigit(*substr)) { + goto parse_error; } /* Can't have a trailing '-' */ @@ -597,7 +633,19 @@ parse_iso_8601_datetime(char *str, int len, } /* PARSE THE DAY (2 digits) */ - if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + if (has_sep && ((sublen >= 2 && isdigit(substr[0]) && !isdigit(substr[1])) + || (sublen == 1 && isdigit(substr[0])))) { + out->day = (substr[0] - '0'); + + if (out->day < 1) { + PyErr_Format(PyExc_ValueError, + "Day out of range in datetime string \"%s\"", str); + goto error; + } + ++substr; + --sublen; + } + else if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { out->day = 10 * (substr[0] - '0') + (substr[1] - '0'); if (out->day < 1 || @@ -633,7 +681,7 @@ parse_iso_8601_datetime(char *str, int len, if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { out->hour = 10 * (substr[0] - '0') + 
(substr[1] - '0'); - if (out->hour < 0 || out->hour >= 24) { + if (out->hour >= 24) { PyErr_Format(PyExc_ValueError, "Hours out of range in datetime string \"%s\"", str); goto error; @@ -641,6 +689,11 @@ parse_iso_8601_datetime(char *str, int len, substr += 2; sublen -= 2; } + else if (sublen >= 1 && isdigit(substr[0])) { + out->hour = substr[0] - '0'; + ++substr; + --sublen; + } else { goto parse_error; } @@ -664,7 +717,7 @@ parse_iso_8601_datetime(char *str, int len, if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { out->min = 10 * (substr[0] - '0') + (substr[1] - '0'); - if (out->hour < 0 || out->min >= 60) { + if (out->min >= 60) { PyErr_Format(PyExc_ValueError, "Minutes out of range in datetime string \"%s\"", str); goto error; @@ -672,6 +725,11 @@ parse_iso_8601_datetime(char *str, int len, substr += 2; sublen -= 2; } + else if (sublen >= 1 && isdigit(substr[0])) { + out->min = substr[0] - '0'; + ++substr; + --sublen; + } else { goto parse_error; } @@ -695,7 +753,7 @@ parse_iso_8601_datetime(char *str, int len, if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { out->sec = 10 * (substr[0] - '0') + (substr[1] - '0'); - if (out->sec < 0 || out->sec >= 60) { + if (out->sec >= 60) { PyErr_Format(PyExc_ValueError, "Seconds out of range in datetime string \"%s\"", str); goto error; @@ -703,6 +761,11 @@ parse_iso_8601_datetime(char *str, int len, substr += 2; sublen -= 2; } + else if (sublen >= 1 && isdigit(substr[0])) { + out->sec = substr[0] - '0'; + ++substr; + --sublen; + } else { goto parse_error; } @@ -781,6 +844,12 @@ parse_iso_8601_datetime(char *str, int len, } parse_timezone: + /* trim any whitespace between time/timezone */ + while (sublen > 0 && isspace(*substr)) { + ++substr; + --sublen; + } + if (sublen == 0) { // Unlike NumPy, treating no time zone as naive goto finish; @@ -832,6 +901,11 @@ parse_iso_8601_datetime(char *str, int len, goto error; } } + else if (sublen >= 1 && isdigit(substr[0])) { + offset_hour = substr[0] - 
'0'; + ++substr; + --sublen; + } else { goto parse_error; } @@ -856,6 +930,11 @@ parse_iso_8601_datetime(char *str, int len, goto error; } } + else if (sublen >= 1 && isdigit(substr[0])) { + offset_minute = substr[0] - '0'; + ++substr; + --sublen; + } else { goto parse_error; } diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 84065c0340aad..8d7b5a31a5ab3 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2454,7 +2454,7 @@ def test_constructor_datetime64_tzformat(self): idx = date_range('2013/1/1 0:00:00-5:00', '2016/1/1 23:59:59-5:00', freq=freq) expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=tzoffset(None, -18000)) + freq=freq, tz=pytz.FixedOffset(-300)) tm.assert_index_equal(idx, expected) # Unable to use `US/Eastern` because of DST expected_i8 = date_range('2013-01-01T00:00:00', @@ -2465,7 +2465,7 @@ def test_constructor_datetime64_tzformat(self): idx = date_range('2013/1/1 0:00:00+9:00', '2016/1/1 23:59:59+09:00', freq=freq) expected = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', - freq=freq, tz=tzoffset(None, 32400)) + freq=freq, tz=pytz.FixedOffset(540)) tm.assert_index_equal(idx, expected) expected_i8 = date_range('2013-01-01T00:00:00', '2016-01-01T23:59:59', freq=freq, @@ -4833,6 +4833,15 @@ def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): pd.to_datetime(test_series, infer_datetime_format=True) ) + def test_to_datetime_iso8601_noleading_0s(self): + # GH 11871 + test_series = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) + expected = pd.Series([pd.Timestamp('2014-01-01'), + pd.Timestamp('2014-02-02'), + pd.Timestamp('2015-03-03')]) + tm.assert_series_equal(pd.to_datetime(test_series), expected) + tm.assert_series_equal(pd.to_datetime(test_series, format='%Y-%m-%d'), + expected) class TestGuessDatetimeFormat(tm.TestCase): def 
test_guess_datetime_format_with_parseable_formats(self): diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 123b91d8bbf82..27dbdcdd1d993 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -688,6 +688,32 @@ def test_parsers_timezone_minute_offsets_roundtrip(self): converted_time = dt_time.tz_localize('UTC').tz_convert(tz) self.assertEqual(dt_string_repr, repr(converted_time)) + def test_parsers_iso8601(self): + # GH 12060 + # test only the iso parser - flexibility to different + # separators and leading 0s + # Timestamp construction falls back to dateutil + cases = {'2011-01-02': datetime.datetime(2011, 1, 2), + '2011-1-2': datetime.datetime(2011, 1, 2), + '2011-01': datetime.datetime(2011, 1, 1), + '2011-1': datetime.datetime(2011, 1, 1), + '2011 01 02': datetime.datetime(2011, 1, 2), + '2011.01.02': datetime.datetime(2011, 1, 2), + '2011/01/02': datetime.datetime(2011, 1, 2), + '2011\\01\\02': datetime.datetime(2011, 1, 2), + '2013-01-01 05:30:00': datetime.datetime(2013, 1, 1, 5, 30), + '2013-1-1 5:30:00': datetime.datetime(2013, 1, 1, 5, 30)} + for date_str, exp in compat.iteritems(cases): + actual = tslib._test_parse_iso8601(date_str) + self.assertEqual(actual, exp) + + # separators must all match - YYYYMM not valid + invalid_cases = ['2011-01/02', '2011^11^11', '201401', + '201111', '200101'] + for date_str in invalid_cases: + with tm.assertRaises(ValueError): + tslib._test_parse_iso8601(date_str) + class TestArrayToDatetime(tm.TestCase): def test_parsing_valid_dates(self): diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 734857c6d724d..85e36ba1df20e 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -334,10 +334,7 @@ def _convert_listlike(arg, box, format, name=None): # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case - format_is_iso8601 = ( - 
('%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or - '%Y-%m-%d %H:%M:%S.%f'.startswith(format)) and - format != '%Y') + format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None @@ -461,6 +458,21 @@ def calc_with_mask(carg, mask): return None +def _format_is_iso(f): + """ + Does format match the iso8601 set that can be handled by the C parser? + Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different + but must be consistent. Leading 0s in dates and times are optional. + """ + iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format + excluded_formats = ['%Y%m%d','%Y%m', '%Y'] + + for date_sep in [' ', '/', '\\', '-', '.', '']: + for time_sep in [' ', 'T']: + if (iso_template(date_sep=date_sep, time_sep=time_sep).startswith(f) + and f not in excluded_formats): + return True + return False def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): """ diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index f737ac8178a68..be1c5af74a95d 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1388,6 +1388,26 @@ cpdef convert_str_to_tsobject(object ts, object tz, object unit, return convert_to_tsobject(ts, tz, unit) +def _test_parse_iso8601(object ts): + ''' + TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used + only for testing, actual construction uses `convert_str_to_tsobject` + ''' + cdef: + _TSObject obj + int out_local = 0, out_tzoffset = 0 + + obj = _TSObject() + + _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) + obj.value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &obj.dts) + _check_dts_bounds(&obj.dts) + if out_local == 1: + obj.tzinfo = pytz.FixedOffset(out_tzoffset) + obj.value = tz_convert_single(obj.value, obj.tzinfo, 'UTC') + return Timestamp(obj.value, tz=obj.tzinfo) + else: + return Timestamp(obj.value) cdef inline void _localize_tso(_TSObject obj, object tz): '''