From 45f82c3be1fd147aebf23c8fc2de16361d93b7cb Mon Sep 17 00:00:00 2001 From: Nikita Vedeneev Date: Wed, 26 Oct 2022 13:50:29 +0000 Subject: [PATCH 01/18] initial format support Co-Authored-By: MarcoGorelli <> Co-Authored-By: FDRocha <> --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/tslib.pyx | 17 ++- pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/_libs/tslibs/np_datetime.pxd | 2 + pandas/_libs/tslibs/np_datetime.pyx | 11 +- pandas/_libs/tslibs/parsing.pyx | 2 +- .../tslibs/src/datetime/np_datetime_strings.c | 74 +++++++++- .../tslibs/src/datetime/np_datetime_strings.h | 5 +- pandas/core/arrays/datetimes.py | 4 + pandas/core/tools/datetimes.py | 9 +- pandas/tests/tools/test_to_datetime.py | 126 +++++++++++++++++- 11 files changed, 239 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b5c5a721324c6..e3cc6e3c9bc1e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -333,6 +333,7 @@ Conversion - Bug in :meth:`Series.convert_dtypes` not converting dtype to nullable dtype when :class:`Series` contains ``NA`` and has dtype ``object`` (:issue:`48791`) - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) +- Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) Strings ^^^^^^^ diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d7c0c91332e02..4fd015f15ed76 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -89,7 +89,7 @@ def _test_parse_iso8601(ts: str): elif ts == 'today': return Timestamp.now().normalize() - string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True) + string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, "", False) obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts) check_dts_bounds(&obj.dts) if out_local == 1: @@ -445,6 +445,8 @@ cpdef array_to_datetime( bint utc=False, bint require_iso8601=False, bint allow_mixed=False, + str format="", + bint exact=False, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -564,6 +566,15 @@ cpdef array_to_datetime( iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) elif is_integer_object(val) or is_float_object(val): + if require_iso8601: + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise ValueError( + f"time data \"{val}\" at position {i} doesn't match format \"{format}\"" + ) + return values, tz_out # these must be ns unit by-definition seen_integer = True @@ -594,7 +605,7 @@ cpdef array_to_datetime( string_to_dts_failed = string_to_dts( val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False + &out_tzoffset, False, format, exact ) if string_to_dts_failed: # An error at this point is a _parsing_ error @@ -609,7 +620,7 @@ cpdef array_to_datetime( continue elif is_raise: raise ValueError( - f"time data \"{val}\" at position {i} doesn't match format specified" + f"time data \"{val}\" at position {i} doesn't match format \"{format}\"" ) return values, tz_out diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 923dfa3c54d26..481f66bc256bf 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -488,7 +488,7 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, else: string_to_dts_failed = string_to_dts( ts, &dts, &out_bestunit, &out_local, - &out_tzoffset, False + &out_tzoffset, False, "", False ) if not string_to_dts_failed: try: diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index e51bbd4e074e1..770b1f7e46c12 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -95,6 +95,8 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, + str format, + bint exact ) except? -1 cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 07872050dc822..8acd51d8a4431 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -52,7 +52,8 @@ cdef extern from "src/datetime/np_datetime_strings.h": int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, - int *out_local, int *out_tzoffset) + int *out_local, int *out_tzoffset, + const char *format, int format_len, int exact) # ---------------------------------------------------------------------- @@ -273,14 +274,20 @@ cdef inline int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, + str format, + bint exact, ) except? -1: cdef: Py_ssize_t length const char* buf + Py_ssize_t format_length + const char* format_buf buf = get_c_string_buf_and_size(val, &length) + format_buf = get_c_string_buf_and_size(format, &format_length) return parse_iso_8601_datetime(buf, length, want_exc, - dts, out_bestunit, out_local, out_tzoffset) + dts, out_bestunit, out_local, out_tzoffset, + format_buf, format_length, exact) cpdef ndarray astype_overflowsafe( diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 469e0721f1207..eeb738bdc3c1a 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -409,7 +409,7 @@ cdef parse_datetime_string_with_reso( # TODO: does this render some/all of parse_delimited_date redundant? string_to_dts_failed = string_to_dts( date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False + &out_tzoffset, False, "", False ) if not string_to_dts_failed: if dts.ps != 0 or out_local: diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index cfbaed01b57c9..9a95bf44ddd6d 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -66,10 +66,29 @@ This file implements string parsing and creation for NumPy datetime. * * Returns 0 on success, -1 on failure. */ + +#define FORMAT_STARTSWITH(ch) \ + if (exact) { \ + if (!format_len || *format != ch) { \ + goto parse_error; \ + } \ + ++format; \ + --format_len; \ + } else { \ + if (format_len > 0) { \ + if (*format != ch) { \ + goto parse_error; \ + } \ + ++format; \ + --format_len; \ + } \ + } \ + int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, - int *out_local, int *out_tzoffset) { + int *out_local, int *out_tzoffset, + const char* format, int format_len, int exact) { int year_leap = 0; int i, numdigits; const char *substr; @@ -104,14 +123,19 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; + FORMAT_STARTSWITH(' '); } /* Leading '-' sign for negative year */ if (*substr == '-') { ++substr; --sublen; + FORMAT_STARTSWITH('-'); } + FORMAT_STARTSWITH('%'); + FORMAT_STARTSWITH('Y'); + if (sublen == 0) { goto parse_error; } @@ -139,6 +163,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (out_local != NULL) { *out_local = 0; } + if (format_len) { + goto parse_error; + } bestunit = NPY_FR_Y; goto finish; } @@ -156,6 +183,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ymd_sep = valid_ymd_sep[i]; ++substr; --sublen; + FORMAT_STARTSWITH(ymd_sep); /* Cannot have trailing separator */ if (sublen == 0 || !isdigit(*substr)) { goto parse_error; @@ -167,6 +195,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->month = (*substr - '0'); ++substr; --sublen; + FORMAT_STARTSWITH('%'); + FORMAT_STARTSWITH('m'); /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->month = 10 * out->month + (*substr - '0'); @@ -190,6 +220,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (!has_ymd_sep) { goto parse_error; } + if (format_len) { + goto parse_error; + } if (out_local != NULL) { *out_local = 0; } @@ -203,6 +236,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } ++substr; --sublen; + FORMAT_STARTSWITH(ymd_sep); } /* PARSE THE DAY */ @@ -213,6 +247,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->day = (*substr - '0'); ++substr; --sublen; + FORMAT_STARTSWITH('%'); + FORMAT_STARTSWITH('d'); /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->day = 10 * out->day + (*substr - '0'); @@ -235,6 +271,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (out_local != NULL) { *out_local = 0; } + if (format_len) { + goto parse_error; + } bestunit = NPY_FR_D; goto finish; } @@ -242,6 +281,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } + FORMAT_STARTSWITH(*substr); ++substr; --sublen; @@ -250,6 +290,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (!isdigit(*substr)) { goto parse_error; } + FORMAT_STARTSWITH('%'); + FORMAT_STARTSWITH('H'); out->hour = (*substr - '0'); ++substr; --sublen; @@ -274,6 +316,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (!hour_was_2_digits) { goto parse_error; } + if (format_len) { + goto parse_error; + } bestunit = NPY_FR_h; goto finish; } @@ -286,6 +331,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0 || !isdigit(*substr)) { goto parse_error; } + FORMAT_STARTSWITH(':'); } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { goto parse_error; @@ -298,6 +344,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->min = (*substr - '0'); ++substr; --sublen; + FORMAT_STARTSWITH('%'); + FORMAT_STARTSWITH('M'); /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->min = 10 * out->min + (*substr - '0'); @@ -317,12 +365,16 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { bestunit = NPY_FR_m; + if (format_len) { + goto parse_error; + } goto finish; } /* If we make it through this condition block, then the next * character is a digit. */ if (has_hms_sep && *substr == ':') { + FORMAT_STARTSWITH(':'); ++substr; --sublen; /* Cannot have a trailing ':' */ @@ -339,6 +391,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->sec = (*substr - '0'); ++substr; --sublen; + FORMAT_STARTSWITH('%'); + FORMAT_STARTSWITH('S'); /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->sec = 10 * out->sec + (*substr - '0'); @@ -360,12 +414,15 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen > 0 && *substr == '.') { ++substr; --sublen; + FORMAT_STARTSWITH('.'); } else { bestunit = NPY_FR_s; goto parse_timezone; } /* PARSE THE MICROSECONDS (0 to 6 digits) */ + FORMAT_STARTSWITH('%'); + FORMAT_STARTSWITH('f'); numdigits = 0; for (i = 0; i < 6; ++i) { out->us *= 10; @@ -430,15 +487,22 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; + FORMAT_STARTSWITH(' '); } if (sublen == 0) { // Unlike NumPy, treating no time zone as naive + if (format_len > 0) { + goto parse_error; + } goto finish; } /* UTC specifier */ if (*substr == 'Z') { + FORMAT_STARTSWITH('%'); + FORMAT_STARTSWITH('Z'); + /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { *out_local = 1; @@ -449,12 +513,17 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } if (sublen == 1) { + if (format_len > 0) { + goto parse_error; + } goto finish; } else { ++substr; --sublen; } } else if (*substr == '-' || *substr == '+') { + FORMAT_STARTSWITH('%'); + FORMAT_STARTSWITH('z'); /* Time zone offset */ int offset_neg = 0, offset_hour = 0, offset_minute = 0; @@ -538,9 +607,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; + FORMAT_STARTSWITH(' '); } - if (sublen != 0) { + if ((sublen != 0) || (format_len != 0)) { goto parse_error; } diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 511d9a401fed2..734f7daceba05 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -58,7 +58,10 @@ parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, - int *out_tzoffset); + int *out_tzoffset, + const char* format, + int format_len, + int exact); /* * Provides a string length to use for converting datetime diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 1768bb7507dd9..fc4234ccd8204 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2161,6 +2161,8 @@ def objects_to_datetime64ns( require_iso8601: bool = False, allow_object: bool = False, allow_mixed: bool = False, + format: str = "", + exact: bool = True, ): """ Convert data to array of timestamps. @@ -2208,6 +2210,8 @@ def objects_to_datetime64ns( yearfirst=yearfirst, require_iso8601=require_iso8601, allow_mixed=allow_mixed, + format=format, + exact=exact, ) result = result.reshape(data.shape, order=order) except OverflowError as err: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7791ea804a52a..52c5d557dff9c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -426,16 +426,17 @@ def _convert_listlike_datetimes( format_is_iso8601 = format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format - format = None - if format is not None: + if format is not None and not require_iso8601: res = _to_datetime_with_format( arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format ) if res is not None: return res + elif format is None: + format = "" + exact = False - assert format is None or infer_datetime_format utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( arg, @@ -445,6 +446,8 @@ def _convert_listlike_datetimes( errors=errors, require_iso8601=require_iso8601, allow_object=True, + format=format, + exact=exact, ) if tz_parsed is not None: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index c3b4159c2cbfc..1b08faba2eb67 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1741,6 +1741,130 @@ def test_to_datetime_iso8601(self, cache, arg, exp_str): exp = Timestamp(exp_str) assert result[0] == exp + @pytest.mark.parametrize( + "input, format", + [ + ("2012", "%Y-%m"), + ("2012-01", "%Y-%m-%d"), + ("2012-01-01", "%Y-%m-%d %H"), + ("2012-01-01 10", "%Y-%m-%d %H:%M"), + ("2012-01-01 10:00", "%Y-%m-%d %H:%M:%S"), + ("2012-01-01 10:00:00", "%Y-%m-%d %H:%M:%S.%f"), + ("2012-01-01 10:00:00.123", "%Y-%m-%d %H:%M:%S.%f%Z"), + ("2012-01-01 10:00:00.123", "%Y-%m-%d %H:%M:%S.%f%z"), + (0, "%Y-%m-%d"), + ], + ) + @pytest.mark.parametrize("exact", [True, False]) + def test_to_datetime_iso8601_fails(self, input, format, exact): + # https://github.com/pandas-dev/pandas/issues/12649 + # `format` is longer than the string, so this fails regardless of `exact` + with pytest.raises( + ValueError, + match=( + rf"time data \"{input}\" at position 0 doesn't match format " + rf"\"{format}\"" + ), + ): + to_datetime(input, format=format, exact=exact) + + @pytest.mark.parametrize( + "input, format", + [ + ("2012-01-01", "%Y-%m"), + ("2012-01-01 10", "%Y-%m-%d"), + ("2012-01-01 10:00", "%Y-%m-%d %H"), + ("2012-01-01 10:00:00", "%Y-%m-%d %H:%M"), + (0, "%Y-%m-%d"), + ], + ) + def test_to_datetime_iso8601_exact_fails(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + # `format` is shorter than the date string, so only fails with `exact=True` + with pytest.raises( + ValueError, + match=( + rf"time data \"{input}\" at position 0 doesn't match format " + rf"\"{format}\"" + ), + ): + to_datetime(input, format=format) + + @pytest.mark.parametrize( + "input, format", + [ + ("2012-01-01", "%Y-%m"), + ("2012-01-01 00", "%Y-%m-%d"), + ("2012-01-01 00:00", "%Y-%m-%d %H"), + ("2012-01-01 00:00:00", "%Y-%m-%d %H:%M"), + ], + ) + def test_to_datetime_iso8601_non_exact(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + expected = Timestamp(2012, 1, 1) + result = to_datetime(input, format=format, exact=False) + assert result == expected + + @pytest.mark.parametrize( + "input, format", + [ + ("2020-01", "%Y/%m"), + ("2020-01-01", "%Y/%m/%d"), + ("2020-01-01 00", "%Y/%m/%dT%H"), + ("2020-01-01T00", "%Y/%m/%d %H"), + ("2020-01-01 00:00", "%Y/%m/%dT%H:%M"), + ("2020-01-01T00:00", "%Y/%m/%d %H:%M"), + ("2020-01-01 00:00:00", "%Y/%m/%dT%H:%M:%S"), + ("2020-01-01T00:00:00", "%Y/%m/%d %H:%M:%S"), + ], + ) + def test_to_datetime_iso8601_separator(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + with pytest.raises( + ValueError, + match=( + rf"time data \"{input}\" at position 0 doesn\'t match format " + rf"\"{format}\"" + ), + ): + to_datetime(input, format=format) + + @pytest.mark.parametrize( + "input, format", + [ + ("2020-01", "%Y-%m"), + ("2020-01-01", "%Y-%m-%d"), + ("2020-01-01 00", "%Y-%m-%d %H"), + ("2020-01-01T00", "%Y-%m-%dT%H"), + ("2020-01-01 00:00", "%Y-%m-%d %H:%M"), + ("2020-01-01T00:00", "%Y-%m-%dT%H:%M"), + ("2020-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"), + ("2020-01-01T00:00:00", "%Y-%m-%dT%H:%M:%S"), + ("2020-01-01T00:00:00.000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-01-01T00:00:00.000000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-01-01T00:00:00.000000000", "%Y-%m-%dT%H:%M:%S.%f"), + ], + ) + def test_to_datetime_iso8601_valid(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + expected = Timestamp(2020, 1, 1) + result = to_datetime(input, format=format) + assert result == expected + + @pytest.mark.parametrize( + "input, format", + [ + ("2020-01-01T00:00:00.000000000+00:00", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2020-01-01T00:00:00+00:00", "%Y-%m-%dT%H:%M:%S%z"), + ("2020-01-01T00:00:00Z", "%Y-%m-%dT%H:%M:%S%Z"), + ], + ) + def test_to_datetime_iso8601_with_timezone_valid(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + expected = Timestamp(2020, 1, 1, tzinfo=pytz.UTC) + result = to_datetime(input, format=format) + assert result == expected + def test_to_datetime_default(self, cache): rs = to_datetime("2001", cache=cache) xp = datetime(2001, 1, 1) @@ -2254,7 +2378,7 @@ def test_day_not_in_month_raise(self, cache): @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) def test_day_not_in_month_raise_value(self, cache, arg): - msg = f'time data "{arg}" at position 0 doesn\'t match format specified' + msg = f'time data "{arg}" at position 0 doesn\'t match format "%Y-%m-%d"' with pytest.raises(ValueError, match=msg): to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache) From e473adba8b5e9adf4e8667fab0ded57f3e9f3c1b Mon Sep 17 00:00:00 2001 From: Marco Gorelli <> Date: Fri, 28 Oct 2022 11:16:00 +0200 Subject: [PATCH 02/18] set exact=False default in objects_to_datetime --- pandas/core/arrays/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index fc4234ccd8204..3cb1267029287 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2162,7 +2162,7 @@ def objects_to_datetime64ns( allow_object: bool = False, allow_mixed: bool = False, format: str = "", - exact: bool = True, + exact: bool = False, ): """ Convert data to array of timestamps. From a6ea6d0a3ba6a849f293cb4f367ba7f3715c2afc Mon Sep 17 00:00:00 2001 From: Marco Gorelli <> Date: Fri, 28 Oct 2022 12:22:58 +0200 Subject: [PATCH 03/18] :label: typing --- pandas/_libs/tslib.pyi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index 8fec9ecf27f30..4228cb3eb9354 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -25,6 +25,8 @@ def array_to_datetime( utc: bool = ..., require_iso8601: bool = ..., allow_mixed: bool = ..., + format: str = ..., + exact: bool = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... # returned ndarray may be object dtype or datetime64[ns] From 8fede1fa9f2a3b7324e376ec6f751f9fcdf96365 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <> Date: Fri, 28 Oct 2022 15:51:32 +0200 Subject: [PATCH 04/18] simplify --- .../tslibs/src/datetime/np_datetime_strings.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 9a95bf44ddd6d..7f1a82477e144 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -68,21 +68,16 @@ This file implements string parsing and creation for NumPy datetime. */ #define FORMAT_STARTSWITH(ch) \ - if (exact) { \ - if (!format_len || *format != ch) { \ + /* Always error on character mismatch conditioned on non-exhausted format, \ + or when format is exhausted in the exact case. */ \ + if ((format_len && *format != ch) || (exact && !format_len)){ \ goto parse_error; \ } \ - ++format; \ - --format_len; \ - } else { \ - if (format_len > 0) { \ - if (*format != ch) { \ - goto parse_error; \ - } \ - ++format; \ - --format_len; \ + /* Advance if format is not exhausted */ \ + if (format_len) { \ + ++format; \ + --format_len; \ } \ - } \ int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, From 12721b07faa2dfe8bb45ee3d5d249f819fd6ee6e Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 30 Oct 2022 08:52:59 +0000 Subject: [PATCH 05/18] replace macro with function --- .pre-commit-config.yaml | 9 +- .../tslibs/src/datetime/np_datetime_strings.c | 102 +++++++++++------- 2 files changed, 71 insertions(+), 40 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8ff7526b87521..97fd0b11c8eb4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,7 +48,14 @@ repos: # this particular codebase (e.g. src/headers, src/klib). However, # we can lint all header files since they aren't "generated" like C files are. exclude: ^pandas/_libs/src/(klib|headers)/ - args: [--quiet, '--extensions=c,h', '--headers=h', --recursive, '--filter=-readability/casting,-runtime/int,-build/include_subdir'] + args: [ + --quiet, + '--extensions=c,h', + '--headers=h', + --recursive, + '--filter=-readability/casting,-runtime/int,-build/include_subdir', + '--linelength=88' + ] - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 7f1a82477e144..79dc730c25906 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -67,19 +67,15 @@ This file implements string parsing and creation for NumPy datetime. * Returns 0 on success, -1 on failure. */ -#define FORMAT_STARTSWITH(ch) \ - /* Always error on character mismatch conditioned on non-exhausted format, \ - or when format is exhausted in the exact case. */ \ - if ((format_len && *format != ch) || (exact && !format_len)){ \ - goto parse_error; \ - } \ - /* Advance if format is not exhausted */ \ - if (format_len) { \ - ++format; \ - --format_len; \ - } \ +int format_startswith(char ch, int format_len, char format, int exact) { + if ((format_len && format != ch) || (exact && !format_len)) { + return 0; + } + return 1; +} int parse_iso_8601_datetime(const char *str, int len, int want_exc, + npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, @@ -118,18 +114,22 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - FORMAT_STARTSWITH(' '); + if (!format_startswith(' ', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} } /* Leading '-' sign for negative year */ if (*substr == '-') { ++substr; --sublen; - FORMAT_STARTSWITH('-'); + if (!format_startswith('-', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} } - FORMAT_STARTSWITH('%'); - FORMAT_STARTSWITH('Y'); + if (!format_startswith('%', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + if (!format_startswith('Y', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} if (sublen == 0) { goto parse_error; @@ -178,7 +178,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ymd_sep = valid_ymd_sep[i]; ++substr; --sublen; - FORMAT_STARTSWITH(ymd_sep); + if (!format_startswith(ymd_sep, format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} /* Cannot have trailing separator */ if (sublen == 0 || !isdigit(*substr)) { goto parse_error; @@ -190,8 +191,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->month = (*substr - '0'); ++substr; --sublen; - FORMAT_STARTSWITH('%'); - FORMAT_STARTSWITH('m'); + if (!format_startswith('%', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + if (!format_startswith('m', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->month = 10 * out->month + (*substr - '0'); @@ -231,7 +234,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } ++substr; --sublen; - FORMAT_STARTSWITH(ymd_sep); + if (!format_startswith(ymd_sep, format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} } /* PARSE THE DAY */ @@ -242,8 +246,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->day = (*substr - '0'); ++substr; --sublen; - FORMAT_STARTSWITH('%'); - FORMAT_STARTSWITH('d'); + if (!format_startswith('%', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + if (!format_startswith('d', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->day = 10 * out->day + (*substr - '0'); @@ -276,7 +282,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } - FORMAT_STARTSWITH(*substr); + if (!format_startswith(*substr, format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} ++substr; --sublen; @@ -285,8 +292,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (!isdigit(*substr)) { goto parse_error; } - FORMAT_STARTSWITH('%'); - FORMAT_STARTSWITH('H'); + if (!format_startswith('%', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + if (!format_startswith('H', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} out->hour = (*substr - '0'); ++substr; --sublen; @@ -326,7 +335,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0 || !isdigit(*substr)) { goto parse_error; } - FORMAT_STARTSWITH(':'); + if (!format_startswith(':', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { goto parse_error; @@ -339,8 +349,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->min = (*substr - '0'); ++substr; --sublen; - FORMAT_STARTSWITH('%'); - FORMAT_STARTSWITH('M'); + if (!format_startswith('%', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + if (!format_startswith('M', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->min = 10 * out->min + (*substr - '0'); @@ -369,7 +381,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* If we make it through this condition block, then the next * character is a digit. */ if (has_hms_sep && *substr == ':') { - FORMAT_STARTSWITH(':'); + if (!format_startswith(':', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} ++substr; --sublen; /* Cannot have a trailing ':' */ @@ -386,8 +399,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->sec = (*substr - '0'); ++substr; --sublen; - FORMAT_STARTSWITH('%'); - FORMAT_STARTSWITH('S'); + if (!format_startswith('%', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + if (!format_startswith('S', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->sec = 10 * out->sec + (*substr - '0'); @@ -409,15 +424,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen > 0 && *substr == '.') { ++substr; --sublen; - FORMAT_STARTSWITH('.'); + if (!format_startswith('.', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} } else { bestunit = NPY_FR_s; goto parse_timezone; } /* PARSE THE MICROSECONDS (0 to 6 digits) */ - FORMAT_STARTSWITH('%'); - FORMAT_STARTSWITH('f'); + if (!format_startswith('%', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + if (!format_startswith('f', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} numdigits = 0; for (i = 0; i < 6; ++i) { out->us *= 10; @@ -482,7 +500,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - FORMAT_STARTSWITH(' '); + if (!format_startswith(' ', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} } if (sublen == 0) { @@ -495,8 +514,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* UTC specifier */ if (*substr == 'Z') { - FORMAT_STARTSWITH('%'); - FORMAT_STARTSWITH('Z'); + if (!format_startswith('%', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + if (!format_startswith('Z', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { @@ -517,8 +538,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; } } else if (*substr == '-' || *substr == '+') { - FORMAT_STARTSWITH('%'); - FORMAT_STARTSWITH('z'); + if (!format_startswith('%', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + if (!format_startswith('z', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} /* Time zone offset */ int offset_neg = 0, offset_hour = 0, offset_minute = 0; @@ -602,7 +625,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - FORMAT_STARTSWITH(' '); + if (!format_startswith(' ', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} } if ((sublen != 0) || (format_len != 0)) { From 05319671ee4eba17f23c34637ac9285f20d9b8c1 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 30 Oct 2022 09:10:48 +0000 Subject: [PATCH 06/18] clean up --- .../tslibs/src/datetime/np_datetime_strings.c | 52 +++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 79dc730c25906..d3c9b330e0e63 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -75,7 +75,6 @@ int format_startswith(char ch, int format_len, char format, int exact) { } int parse_iso_8601_datetime(const char *str, int len, int want_exc, - npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, @@ -126,16 +125,16 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (format_len) {++format; --format_len;} } - if (!format_startswith('%', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (!format_startswith('Y', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (sublen == 0) { goto parse_error; } /* PARSE THE YEAR (4 digits) */ + if (!format_startswith('%', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + if (!format_startswith('Y', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + out->year = 0; if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && isdigit(substr[2]) && isdigit(substr[3])) { @@ -187,14 +186,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MONTH */ - /* First digit required */ - out->month = (*substr - '0'); - ++substr; - --sublen; if (!format_startswith('%', format_len, *format, exact)) goto parse_error; if (format_len) {++format; --format_len;} if (!format_startswith('m', format_len, *format, exact)) goto parse_error; if (format_len) {++format; --format_len;} + /* First digit required */ + out->month = (*substr - '0'); + ++substr; + --sublen; /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->month = 10 * out->month + (*substr - '0'); @@ -239,6 +238,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE DAY */ + if (!format_startswith('%', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} + if (!format_startswith('d', format_len, *format, exact)) goto parse_error; + if (format_len) {++format; --format_len;} /* First digit required */ if (!isdigit(*substr)) { goto parse_error; @@ -246,10 +249,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, out->day = (*substr - '0'); ++substr; --sublen; - if (!format_startswith('%', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (!format_startswith('d', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->day = 10 * out->day + (*substr - '0'); @@ -288,14 +287,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; /* PARSE THE HOURS */ - /* First digit required */ - if (!isdigit(*substr)) { - goto parse_error; - } if (!format_startswith('%', format_len, *format, exact)) goto parse_error; if (format_len) {++format; --format_len;} if (!format_startswith('H', format_len, *format, exact)) goto parse_error; if (format_len) {++format; --format_len;} + /* First digit required */ + if (!isdigit(*substr)) { + goto parse_error; + } out->hour = (*substr - '0'); ++substr; --sublen; @@ -345,14 +344,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MINUTES */ - /* First digit required */ - out->min = (*substr - '0'); - ++substr; - --sublen; if (!format_startswith('%', format_len, *format, exact)) goto parse_error; if (format_len) {++format; --format_len;} if (!format_startswith('M', format_len, *format, exact)) goto parse_error; if (format_len) {++format; --format_len;} + /* First digit required */ + out->min = (*substr - '0'); + ++substr; + --sublen; /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->min = 10 * out->min + (*substr - '0'); @@ -395,14 +394,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE SECONDS */ - /* First digit required */ - out->sec = (*substr - '0'); - ++substr; - --sublen; if (!format_startswith('%', format_len, *format, exact)) goto parse_error; if (format_len) {++format; --format_len;} if (!format_startswith('S', format_len, *format, exact)) goto parse_error; if (format_len) {++format; --format_len;} + /* First digit required */ + out->sec = (*substr - '0'); + ++substr; + --sublen; /* Second digit optional if there was a separator */ if (isdigit(*substr)) { out->sec = 10 * out->sec + (*substr - '0'); @@ -518,7 +517,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (format_len) {++format; --format_len;} if (!format_startswith('Z', format_len, *format, exact)) goto parse_error; if (format_len) {++format; --format_len;} - /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { *out_local = 1; From a57175323dd08dfd11ba1cc4b003b7520d5823c6 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 30 Oct 2022 09:33:53 +0000 Subject: [PATCH 07/18] :memo: restore docstring --- pandas/_libs/tslibs/src/datetime/np_datetime_strings.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index d3c9b330e0e63..1d1beea608867 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -68,6 +68,12 @@ This file implements string parsing and creation for NumPy datetime. */ int format_startswith(char ch, int format_len, char format, int exact) { + /* Check if the current character in `format` is `ch`. + + Always error on character mismatch conditioned on non-exhausted format, + or when format is exhausted in the exact case. + Note that if `format` hasn't been exhausted, it should be advanced + outside of this function. */ if ((format_len && format != ch) || (exact && !format_len)) { return 0; } From e814a2ed4e400e2b7452c932268198100c26d678 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 30 Oct 2022 12:24:44 +0000 Subject: [PATCH 08/18] inline --- pandas/_libs/tslibs/src/datetime/np_datetime_strings.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 1d1beea608867..cec770027b868 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -67,7 +67,7 @@ This file implements string parsing and creation for NumPy datetime. * Returns 0 on success, -1 on failure. */ -int format_startswith(char ch, int format_len, char format, int exact) { +inline int format_startswith(char ch, int format_len, char format, int exact) { /* Check if the current character in `format` is `ch`. Always error on character mismatch conditioned on non-exhausted format, From 19c34f8eab9bf03631f672b909b75905b55f8718 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 31 Oct 2022 14:40:56 +0000 Subject: [PATCH 09/18] set format default to None --- pandas/_libs/tslib.pyx | 6 +++--- pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/_libs/tslibs/np_datetime.pxd | 4 ++-- pandas/_libs/tslibs/np_datetime.pyx | 11 ++++++++--- pandas/_libs/tslibs/parsing.pyx | 2 +- pandas/core/tools/datetimes.py | 3 --- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 4fd015f15ed76..b86d0453bd2e1 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -89,7 +89,7 @@ def _test_parse_iso8601(ts: str): elif ts == 'today': return Timestamp.now().normalize() - string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, "", False) + string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True) obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts) check_dts_bounds(&obj.dts) if out_local == 1: @@ -445,8 +445,8 @@ cpdef array_to_datetime( bint utc=False, bint require_iso8601=False, bint allow_mixed=False, - str format="", - bint exact=False, + format: str | None=None, + bint exact=True, ): """ Converts a 1D array of date-like values to a numpy array of either: diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 481f66bc256bf..0db605d99a0a7 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -488,7 +488,7 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, else: string_to_dts_failed = string_to_dts( ts, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, "", False + &out_tzoffset, False, ) if not string_to_dts_failed: try: diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 770b1f7e46c12..de81c611c9ee9 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -95,8 +95,8 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - str format, - bint exact + format: str | None = *, + bint exact = * ) except? -1 cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 8acd51d8a4431..ec45fac6a5c69 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -274,8 +274,8 @@ cdef inline int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - str format, - bint exact, + format: str | None = None, + bint exact = True, ) except? -1: cdef: Py_ssize_t length @@ -284,7 +284,12 @@ cdef inline int string_to_dts( const char* format_buf buf = get_c_string_buf_and_size(val, &length) - format_buf = get_c_string_buf_and_size(format, &format_length) + if format is None: + format_buf = b'' + format_length = 0 + exact = False + else: + format_buf = get_c_string_buf_and_size(format, &format_length) return parse_iso_8601_datetime(buf, length, want_exc, dts, out_bestunit, out_local, out_tzoffset, format_buf, format_length, exact) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 965538044fd72..46042cabe97a1 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -409,7 +409,7 @@ cdef parse_datetime_string_with_reso( # TODO: does this render some/all of parse_delimited_date redundant? string_to_dts_failed = string_to_dts( date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, "", False + &out_tzoffset, False, ) if not string_to_dts_failed: if dts.ps != 0 or out_local: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b7a4700b74e0c..1800c91ee16f0 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -433,9 +433,6 @@ def _convert_listlike_datetimes( ) if res is not None: return res - elif format is None: - format = "" - exact = False utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( From eb50dfbb50126f2a4b4ebcdc0543f2d8f9b41463 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 31 Oct 2022 14:47:28 +0000 Subject: [PATCH 10/18] clean up --- pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/_libs/tslibs/parsing.pyx | 2 +- pandas/core/arrays/datetimes.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 0db605d99a0a7..923dfa3c54d26 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -488,7 +488,7 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, else: string_to_dts_failed = string_to_dts( ts, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, + &out_tzoffset, False ) if not string_to_dts_failed: try: diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 46042cabe97a1..1312124cfb77b 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -409,7 +409,7 @@ cdef parse_datetime_string_with_reso( # TODO: does this render some/all of parse_delimited_date redundant? string_to_dts_failed = string_to_dts( date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, + &out_tzoffset, False ) if not string_to_dts_failed: if dts.ps != 0 or out_local: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e1c7647fc7295..b52b31d4ecf2f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2133,8 +2133,8 @@ def objects_to_datetime64ns( require_iso8601: bool = False, allow_object: bool = False, allow_mixed: bool = False, - format: str = "", - exact: bool = False, + format: str | None = None, + exact: bool = True, ): """ Convert data to array of timestamps. From 0dd74072ce014df7402b96b042f3491bb829efd7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 6 Nov 2022 13:02:01 +0000 Subject: [PATCH 11/18] remove function, perform check inline --- .../tslibs/src/datetime/np_datetime_strings.c | 135 +++++++++--------- 1 file changed, 66 insertions(+), 69 deletions(-) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index cec770027b868..e5a14b64da266 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -67,19 +67,6 @@ This file implements string parsing and creation for NumPy datetime. * Returns 0 on success, -1 on failure. */ -inline int format_startswith(char ch, int format_len, char format, int exact) { - /* Check if the current character in `format` is `ch`. - - Always error on character mismatch conditioned on non-exhausted format, - or when format is exhausted in the exact case. - Note that if `format` hasn't been exhausted, it should be advanced - outside of this function. */ - if ((format_len && format != ch) || (exact && !format_len)) { - return 0; - } - return 1; -} - int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, @@ -119,16 +106,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if (!format_startswith(' ', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 1 && exact) goto parse_error; + if (*format++ != ' ') goto parse_error; + if (format_len) --format_len; } /* Leading '-' sign for negative year */ if (*substr == '-') { ++substr; --sublen; - if (!format_startswith('-', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 1 && exact) goto parse_error; + if (*format++ != '-') goto parse_error; + if (format_len) --format_len; } if (sublen == 0) { @@ -136,10 +125,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE YEAR (4 digits) */ - if (!format_startswith('%', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (!format_startswith('Y', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 2 && exact) goto parse_error; + if (*format++ != '%') goto parse_error; + if (*format++ != 'Y') goto parse_error; + if (format_len) format_len -= 2; out->year = 0; if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && @@ -183,8 +172,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ymd_sep = valid_ymd_sep[i]; ++substr; --sublen; - if (!format_startswith(ymd_sep, format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 1 && exact) goto parse_error; + if (*format++ != ymd_sep) goto parse_error; + if (format_len) --format_len; /* Cannot have trailing separator */ if (sublen == 0 || !isdigit(*substr)) { goto parse_error; @@ -192,10 +182,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MONTH */ - if (!format_startswith('%', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (!format_startswith('m', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 2 && exact) goto parse_error; + if (*format++ != '%') goto parse_error; + if (*format++ != 'm') goto parse_error; + if (format_len) format_len -= 2; /* First digit required */ out->month = (*substr - '0'); ++substr; @@ -239,15 +229,16 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } ++substr; --sublen; - if (!format_startswith(ymd_sep, format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 1 && exact) goto parse_error; + if (*format++ != ymd_sep) goto parse_error; + if (format_len) --format_len; } /* PARSE THE DAY */ - if (!format_startswith('%', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (!format_startswith('d', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 2 && exact) goto parse_error; + if (*format++ != '%') goto parse_error; + if (*format++ != 'd') goto parse_error; + if (format_len) format_len -= 2; /* First digit required */ if (!isdigit(*substr)) { goto parse_error; @@ -284,19 +275,20 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto finish; } + if (format_len < 1 && exact) goto parse_error; + if (*format++ != *substr) goto parse_error; + if (format_len) --format_len; if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } - if (!format_startswith(*substr, format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} ++substr; --sublen; /* PARSE THE HOURS */ - if (!format_startswith('%', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (!format_startswith('H', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 2 && exact) goto parse_error; + if (*format++ != '%') goto parse_error; + if (*format++ != 'H') goto parse_error; + if (format_len) format_len -= 2; /* First digit required */ if (!isdigit(*substr)) { goto parse_error; @@ -340,8 +332,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0 || !isdigit(*substr)) { goto parse_error; } - if (!format_startswith(':', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 1 && exact) goto parse_error; + if (*format++ != ':') goto parse_error; + if (format_len) --format_len; } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { goto parse_error; @@ -350,10 +343,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MINUTES */ - if (!format_startswith('%', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (!format_startswith('M', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 2 && exact) goto parse_error; + if (*format++ != '%') goto parse_error; + if (*format++ != 'M') goto parse_error; + if (format_len) format_len -= 2; /* First digit required */ out->min = (*substr - '0'); ++substr; @@ -386,8 +379,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* If we make it through this condition block, then the next * character is a digit. */ if (has_hms_sep && *substr == ':') { - if (!format_startswith(':', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 1 && exact) goto parse_error; + if (*format++ != ':') goto parse_error; + if (format_len) --format_len; ++substr; --sublen; /* Cannot have a trailing ':' */ @@ -400,10 +394,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE SECONDS */ - if (!format_startswith('%', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (!format_startswith('S', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 2 && exact) goto parse_error; + if (*format++ != '%') goto parse_error; + if (*format++ != 'S') goto parse_error; + if (format_len) format_len -= 2; /* First digit required */ out->sec = (*substr - '0'); ++substr; @@ -429,18 +423,19 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen > 0 && *substr == '.') { ++substr; --sublen; - if (!format_startswith('.', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 1 && exact) goto parse_error; + if (*format++ != '.') goto parse_error; + if (format_len) --format_len; } else { bestunit = NPY_FR_s; goto parse_timezone; } /* PARSE THE MICROSECONDS (0 to 6 digits) */ - if (!format_startswith('%', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (!format_startswith('f', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 2 && exact) goto parse_error; + if (*format++ != '%') goto parse_error; + if (*format++ != 'S') goto parse_error; + if (format_len) format_len -= 2; numdigits = 0; for (i = 0; i < 6; ++i) { out->us *= 10; @@ -505,8 +500,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if (!format_startswith(' ', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 1 && exact) goto parse_error; + if (*format++ != ' ') goto parse_error; + if (format_len) --format_len; } if (sublen == 0) { @@ -519,10 +515,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* UTC specifier */ if (*substr == 'Z') { - if (!format_startswith('%', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (!format_startswith('Z', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 2 && exact) goto parse_error; + if (*format++ != '%') goto parse_error; + if (*format++ != 'Z') goto parse_error; + if (format_len) format_len -= 2; /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { *out_local = 1; @@ -542,10 +538,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; } } else if (*substr == '-' || *substr == '+') { - if (!format_startswith('%', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} - if (!format_startswith('z', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 2 && exact) goto parse_error; + if (*format++ != '%') goto parse_error; + if (*format++ != 'z') goto parse_error; + if (format_len) format_len -= 2; /* Time zone offset */ int offset_neg = 0, offset_hour = 0, offset_minute = 0; @@ -629,8 +625,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if (!format_startswith(' ', format_len, *format, exact)) goto parse_error; - if (format_len) {++format; --format_len;} + if (format_len < 1 && exact) goto parse_error; + if (*format++ != ' ') goto parse_error; + if (format_len) --format_len; } if ((sublen != 0) || (format_len != 0)) { From 3acfdf6190a0b30183846b15986c61a4f8eafd26 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 7 Nov 2022 15:47:06 +0000 Subject: [PATCH 12/18] only compare *format++ if format_len --- .../tslibs/src/datetime/np_datetime_strings.c | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index e5a14b64da266..f0264db102045 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -107,7 +107,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; if (format_len < 1 && exact) goto parse_error; - if (*format++ != ' ') goto parse_error; + if (format_len && *format++ != ' ') goto parse_error; if (format_len) --format_len; } @@ -116,7 +116,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; if (format_len < 1 && exact) goto parse_error; - if (*format++ != '-') goto parse_error; + if (format_len && *format++ != '-') goto parse_error; if (format_len) --format_len; } @@ -126,8 +126,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE YEAR (4 digits) */ if (format_len < 2 && exact) goto parse_error; - if (*format++ != '%') goto parse_error; - if (*format++ != 'Y') goto parse_error; + if (format_len && *format++ != '%') goto parse_error; + if (format_len && *format++ != 'Y') goto parse_error; if (format_len) format_len -= 2; out->year = 0; @@ -173,7 +173,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; if (format_len < 1 && exact) goto parse_error; - if (*format++ != ymd_sep) goto parse_error; + if (format_len && *format++ != ymd_sep) goto parse_error; if (format_len) --format_len; /* Cannot have trailing separator */ if (sublen == 0 || !isdigit(*substr)) { @@ -183,8 +183,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE MONTH */ if (format_len < 2 && exact) goto parse_error; - if (*format++ != '%') goto parse_error; - if (*format++ != 'm') goto parse_error; + if (format_len && *format++ != '%') goto parse_error; + if (format_len && *format++ != 'm') goto parse_error; if (format_len) format_len -= 2; /* First digit required */ out->month = (*substr - '0'); @@ -230,14 +230,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; if (format_len < 1 && exact) goto parse_error; - if (*format++ != ymd_sep) goto parse_error; + if (format_len && *format++ != ymd_sep) goto parse_error; if (format_len) --format_len; } /* PARSE THE DAY */ if (format_len < 2 && exact) goto parse_error; - if (*format++ != '%') goto parse_error; - if (*format++ != 'd') goto parse_error; + if (format_len && *format++ != '%') goto parse_error; + if (format_len && *format++ != 'd') goto parse_error; if (format_len) format_len -= 2; /* First digit required */ if (!isdigit(*substr)) { @@ -275,19 +275,19 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto finish; } - if (format_len < 1 && exact) goto parse_error; - if (*format++ != *substr) goto parse_error; - if (format_len) --format_len; if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } + if (format_len < 1 && exact) goto parse_error; + if (format_len && *format++ != *substr) goto parse_error; + if (format_len) --format_len; ++substr; --sublen; /* PARSE THE HOURS */ if (format_len < 2 && exact) goto parse_error; - if (*format++ != '%') goto parse_error; - if (*format++ != 'H') goto parse_error; + if (format_len && *format++ != '%') goto parse_error; + if (format_len && *format++ != 'H') goto parse_error; if (format_len) format_len -= 2; /* First digit required */ if (!isdigit(*substr)) { @@ -333,7 +333,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto parse_error; } if (format_len < 1 && exact) goto parse_error; - if (*format++ != ':') goto parse_error; + if (format_len && *format++ != ':') goto parse_error; if (format_len) --format_len; } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { @@ -344,8 +344,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE MINUTES */ if (format_len < 2 && exact) goto parse_error; - if (*format++ != '%') goto parse_error; - if (*format++ != 'M') goto parse_error; + if (format_len && *format++ != '%') goto parse_error; + if (format_len && *format++ != 'M') goto parse_error; if (format_len) format_len -= 2; /* First digit required */ out->min = (*substr - '0'); @@ -380,7 +380,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, * character is a digit. */ if (has_hms_sep && *substr == ':') { if (format_len < 1 && exact) goto parse_error; - if (*format++ != ':') goto parse_error; + if (format_len && *format++ != ':') goto parse_error; if (format_len) --format_len; ++substr; --sublen; @@ -395,8 +395,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE SECONDS */ if (format_len < 2 && exact) goto parse_error; - if (*format++ != '%') goto parse_error; - if (*format++ != 'S') goto parse_error; + if (format_len && *format++ != '%') goto parse_error; + if (format_len && *format++ != 'S') goto parse_error; if (format_len) format_len -= 2; /* First digit required */ out->sec = (*substr - '0'); @@ -424,7 +424,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; if (format_len < 1 && exact) goto parse_error; - if (*format++ != '.') goto parse_error; + if (format_len && *format++ != '.') goto parse_error; if (format_len) --format_len; } else { bestunit = NPY_FR_s; @@ -433,8 +433,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* PARSE THE MICROSECONDS (0 to 6 digits) */ if (format_len < 2 && exact) goto parse_error; - if (*format++ != '%') goto parse_error; - if (*format++ != 'S') goto parse_error; + if (format_len && *format++ != '%') goto parse_error; + if (format_len && *format++ != 'f') goto parse_error; if (format_len) format_len -= 2; numdigits = 0; for (i = 0; i < 6; ++i) { @@ -501,7 +501,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; if (format_len < 1 && exact) goto parse_error; - if (*format++ != ' ') goto parse_error; + if (format_len && *format++ != ' ') goto parse_error; if (format_len) --format_len; } @@ -516,8 +516,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* UTC specifier */ if (*substr == 'Z') { if (format_len < 2 && exact) goto parse_error; - if (*format++ != '%') goto parse_error; - if (*format++ != 'Z') goto parse_error; + if (format_len && *format++ != '%') goto parse_error; + if (format_len && *format++ != 'Z') goto parse_error; if (format_len) format_len -= 2; /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { @@ -539,8 +539,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } } else if (*substr == '-' || *substr == '+') { if (format_len < 2 && exact) goto parse_error; - if (*format++ != '%') goto parse_error; - if (*format++ != 'z') goto parse_error; + if (format_len && *format++ != '%') goto parse_error; + if (format_len && *format++ != 'z') goto parse_error; if (format_len) format_len -= 2; /* Time zone offset */ int offset_neg = 0, offset_hour = 0, offset_minute = 0; @@ -626,7 +626,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; if (format_len < 1 && exact) goto parse_error; - if (*format++ != ' ') goto parse_error; + if (format_len && *format++ != ' ') goto parse_error; if (format_len) --format_len; } From f3060c9afab7f8cd8e4362324cc3978d71314a73 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 7 Nov 2022 15:55:41 +0000 Subject: [PATCH 13/18] clean up --- .pre-commit-config.yaml | 11 ++--------- pandas/_libs/tslibs/np_datetime.pyx | 4 ++-- .../_libs/tslibs/src/datetime/np_datetime_strings.c | 1 - 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6ccaaa3c8eb25..6aa1f5659365f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - id: codespell types_or: [python, rst, markdown] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.4.0 + rev: v0.2.1 hooks: - id: cython-lint - repo: https://github.com/pre-commit/pre-commit-hooks @@ -48,14 +48,7 @@ repos: # this particular codebase (e.g. src/headers, src/klib). However, # we can lint all header files since they aren't "generated" like C files are. exclude: ^pandas/_libs/src/(klib|headers)/ - args: [ - --quiet, - '--extensions=c,h', - '--headers=h', - --recursive, - '--filter=-readability/casting,-runtime/int,-build/include_subdir', - '--linelength=88' - ] + args: [--quiet, '--extensions=c,h', '--headers=h', --recursive, '--filter=-readability/casting,-runtime/int,-build/include_subdir'] - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 08d782ec0de06..d49c41e54764f 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -278,8 +278,8 @@ cdef inline int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None = None, - bint exact = True, + format: str | None=None, + bint exact=True, ) except? -1: cdef: Py_ssize_t length diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index f0264db102045..e1893d1819cb2 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -66,7 +66,6 @@ This file implements string parsing and creation for NumPy datetime. * * Returns 0 on success, -1 on failure. */ - int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, From 7310e139e0df5207c958e5f213614a3a5962a78e Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 7 Nov 2022 17:17:42 +0000 Subject: [PATCH 14/18] typing --- pandas/_libs/tslib.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index b8dc696b4d66c..bb70b07dd4908 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -25,7 +25,7 @@ def array_to_datetime( utc: bool = ..., require_iso8601: bool = ..., allow_mixed: bool = ..., - format: str = ..., + format: str | None = ..., exact: bool = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... From 3ceb1ee1c5291536c205853a996d725ffab107d0 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 9 Nov 2022 10:38:02 +0000 Subject: [PATCH 15/18] split out branches --- .pre-commit-config.yaml | 8 +- .../tslibs/src/datetime/np_datetime_strings.c | 220 ++++++++++++------ 2 files changed, 161 insertions(+), 67 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6aa1f5659365f..d7f1549bd3928 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,7 +48,13 @@ repos: # this particular codebase (e.g. src/headers, src/klib). However, # we can lint all header files since they aren't "generated" like C files are. exclude: ^pandas/_libs/src/(klib|headers)/ - args: [--quiet, '--extensions=c,h', '--headers=h', --recursive, '--filter=-readability/casting,-runtime/int,-build/include_subdir'] + args: [ + --quiet, + '--extensions=c,h', + '--headers=h', + --recursive, + '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' + ] - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index e1893d1819cb2..220baf499e1c7 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -71,6 +71,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, const char* format, int format_len, int exact) { + if (len < 0 || format_len < 0) + goto parse_error; int year_leap = 0; int i, numdigits; const char *substr; @@ -105,18 +107,28 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if (format_len < 1 && exact) goto parse_error; - if (format_len && *format++ != ' ') goto parse_error; - if (format_len) --format_len; + if (format_len < 1) { + if (exact) + goto parse_error; + } else { + if (*format++ != ' ') + goto parse_error; + format_len -= 1; + } } /* Leading '-' sign for negative year */ if (*substr == '-') { ++substr; --sublen; - if (format_len < 1 && exact) goto parse_error; - if (format_len && *format++ != '-') goto parse_error; - if (format_len) --format_len; + if (format_len < 1) { + if (exact) + goto parse_error; + } else { + if (*format++ != '-') + goto parse_error; + format_len -= 1; + } } if (sublen == 0) { @@ -124,10 +136,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE YEAR (4 digits) */ - if (format_len < 2 && exact) goto parse_error; - if (format_len && *format++ != '%') goto parse_error; - if (format_len && *format++ != 'Y') goto parse_error; - if (format_len) format_len -= 2; + if (format_len < 2) { + if (exact || format_len) + goto parse_error; + } else { + if (*format++ != '%' || *format++ != 'Y') + goto parse_error; + format_len -= 2; + } out->year = 0; if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && @@ -171,9 +187,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ymd_sep = valid_ymd_sep[i]; ++substr; --sublen; - if (format_len < 1 && exact) goto parse_error; - if (format_len && *format++ != ymd_sep) goto parse_error; - if (format_len) --format_len; + if (format_len < 1) { + if (exact) + goto parse_error; + } else { + if (*format++ != ymd_sep) + goto parse_error; + format_len -= 1; + } /* Cannot have trailing separator */ if (sublen == 0 || !isdigit(*substr)) { goto parse_error; @@ -181,10 +202,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MONTH */ - if (format_len < 2 && exact) goto parse_error; - if (format_len && *format++ != '%') goto parse_error; - if (format_len && *format++ != 'm') goto parse_error; - if (format_len) format_len -= 2; + if (format_len < 2) { + if (exact || format_len) + goto parse_error; + } else { + if (*format++ != '%' || *format++ != 'm') + goto parse_error; + format_len -= 2; + } /* First digit required */ out->month = (*substr - '0'); ++substr; @@ -228,16 +253,25 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } ++substr; --sublen; - if (format_len < 1 && exact) goto parse_error; - if (format_len && *format++ != ymd_sep) goto parse_error; - if (format_len) --format_len; + if (format_len < 1) { + if (exact) + goto parse_error; + } else { + if (*format++ != ymd_sep) + goto parse_error; + format_len -= 1; + } } /* PARSE THE DAY */ - if (format_len < 2 && exact) goto parse_error; - if (format_len && *format++ != '%') goto parse_error; - if (format_len && *format++ != 'd') goto parse_error; - if (format_len) format_len -= 2; + if (format_len < 2) { + if (exact || format_len) + goto parse_error; + } else { + if (*format++ != '%' || *format++ != 'd') + goto parse_error; + format_len -= 2; + } /* First digit required */ if (!isdigit(*substr)) { goto parse_error; @@ -277,17 +311,26 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } - if (format_len < 1 && exact) goto parse_error; - if (format_len && *format++ != *substr) goto parse_error; - if (format_len) --format_len; + if (format_len < 1) { + if (exact) + goto parse_error; + } else { + if (*format++ != *substr) + goto parse_error; + format_len -= 1; + } ++substr; --sublen; /* PARSE THE HOURS */ - if (format_len < 2 && exact) goto parse_error; - if (format_len && *format++ != '%') goto parse_error; - if (format_len && *format++ != 'H') goto parse_error; - if (format_len) format_len -= 2; + if (format_len < 2) { + if (exact || format_len) + goto parse_error; + } else { + if (*format++ != '%' || *format++ != 'H') + goto parse_error; + format_len -= 2; + } /* First digit required */ if (!isdigit(*substr)) { goto parse_error; @@ -331,9 +374,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0 || !isdigit(*substr)) { goto parse_error; } - if (format_len < 1 && exact) goto parse_error; - if (format_len && *format++ != ':') goto parse_error; - if (format_len) --format_len; + if (format_len < 1) { + if (exact) + goto parse_error; + } else { + if (*format++ != ':') + goto parse_error; + format_len -= 1; + } } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { goto parse_error; @@ -342,10 +390,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MINUTES */ - if (format_len < 2 && exact) goto parse_error; - if (format_len && *format++ != '%') goto parse_error; - if (format_len && *format++ != 'M') goto parse_error; - if (format_len) format_len -= 2; + if (format_len < 2) { + if (exact || format_len) + goto parse_error; + } else { + if (*format++ != '%' || *format++ != 'M') + goto parse_error; + format_len -= 2; + } /* First digit required */ out->min = (*substr - '0'); ++substr; @@ -378,9 +430,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* If we make it through this condition block, then the next * character is a digit. */ if (has_hms_sep && *substr == ':') { - if (format_len < 1 && exact) goto parse_error; - if (format_len && *format++ != ':') goto parse_error; - if (format_len) --format_len; + if (format_len < 1) { + if (exact) + goto parse_error; + } else { + if (*format++ != ':') + goto parse_error; + format_len -= 1; + } ++substr; --sublen; /* Cannot have a trailing ':' */ @@ -393,10 +450,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE SECONDS */ - if (format_len < 2 && exact) goto parse_error; - if (format_len && *format++ != '%') goto parse_error; - if (format_len && *format++ != 'S') goto parse_error; - if (format_len) format_len -= 2; + if (format_len < 2) { + if (exact || format_len) + goto parse_error; + } else { + if (*format++ != '%' || *format++ != 'S') + goto parse_error; + format_len -= 2; + } /* First digit required */ out->sec = (*substr - '0'); ++substr; @@ -422,19 +483,28 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen > 0 && *substr == '.') { ++substr; --sublen; - if (format_len < 1 && exact) goto parse_error; - if (format_len && *format++ != '.') goto parse_error; - if (format_len) --format_len; + if (format_len < 1) { + if (exact) + goto parse_error; + } else { + if (*format++ != '.') + goto parse_error; + format_len -= 1; + } } else { bestunit = NPY_FR_s; goto parse_timezone; } /* PARSE THE MICROSECONDS (0 to 6 digits) */ - if (format_len < 2 && exact) goto parse_error; - if (format_len && *format++ != '%') goto parse_error; - if (format_len && *format++ != 'f') goto parse_error; - if (format_len) format_len -= 2; + if (format_len < 2) { + if (exact || format_len) + goto parse_error; + } else { + if (*format++ != '%' || *format++ != 'f') + goto parse_error; + format_len -= 2; + } numdigits = 0; for (i = 0; i < 6; ++i) { out->us *= 10; @@ -499,9 +569,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if (format_len < 1 && exact) goto parse_error; - if (format_len && *format++ != ' ') goto parse_error; - if (format_len) --format_len; + if (format_len < 1) { + if (exact) + goto parse_error; + } else { + if (*format++ != ' ') + goto parse_error; + format_len -= 1; + } } if (sublen == 0) { @@ -514,10 +589,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* UTC specifier */ if (*substr == 'Z') { - if (format_len < 2 && exact) goto parse_error; - if (format_len && *format++ != '%') goto parse_error; - if (format_len && *format++ != 'Z') goto parse_error; - if (format_len) format_len -= 2; + if (format_len < 2) { + if (exact || format_len) + goto parse_error; + } else { + if (*format++ != '%' || *format++ != 'Z') + goto parse_error; + format_len -= 2; + } /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { *out_local = 1; @@ -537,10 +616,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; } } else if (*substr == '-' || *substr == '+') { - if (format_len < 2 && exact) goto parse_error; - if (format_len && *format++ != '%') goto parse_error; - if (format_len && *format++ != 'z') goto parse_error; - if (format_len) format_len -= 2; + if (format_len < 2) { + if (exact || format_len) + goto parse_error; + } else { + if (*format++ != '%' || *format++ != 'z') + goto parse_error; + format_len -= 2; + } /* Time zone offset */ int offset_neg = 0, offset_hour = 0, offset_minute = 0; @@ -624,9 +707,14 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if (format_len < 1 && exact) goto parse_error; - if (format_len && *format++ != ' ') goto parse_error; - if (format_len) --format_len; + if (format_len < 1) { + if (exact) + goto parse_error; + } else { + if (*format++ != ' ') + goto parse_error; + format_len -= 1; + } } if ((sublen != 0) || (format_len != 0)) { From 080f0183b0f96879dab0540a9661adf7b672f7c1 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 13 Nov 2022 12:35:20 +0000 Subject: [PATCH 16/18] use compare_format function --- .pre-commit-config.yaml | 8 +- .../tslibs/src/datetime/np_datetime_strings.c | 206 ++++++------------ 2 files changed, 73 insertions(+), 141 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d7f1549bd3928..6aa1f5659365f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,13 +48,7 @@ repos: # this particular codebase (e.g. src/headers, src/klib). However, # we can lint all header files since they aren't "generated" like C files are. exclude: ^pandas/_libs/src/(klib|headers)/ - args: [ - --quiet, - '--extensions=c,h', - '--headers=h', - --recursive, - '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' - ] + args: [--quiet, '--extensions=c,h', '--headers=h', --recursive, '--filter=-readability/casting,-runtime/int,-build/include_subdir'] - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 220baf499e1c7..76e3e1147df00 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -66,6 +66,38 @@ This file implements string parsing and creation for NumPy datetime. * * Returns 0 on success, -1 on failure. */ + +// This function will advance the pointer on format +// and decrement characters_remaining by n on success +// On failure will return -1 without incrementing +int compare_format(const char **format, int *characters_remaining, + const char *compare_to, int n, const int exact) { + if (*characters_remaining < n) { + if (exact) { + // TODO(pandas-dev): in the future we should set a PyErr here + // to be very clear about what went wrong + return -1; + } else if (*characters_remaining) { + // TODO(pandas-dev): same return value in this function as + // above branch, but stub out a future where + // we have a better error message + return -1; + } else { + return 0; + } + } else { + if (strncmp(*format, compare_to, n)) { + // TODO(pandas-dev): PyErr to differentiate what went wrong + return -1; + } else { + *format += n; + *characters_remaining -= n; + return 0; + } + } + return 0; +} + int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, @@ -107,13 +139,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if (format_len < 1) { - if (exact) - goto parse_error; - } else { - if (*format++ != ' ') - goto parse_error; - format_len -= 1; + if (compare_format(&format, &format_len, " ", 1, exact)) { + goto parse_error; } } @@ -121,14 +148,6 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (*substr == '-') { ++substr; --sublen; - if (format_len < 1) { - if (exact) - goto parse_error; - } else { - if (*format++ != '-') - goto parse_error; - format_len -= 1; - } } if (sublen == 0) { @@ -136,13 +155,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE YEAR (4 digits) */ - if (format_len < 2) { - if (exact || format_len) - goto parse_error; - } else { - if (*format++ != '%' || *format++ != 'Y') - goto parse_error; - format_len -= 2; + if (compare_format(&format, &format_len, "%Y", 2, exact)) { + goto parse_error; } out->year = 0; @@ -187,13 +201,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ymd_sep = valid_ymd_sep[i]; ++substr; --sublen; - if (format_len < 1) { - if (exact) - goto parse_error; - } else { - if (*format++ != ymd_sep) - goto parse_error; - format_len -= 1; + + const char tmp[1] = {ymd_sep}; + if (compare_format(&format, &format_len, tmp, 1, exact)) { + goto parse_error; } /* Cannot have trailing separator */ if (sublen == 0 || !isdigit(*substr)) { @@ -202,13 +213,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MONTH */ - if (format_len < 2) { - if (exact || format_len) - goto parse_error; - } else { - if (*format++ != '%' || *format++ != 'm') - goto parse_error; - format_len -= 2; + if (compare_format(&format, &format_len, "%m", 2, exact)) { + goto parse_error; } /* First digit required */ out->month = (*substr - '0'); @@ -253,24 +259,15 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } ++substr; --sublen; - if (format_len < 1) { - if (exact) - goto parse_error; - } else { - if (*format++ != ymd_sep) - goto parse_error; - format_len -= 1; + const char tmp[1] = {ymd_sep}; + if (compare_format(&format, &format_len, tmp, 1, exact)) { + goto parse_error; } } /* PARSE THE DAY */ - if (format_len < 2) { - if (exact || format_len) - goto parse_error; - } else { - if (*format++ != '%' || *format++ != 'd') - goto parse_error; - format_len -= 2; + if (compare_format(&format, &format_len, "%d", 2, exact)) { + goto parse_error; } /* First digit required */ if (!isdigit(*substr)) { @@ -311,25 +308,16 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } - if (format_len < 1) { - if (exact) - goto parse_error; - } else { - if (*format++ != *substr) - goto parse_error; - format_len -= 1; + const char tmp[1] = {*substr}; + if (compare_format(&format, &format_len, tmp, 1, exact)) { + goto parse_error; } ++substr; --sublen; /* PARSE THE HOURS */ - if (format_len < 2) { - if (exact || format_len) - goto parse_error; - } else { - if (*format++ != '%' || *format++ != 'H') - goto parse_error; - format_len -= 2; + if (compare_format(&format, &format_len, "%H", 2, exact)) { + goto parse_error; } /* First digit required */ if (!isdigit(*substr)) { @@ -374,13 +362,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0 || !isdigit(*substr)) { goto parse_error; } - if (format_len < 1) { - if (exact) - goto parse_error; - } else { - if (*format++ != ':') - goto parse_error; - format_len -= 1; + if (compare_format(&format, &format_len, ":", 1, exact)) { + goto parse_error; } } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { @@ -390,13 +373,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MINUTES */ - if (format_len < 2) { - if (exact || format_len) - goto parse_error; - } else { - if (*format++ != '%' || *format++ != 'M') - goto parse_error; - format_len -= 2; + if (compare_format(&format, &format_len, "%M", 2, exact)) { + goto parse_error; } /* First digit required */ out->min = (*substr - '0'); @@ -430,13 +408,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* If we make it through this condition block, then the next * character is a digit. */ if (has_hms_sep && *substr == ':') { - if (format_len < 1) { - if (exact) - goto parse_error; - } else { - if (*format++ != ':') - goto parse_error; - format_len -= 1; + if (compare_format(&format, &format_len, ":", 1, exact)) { + goto parse_error; } ++substr; --sublen; @@ -450,13 +423,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE SECONDS */ - if (format_len < 2) { - if (exact || format_len) - goto parse_error; - } else { - if (*format++ != '%' || *format++ != 'S') - goto parse_error; - format_len -= 2; + if (compare_format(&format, &format_len, "%S", 2, exact)) { + goto parse_error; } /* First digit required */ out->sec = (*substr - '0'); @@ -483,13 +451,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen > 0 && *substr == '.') { ++substr; --sublen; - if (format_len < 1) { - if (exact) - goto parse_error; - } else { - if (*format++ != '.') - goto parse_error; - format_len -= 1; + if (compare_format(&format, &format_len, ".", 1, exact)) { + goto parse_error; } } else { bestunit = NPY_FR_s; @@ -497,13 +460,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MICROSECONDS (0 to 6 digits) */ - if (format_len < 2) { - if (exact || format_len) - goto parse_error; - } else { - if (*format++ != '%' || *format++ != 'f') - goto parse_error; - format_len -= 2; + if (compare_format(&format, &format_len, "%f", 2, exact)) { + goto parse_error; } numdigits = 0; for (i = 0; i < 6; ++i) { @@ -569,13 +527,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if (format_len < 1) { - if (exact) - goto parse_error; - } else { - if (*format++ != ' ') - goto parse_error; - format_len -= 1; + if (compare_format(&format, &format_len, " ", 1, exact)) { + goto parse_error; } } @@ -589,13 +542,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* UTC specifier */ if (*substr == 'Z') { - if (format_len < 2) { - if (exact || format_len) - goto parse_error; - } else { - if (*format++ != '%' || *format++ != 'Z') - goto parse_error; - format_len -= 2; + if (compare_format(&format, &format_len, "%Z", 2, exact)) { + goto parse_error; } /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { @@ -616,13 +564,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; } } else if (*substr == '-' || *substr == '+') { - if (format_len < 2) { - if (exact || format_len) - goto parse_error; - } else { - if (*format++ != '%' || *format++ != 'z') - goto parse_error; - format_len -= 2; + if (compare_format(&format, &format_len, "%z", 2, exact)) { + goto parse_error; } /* Time zone offset */ int offset_neg = 0, offset_hour = 0, offset_minute = 0; @@ -707,13 +650,8 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if (format_len < 1) { - if (exact) - goto parse_error; - } else { - if (*format++ != ' ') - goto parse_error; - format_len -= 1; + if (compare_format(&format, &format_len, " ", 1, exact)) { + goto parse_error; } } From 031e0e3660b825cdcde2999f90b8429f640afff0 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 13 Nov 2022 14:39:44 +0000 Subject: [PATCH 17/18] remove tmp variable --- .../_libs/tslibs/src/datetime/np_datetime_strings.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 76e3e1147df00..597a2aae7a2a3 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -70,7 +70,7 @@ This file implements string parsing and creation for NumPy datetime. // This function will advance the pointer on format // and decrement characters_remaining by n on success // On failure will return -1 without incrementing -int compare_format(const char **format, int *characters_remaining, +static int compare_format(const char **format, int *characters_remaining, const char *compare_to, int n, const int exact) { if (*characters_remaining < n) { if (exact) { @@ -202,8 +202,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; - const char tmp[1] = {ymd_sep}; - if (compare_format(&format, &format_len, tmp, 1, exact)) { + if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { goto parse_error; } /* Cannot have trailing separator */ @@ -259,8 +258,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } ++substr; --sublen; - const char tmp[1] = {ymd_sep}; - if (compare_format(&format, &format_len, tmp, 1, exact)) { + if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { goto parse_error; } } @@ -308,8 +306,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } - const char tmp[1] = {*substr}; - if (compare_format(&format, &format_len, tmp, 1, exact)) { + if (compare_format(&format, &format_len, substr, 1, exact)) { goto parse_error; } ++substr; From c1e6bc2573ebcd6ec9833cdf597d3c890d0f8503 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 17 Nov 2022 10:08:21 +0000 Subject: [PATCH 18/18] Add co-authors > > Co-authored-by: fdrocha <> Co-authored-by: nikitaved <>