From 1104a9276b40233710d5ba8a9d790e57d4cdcd4f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 08:41:08 +0100 Subject: [PATCH 01/12] :wastebasket: deprecate infer_datetime_format, make strict --- pandas/core/tools/datetimes.py | 92 ++++++++++++-------------------- pandas/io/parsers/base_parser.py | 5 -- pandas/io/parsers/readers.py | 39 +++++++------- 3 files changed, 53 insertions(+), 83 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7791ea804a52a..5760952ba7324 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -19,7 +19,10 @@ import numpy as np -from pandas._libs import tslib +from pandas._libs import ( + lib, + tslib, +) from pandas._libs.tslibs import ( OutOfBoundsDatetime, Timedelta, @@ -331,7 +334,6 @@ def _convert_listlike_datetimes( tz: Timezone | None = None, unit: str | None = None, errors: DateTimeErrorChoices = "raise", - infer_datetime_format: bool = False, dayfirst: bool | None = None, yearfirst: bool | None = None, exact: bool = True, @@ -415,27 +417,19 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) require_iso8601 = False - if infer_datetime_format and format is None: + if format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) + # There is a special fast-path for iso8601 formatted + # datetime strings, so in those cases don't use the inferred + # format because this path makes process slower in this + # special case + if format is not None and format_is_iso(format): + require_iso8601 = True + format = None if format is not None: - # There is a special fast-path for iso8601 formatted - # datetime strings, so in those cases don't use the inferred - # format because this path makes process slower in this - # special case - format_is_iso8601 = format_is_iso(format) - if format_is_iso8601: - require_iso8601 = not infer_datetime_format - format = None - - if format is not None: - res = _to_datetime_with_format( - 
arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format - ) - if res is not None: - return res + return _to_datetime_with_format(arg, orig_arg, name, tz, format, exact, errors) - assert format is None or infer_datetime_format utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( arg, @@ -464,8 +458,7 @@ def _array_strptime_with_fallback( fmt: str, exact: bool, errors: str, - infer_datetime_format: bool, -) -> Index | None: +) -> Index: """ Call array_strptime, with fallback behavior depending on 'errors'. """ @@ -486,18 +479,14 @@ def _array_strptime_with_fallback( # if fmt was inferred, try falling back # to array_to_datetime - terminate here # for specified formats - if not infer_datetime_format: - if errors == "raise": - raise - elif errors == "coerce": - result = np.empty(arg.shape, dtype="M8[ns]") - iresult = result.view("i8") - iresult.fill(iNaT) - else: - result = arg + if errors == "raise": + raise + elif errors == "coerce": + result = np.empty(arg.shape, dtype="M8[ns]") + iresult = result.view("i8") + iresult.fill(iNaT) else: - # Indicates to the caller to fallback to objects_to_datetime64ns - return None + result = arg else: if "%Z" in fmt or "%z" in fmt: return _return_parsed_timezone_results(result, timezones, tz, name) @@ -513,10 +502,9 @@ def _to_datetime_with_format( fmt: str, exact: bool, errors: str, - infer_datetime_format: bool, -) -> Index | None: +) -> Index: """ - Try parsing with the given format, returning None on failure. + Try parsing with the given format. 
""" result = None @@ -537,9 +525,7 @@ def _to_datetime_with_format( return _box_as_indexlike(result, utc=utc, name=name) # fallback - res = _array_strptime_with_fallback( - arg, name, tz, fmt, exact, errors, infer_datetime_format - ) + res = _array_strptime_with_fallback(arg, name, tz, fmt, exact, errors) return res @@ -713,7 +699,7 @@ def to_datetime( format: str | None = None, exact: bool = True, unit: str | None = None, - infer_datetime_format: bool = False, + infer_datetime_format: lib.NoDefault | bool = lib.no_default, origin: str = "unix", cache: bool = True, ) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None: @@ -926,24 +912,6 @@ def to_datetime( 1 2016-03-05 dtype: datetime64[ns] - Passing ``infer_datetime_format=True`` can often-times speedup a parsing - if its not an ISO8601 format exactly, but in a regular format. - - >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000) - >>> s.head() - 0 3/11/2000 - 1 3/12/2000 - 2 3/13/2000 - 3 3/11/2000 - 4 3/12/2000 - dtype: object - - >>> %timeit pd.to_datetime(s, infer_datetime_format=True) # doctest: +SKIP - 100 loops, best of 3: 10.4 ms per loop - - >>> %timeit pd.to_datetime(s, infer_datetime_format=False) # doctest: +SKIP - 1 loop, best of 3: 471 ms per loop - Using a unix epoch time >>> pd.to_datetime(1490195805, unit='s') @@ -1060,6 +1028,15 @@ def to_datetime( '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) """ + if infer_datetime_format is not lib.no_default: + warnings.warn( + "The argument 'infer_datetime_format' is deprecated and will " + "be removed in a future version. " + "A strict version of it is now the default, see " + "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. 
" + "You can safely remove this argument.", + stacklevel=find_stack_level(), + ) if arg is None: return None @@ -1075,7 +1052,6 @@ def to_datetime( yearfirst=yearfirst, errors=errors, exact=exact, - infer_datetime_format=infer_datetime_format, ) result: Timestamp | NaTType | Series | Index diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 45f6469a31f4f..5080c15153ced 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -122,13 +122,11 @@ def __init__(self, kwds) -> None: self.true_values = kwds.get("true_values") self.false_values = kwds.get("false_values") self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True) - self.infer_datetime_format = kwds.pop("infer_datetime_format", False) self.cache_dates = kwds.pop("cache_dates", True) self._date_conv = _make_date_converter( date_parser=self.date_parser, dayfirst=self.dayfirst, - infer_datetime_format=self.infer_datetime_format, cache_dates=self.cache_dates, ) @@ -1105,7 +1103,6 @@ def _get_empty_meta( def _make_date_converter( date_parser=None, dayfirst: bool = False, - infer_datetime_format: bool = False, cache_dates: bool = True, ): def converter(*date_cols): @@ -1118,7 +1115,6 @@ def converter(*date_cols): utc=None, dayfirst=dayfirst, errors="ignore", - infer_datetime_format=infer_datetime_format, cache=cache_dates, ).to_numpy() @@ -1188,7 +1184,6 @@ def converter(*date_cols): "squeeze": None, "compression": None, "mangle_dupe_cols": True, - "infer_datetime_format": False, "skip_blank_lines": True, "encoding_errors": "strict", "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR, diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c1698c68ce465..6ed73bd1de1e8 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -262,11 +262,6 @@ :ref:`io.csv.mixed_timezones` for more. Note: A fast-path exists for iso8601-formatted dates. 
-infer_datetime_format : bool, default False - If True and `parse_dates` is enabled, pandas will attempt to infer the - format of the datetime strings in the columns, and if it can be inferred, - switch to a faster method of parsing them. In some cases this can increase - the parsing speed by 5-10x. keep_date_col : bool, default False If True and `parse_dates` specifies combining multiple columns then keep the original columns. @@ -483,7 +478,6 @@ "decimal", "iterator", "dayfirst", - "infer_datetime_format", "verbose", "skipinitialspace", "low_memory", @@ -648,7 +642,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -709,7 +703,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -770,7 +764,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -831,7 +825,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -905,7 +899,7 @@ def read_csv( skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, - infer_datetime_format: bool = False, + infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = 
False, date_parser=None, dayfirst: bool = False, @@ -940,6 +934,15 @@ def read_csv( storage_options: StorageOptions = None, use_nullable_dtypes: bool = False, ) -> DataFrame | TextFileReader: + if infer_datetime_format is not lib.no_default: + warnings.warn( + "The argument 'infer_datetime_format' is deprecated and will " + "be removed in a future version. " + "A strict version of it is now the default, see " + "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " + "You can safely remove this argument.", + stacklevel=find_stack_level(), + ) # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -992,7 +995,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -1053,7 +1056,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -1114,7 +1117,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -1175,7 +1178,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -1249,7 +1252,7 @@ def read_table( skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] = False, - infer_datetime_format: bool = False, 
+ infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = False, date_parser=None, dayfirst: bool = False, @@ -1883,10 +1886,6 @@ def TextParser(*args, **kwds) -> TextFileReader: Encoding to use for UTF when reading/writing (ex. 'utf-8') squeeze : bool, default False returns Series if only one column. - infer_datetime_format: bool, default False - If True and `parse_dates` is True for a column, try to infer the - datetime format based on the first datetime string. If the format - can be inferred, there often will be a large parsing speed-up. float_precision : str, optional Specifies which converter the C engine should use for floating-point values. The options are `None` or `high` for the ordinary converter, From ab78002b251961f393a76f542e23582d72a6d309 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 08:43:02 +0100 Subject: [PATCH 02/12] :rotating_light: add warning about dayfirst --- pandas/_libs/tslibs/parsing.pyx | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 5c93edfee79f2..74de3502b73de 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1088,6 +1088,7 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: # rebuild string, capturing any inferred padding dt_str = ''.join(tokens) if parsed_datetime.strftime(guessed_format) == dt_str: + _maybe_warn_about_dayfirst(guessed_format, dayfirst) return guessed_format else: return None @@ -1106,6 +1107,26 @@ cdef str _fill_token(token: str, padding: int): token_filled = f'{seconds}.{nanoseconds}' return token_filled +cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst): + """Warn if guessed datetime format doesn't respect dayfirst argument.""" + cdef: + int day_index = format.find('%d') + int month_index = format.find('%m') + + if (day_index != -1) and (month_index != -1): + if (day_index > month_index) and 
dayfirst: + warnings.warn( + f"Parsing dates in {format} format when dayfirst=True was specified. " + f"Pass `dayfirst=False` or specify a format to silence this warning.", + stacklevel=find_stack_level(), + ) + if (day_index < month_index) and not dayfirst: + warnings.warn( + f"Parsing dates in {format} format when dayfirst=False was specified. " + f"Pass `dayfirst=True` or specify a format to silence this warning.", + stacklevel=find_stack_level(), + ) + @cython.wraparound(False) @cython.boundscheck(False) cdef inline object convert_to_unicode(object item, bint keep_trivial_numbers): From d1cdfd29076044beee8d2dac22dd0a5af7678129 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 08:48:37 +0100 Subject: [PATCH 03/12] :white_check_mark: add/update tests --- pandas/tests/apply/test_frame_apply.py | 3 +- pandas/tests/frame/methods/test_drop.py | 10 +- pandas/tests/frame/methods/test_to_csv.py | 10 +- .../indexes/datetimes/test_constructors.py | 12 +- pandas/tests/indexes/test_base.py | 12 +- .../io/parser/common/test_common_basic.py | 4 +- pandas/tests/io/parser/test_parse_dates.py | 101 ++++-------- .../io/parser/usecols/test_parse_dates.py | 8 +- pandas/tests/io/test_sql.py | 4 +- pandas/tests/io/xml/test_xml_dtypes.py | 2 +- pandas/tests/plotting/test_converter.py | 4 +- pandas/tests/series/methods/test_to_csv.py | 6 +- pandas/tests/tools/test_to_datetime.py | 148 +++++++----------- pandas/tests/tslibs/test_parsing.py | 18 +-- 14 files changed, 138 insertions(+), 204 deletions(-) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3bcb7d964fad1..28a9871b76985 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -836,7 +836,8 @@ def test_with_dictlike_columns_with_datetime(): df["author"] = ["X", "Y", "Z"] df["publisher"] = ["BBC", "NBC", "N24"] df["date"] = pd.to_datetime( - ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"] + 
["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"], + dayfirst=True, ) result = df.apply(lambda x: {}, axis=1) expected = Series([{}, {}, {}]) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 6e5b97af7c297..1b295fd10c9d5 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -405,11 +405,11 @@ def test_drop_level_nonunique_datetime(self): idx = Index([2, 3, 4, 4, 5], name="id") idxdt = pd.to_datetime( [ - "201603231400", - "201603231500", - "201603231600", - "201603231600", - "201603231700", + "2016-03-23 14:00", + "2016-03-23 15:00", + "2016-03-23 16:00", + "2016-03-23 16:00", + "2016-03-23 17:00", ] ) df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 1933278efb443..3b4dec8bff7f1 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -27,7 +27,7 @@ class TestDataFrameToCSV: def read_csv(self, path, **kwargs): - params = {"index_col": 0, "parse_dates": True} + params = {"index_col": 0} params.update(**kwargs) return read_csv(path, **params) @@ -46,17 +46,17 @@ def test_to_csv_from_csv1(self, float_frame, datetime_frame): # freq does not roundtrip datetime_frame.index = datetime_frame.index._with_freq(None) datetime_frame.to_csv(path) - recons = self.read_csv(path) + recons = self.read_csv(path, parse_dates=True) tm.assert_frame_equal(datetime_frame, recons) datetime_frame.to_csv(path, index_label="index") - recons = self.read_csv(path, index_col=None) + recons = self.read_csv(path, index_col=None, parse_dates=True) assert len(recons.columns) == len(datetime_frame.columns) + 1 # no index datetime_frame.to_csv(path, index=False) - recons = self.read_csv(path, index_col=None) + recons = self.read_csv(path, index_col=None, parse_dates=True) 
tm.assert_almost_equal(datetime_frame.values, recons.values) # corner case @@ -1056,7 +1056,7 @@ def test_to_csv_date_format(self, datetime_frame): # test NaTs nat_index = to_datetime( - ["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"] + ["NaT"] * 10 + ["2000-01-01", "2000-01-01", "2000-01-01"] ) nat_frame = DataFrame({"A": nat_index}, index=nat_index) nat_frame.to_csv(path, date_format="%Y-%m-%d") diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 9914f4357cee4..c1039728f5b5e 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1042,10 +1042,18 @@ def test_datetimeindex_constructor_misc(self): arr = np.array(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"], dtype="O") idx4 = DatetimeIndex(arr) - arr = to_datetime(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"]) + # Can't be parsed consistently, need to parse each element individually + arr = [ + to_datetime(date_string) + for date_string in ["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"] + ] idx5 = DatetimeIndex(arr) - arr = to_datetime(["1/1/2005", "1/2/2005", "Jan 3, 2005", "2005-01-04"]) + # Can't be parsed consistently, need to parse each element individually + arr = [ + to_datetime(date_string) + for date_string in ["1/1/2005", "1/2/2005", "Jan 3, 2005", "2005-01-04"] + ] idx6 = DatetimeIndex(arr) idx7 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index ac76953c66a24..512ce164f40f4 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1185,10 +1185,16 @@ def test_equals_op_index_vs_mi_same_length(self): expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("dt_conv", [pd.to_datetime, pd.to_timedelta]) - def test_dt_conversion_preserves_name(self, dt_conv): + 
@pytest.mark.parametrize( + "dt_conv, arg", + [ + (pd.to_datetime, ["2000-01-01", "2000-01-02"]), + (pd.to_timedelta, ["01:02:03", "01:02:04"]), + ], + ) + def test_dt_conversion_preserves_name(self, dt_conv, arg): # GH 10875 - index = Index(["01:02:03", "01:02:04"], name="label") + index = Index(arg, name="label") assert index.name == dt_conv(index).name def test_cached_properties_not_settable(self): diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 359b059252556..de45b8e9564d0 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -58,8 +58,8 @@ def _set_noconvert_columns(self): return CParserWrapper._set_noconvert_columns(self) data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" +0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" parse_dates = [[1, 2]] cols = { diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 9c8809b6099f9..b8d515a67b7fe 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1666,9 +1666,9 @@ def test_parse_delimited_date_swap_no_warning( @pytest.mark.parametrize( "date_string,dayfirst,expected", [ - # %d/%m/%Y; month > 12 thus replacement + # %d/%m/%Y; month > 12 ("13/02/2019", False, datetime(2019, 2, 13)), - # %m/%d/%Y; day > 12 thus there will be no replacement + # %m/%d/%Y; day > 12 ("02/13/2019", True, datetime(2019, 2, 13)), ], ) @@ -1677,7 +1677,10 @@ def test_parse_delimited_date_swap_with_warning( ): parser = all_parsers expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") - warning_msg = "Specify a format to ensure consistent parsing" + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." 
+ ) result = parser.read_csv_check_warnings( UserWarning, warning_msg, @@ -1691,13 +1694,11 @@ def test_parse_delimited_date_swap_with_warning( def test_parse_multiple_delimited_dates_with_swap_warnings(): # GH46210 - warning_msg = "Specify a format to ensure consistent parsing" - with tm.assert_produces_warning(UserWarning, match=warning_msg) as record: + with pytest.raises( + ValueError, + match=r"^time data '31/05/2000' does not match format '%m/%d/%Y' \(match\)$", + ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) - assert len({str(warning.message) for warning in record}) == 1 - # Using set(record) as repetitions of the same warning are suppressed - # https://docs.python.org/3/library/warnings.html - # and here we care to check that the warning is only shows once to users. def _helper_hypothesis_delimited_date(call, date_string, **kwargs): @@ -1860,97 +1861,51 @@ def test_parse_dates_and_keep_orgin_column(all_parsers): def test_dayfirst_warnings(): # GH 12585 - warning_msg_day_first = ( - r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) was " - r"specified. This may lead to inconsistently parsed dates! Specify a format " - r"to ensure consistent parsing." - ) - warning_msg_month_first = ( - "Parsing dates in MM/DD/YYYY format when dayfirst=True was " - "specified. This may lead to inconsistently parsed dates! Specify a format " - "to ensure consistent parsing." - ) # CASE 1: valid input input = "date\n31/12/2014\n10/03/2011" - expected_consistent = DatetimeIndex( + expected = DatetimeIndex( ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" ) - expected_inconsistent = DatetimeIndex( - ["2014-12-31", "2011-10-03"], dtype="datetime64[ns]", freq=None, name="date" + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." ) # A. 
dayfirst arg correct, no warning res1 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" ).index - tm.assert_index_equal(expected_consistent, res1) + tm.assert_index_equal(expected, res1) - # B. dayfirst arg incorrect, warning + incorrect output - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + # B. dayfirst arg incorrect, warning + with tm.assert_produces_warning(UserWarning, match=warning_msg): res2 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" ).index - tm.assert_index_equal(expected_inconsistent, res2) - - # C. dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res3 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" - ).index - tm.assert_index_equal(expected_inconsistent, res3) - - # D. infer_datetime_format=True overrides dayfirst default - # no warning + correct result - res4 = read_csv( - StringIO(input), - parse_dates=["date"], - infer_datetime_format=True, - index_col="date", - ).index - tm.assert_index_equal(expected_consistent, res4) + tm.assert_index_equal(expected, res2) # CASE 2: invalid input # cannot consistently process with single format - # warnings *always* raised + # return to user unaltered # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" - expected = DatetimeIndex( - ["2014-12-31", "2011-03-30"], dtype="datetime64[ns]", freq=None, name="date" - ) + expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") # A. use dayfirst=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_month_first): - res5 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" - ).index + res5 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" + ).index tm.assert_index_equal(expected, res5) # B. 
use dayfirst=False - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + with tm.assert_produces_warning(UserWarning, match=warning_msg): res6 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" ).index tm.assert_index_equal(expected, res6) - # C. use dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res7 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" - ).index - tm.assert_index_equal(expected, res7) - - # D. use infer_datetime_format=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res8 = read_csv( - StringIO(input), - parse_dates=["date"], - infer_datetime_format=True, - index_col="date", - ).index - tm.assert_index_equal(expected, res8) - @pytest.mark.parametrize( "date_string, dayfirst", @@ -1973,9 +1928,11 @@ def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): expected = DatetimeIndex( ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" ) - with tm.assert_produces_warning( - UserWarning, match=r"may lead to inconsistently parsed dates" - ): + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." 
+ ) + with tm.assert_produces_warning(UserWarning, match=warning_msg): res = read_csv( StringIO(initial_value), parse_dates=["date"], diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 50000dab8a7aa..6d40435a4107e 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -31,8 +31,8 @@ def test_usecols_with_parse_dates(all_parsers, usecols): # see gh-9755 data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" +0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" parser = all_parsers parse_dates = [[1, 2]] @@ -138,8 +138,8 @@ def test_usecols_with_parse_dates4(all_parsers): ) def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): # see gh-9755 - s = """0,1,20140101,0900,4 -0,1,20140102,1000,4""" + s = """0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" parse_dates = [[1, 2]] parser = all_parsers diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9adada8afb2c2..129d6f89fd019 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1386,7 +1386,7 @@ def test_sqlalchemy_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) df = DataFrame( - {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} ) db = sql.SQLDatabase(self.conn) table = sql.SQLTable("test_type", db, frame=df) @@ -1595,7 +1595,7 @@ def test_sqlite_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) df = DataFrame( - {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} ) db = sql.SQLiteDatabase(self.conn) table = sql.SQLiteTable("test_type", db, frame=df) diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py 
index 5629830767c3c..7b2ffbc7cda5e 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -457,7 +457,7 @@ def test_day_first_parse_dates(parser): ) with tm.assert_produces_warning( - UserWarning, match="Parsing dates in DD/MM/YYYY format" + UserWarning, match="Parsing dates in %d/%m/%Y format" ): df_result = read_xml(xml, parse_dates=["date"], parser=parser) df_iter = read_xml_iterparse( diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 9a6fed1afad1f..87d5aaf0c3205 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -161,8 +161,8 @@ def dtc(self): return converter.DatetimeConverter() def test_convert_accepts_unicode(self, dtc): - r1 = dtc.convert("12:22", None, None) - r2 = dtc.convert("12:22", None, None) + r1 = dtc.convert("2000-01-01 12:22", None, None) + r2 = dtc.convert("2000-01-01 12:22", None, None) assert r1 == r2, "DatetimeConverter.convert should accept unicode" def test_conversion(self, dtc): diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 28519fc9b529f..7827483644634 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -13,7 +13,7 @@ class TestSeriesToCSV: def read_csv(self, path, **kwargs): - params = {"index_col": 0, "header": None, "parse_dates": True} + params = {"index_col": 0, "header": None} params.update(**kwargs) header = params.get("header") @@ -30,7 +30,7 @@ def test_from_csv(self, datetime_series, string_series): with tm.ensure_clean() as path: datetime_series.to_csv(path, header=False) - ts = self.read_csv(path) + ts = self.read_csv(path, parse_dates=True) tm.assert_series_equal(datetime_series, ts, check_names=False) assert ts.name is None @@ -55,7 +55,7 @@ def test_from_csv(self, datetime_series, string_series): with open(path, "w") as outfile: 
outfile.write("1998-01-01|1.0\n1999-01-01|2.0") - series = self.read_csv(path, sep="|") + series = self.read_csv(path, sep="|", parse_dates=True) check_series = Series( {datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0} ) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f524bc18793d8..286036440073f 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -219,7 +219,6 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected): ), (["201010", pd.NA], "%y%m%d", DatetimeIndex(["2020-10-10", "NaT"])), (["201010", pd.NA], "%d%m%y", DatetimeIndex(["2010-10-20", "NaT"])), - (["201010", pd.NA], None, DatetimeIndex(["2010-10-20", "NaT"])), ([None, np.nan, pd.NA], None, DatetimeIndex(["NaT", "NaT", "NaT"])), ([None, np.nan, pd.NA], "%Y%m%d", DatetimeIndex(["NaT", "NaT", "NaT"])), ], @@ -463,14 +462,14 @@ def test_to_datetime_parse_timezone_keeps_name(self): class TestToDatetime: def test_to_datetime_mixed_datetime_and_string(self): # GH#47018 adapted old doctest with new behavior - d1 = datetime(2020, 1, 1, 17, tzinfo=timezone(-timedelta(hours=1))) d2 = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) - res = to_datetime(["2020-01-01 17:00 -0100", d2]) - expected = to_datetime([d1, d2]).tz_convert(pytz.FixedOffset(-60)) - tm.assert_index_equal(res, expected) + with pytest.raises( + ValueError, + match=r"time data '.*' does not match format '%Y-%m-%d %H:%M %z' \(match\)", + ): + to_datetime(["2020-01-01 17:00 -0100", d2]) - @pytest.mark.parametrize("infer_datetime_format", [True, False]) - def test_to_datetime_np_str(self, infer_datetime_format): + def test_to_datetime_np_str(self): # GH#32264 # GH#48969 value = np.str_("2019-02-04 10:18:46.297000+0000") @@ -482,11 +481,11 @@ def test_to_datetime_np_str(self, infer_datetime_format): assert to_datetime(value) == exp assert to_datetime(ser.iloc[0]) == exp - res = to_datetime([value], 
infer_datetime_format=infer_datetime_format) + res = to_datetime([value]) expected = Index([exp]) tm.assert_index_equal(res, expected) - res = to_datetime(ser, infer_datetime_format=infer_datetime_format) + res = to_datetime(ser) expected = Series(expected) tm.assert_series_equal(res, expected) @@ -927,7 +926,10 @@ def test_datetime_bool_arrays_mixed(self, cache): msg = f"{type(cache)} is not convertible to datetime" with pytest.raises(TypeError, match=msg): to_datetime([False, datetime.today()], cache=cache) - with pytest.raises(TypeError, match=msg): + with pytest.raises( + ValueError, + match=r"^time data 'True' does not match format '%Y%m%d' \(match\)$", + ): to_datetime(["20130101", True], cache=cache) tm.assert_index_equal( to_datetime([0, False, NaT, 0.0], errors="coerce", cache=cache), @@ -1071,8 +1073,7 @@ def test_to_datetime_cache_scalar(self): (None,) + (NaT,) * start_caching_at + ("2012 July 26", Timestamp("2012-07-26")), - (NaT,) * (start_caching_at + 1) - + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), + (NaT,) * (start_caching_at + 1) + (Timestamp("2012-07-26"), NaT), ), ), ) @@ -1153,7 +1154,6 @@ def test_to_datetime_coerce(self): ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("infer_datetime_format", [True, False]) @pytest.mark.parametrize( "errors, expected", [ @@ -1224,15 +1224,18 @@ def test_iso_8601_strings_with_different_offsets_utc(self): def test_iso8601_strings_mixed_offsets_with_naive(self): # GH 24992 - result = to_datetime( + # Can't parse consistently, need to parse each element in loop. 
+ result = DatetimeIndex( [ - "2018-11-28T00:00:00", - "2018-11-28T00:00:00+12:00", - "2018-11-28T00:00:00", - "2018-11-28T00:00:00+06:00", - "2018-11-28T00:00:00", - ], - utc=True, + to_datetime(string, utc=True) + for string in [ + "2018-11-28T00:00:00", + "2018-11-28T00:00:00+12:00", + "2018-11-28T00:00:00", + "2018-11-28T00:00:00+06:00", + "2018-11-28T00:00:00", + ] + ] ) expected = to_datetime( [ @@ -1248,9 +1251,10 @@ def test_iso8601_strings_mixed_offsets_with_naive(self): def test_iso8601_strings_mixed_offsets_with_naive_reversed(self): items = ["2018-11-28T00:00:00+12:00", "2018-11-28T00:00:00"] - result = to_datetime(items, utc=True) - expected = to_datetime(list(reversed(items)), utc=True)[::-1] - tm.assert_index_equal(result, expected) + # Can't parse consistently, need to parse each element in loop. + result = [to_datetime(item, utc=True) for item in items] + expected = [to_datetime(item, utc=True) for item in list(reversed(items))][::-1] + assert result == expected def test_mixed_offsets_with_native_datetime_raises(self): # GH 25978 @@ -1778,7 +1782,7 @@ def test_to_datetime_on_datetime64_series(self, cache): def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) - msg = r"(\(')?String does not contain a date(:', ' '\))?" 
+ msg = r"^time data ' ' does not match format '%m/%d/%Y' \(match\)$" with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) @@ -1838,7 +1842,7 @@ def test_to_datetime_strings(self, cache): def test_to_datetime_strings_variation(self, cache): array = ["2012", "20120101", "20120101 12:01:01"] - expected = list(to_datetime(array, cache=cache)) + expected = [to_datetime(dt_str, cache=cache) for dt_str in array] result = [Timestamp(date_str) for date_str in array] tm.assert_almost_equal(result, expected) @@ -1908,7 +1912,10 @@ def test_string_na_nat_conversion(self, cache): result = tslib.array_to_datetime(strings)[0] tm.assert_almost_equal(result, expected) - result2 = to_datetime(strings, cache=cache) + # Can't parse in consistent format, so need to convert each individually. + result2 = DatetimeIndex( + [to_datetime(string, cache=cache) for string in strings] + ) assert isinstance(result2, DatetimeIndex) tm.assert_numpy_array_equal(result, result2.values) @@ -2011,80 +2018,39 @@ def test_dayfirst(self, cache): def test_dayfirst_warnings_valid_input(self): # GH 12585 - warning_msg_day_first = ( - r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) " - "was specified. This may lead to inconsistently parsed dates! Specify a " - "format to ensure consistent parsing." + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." ) # CASE 1: valid input arr = ["31/12/2014", "10/03/2011"] - expected_consistent = DatetimeIndex( + expected = DatetimeIndex( ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None ) - expected_inconsistent = DatetimeIndex( - ["2014-12-31", "2011-10-03"], dtype="datetime64[ns]", freq=None - ) # A. 
dayfirst arg correct, no warning res1 = to_datetime(arr, dayfirst=True) - tm.assert_index_equal(expected_consistent, res1) + tm.assert_index_equal(expected, res1) - # B. dayfirst arg incorrect, warning + incorrect output - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + # B. dayfirst arg incorrect, warning + with tm.assert_produces_warning(UserWarning, match=warning_msg): res2 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected_inconsistent, res2) - - # C. dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res3 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected_inconsistent, res3) - - # D. infer_datetime_format=True overrides dayfirst default - # no warning + correct result - res4 = to_datetime(arr, infer_datetime_format=True) - tm.assert_index_equal(expected_consistent, res4) + tm.assert_index_equal(expected, res2) def test_dayfirst_warnings_invalid_input(self): # CASE 2: invalid input # cannot consistently process with single format - # warnings *always* raised - warning_msg_day_first = ( - r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) " - "was specified. This may lead to inconsistently parsed dates! Specify a " - "format to ensure consistent parsing." - ) - warning_msg_month_first = ( - r"Parsing dates in MM/DD/YYYY format when dayfirst=True " - "was specified. This may lead to inconsistently parsed dates! Specify a " - "format to ensure consistent parsing." - ) + # ValueError *always* raised - arr = ["31/12/2014", "03/30/2011"] # first in DD/MM/YYYY, second in MM/DD/YYYY - expected = DatetimeIndex( - ["2014-12-31", "2011-03-30"], dtype="datetime64[ns]", freq=None - ) - - # A. use dayfirst=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_month_first): - res5 = to_datetime(arr, dayfirst=True) - tm.assert_index_equal(expected, res5) - - # B. 
use dayfirst=False - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res6 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected, res6) - - # C. use dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res7 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected, res7) + arr = ["31/12/2014", "03/30/2011"] - # D. use infer_datetime_format=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res8 = to_datetime(arr, infer_datetime_format=True) - tm.assert_index_equal(expected, res8) + with pytest.raises( + ValueError, + match=r"time data '03/30/2011' does not match format '%d/%m/%Y' \(match\)$", + ): + to_datetime(arr, dayfirst=True) @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray]) def test_to_datetime_dta_tz(self, klass): @@ -2139,12 +2105,8 @@ def test_to_datetime_infer_datetime_format_consistent_format( s_as_dt_strings = ser.apply(lambda x: x.strftime(test_format)) with_format = to_datetime(s_as_dt_strings, format=test_format, cache=cache) - no_infer = to_datetime( - s_as_dt_strings, infer_datetime_format=False, cache=cache - ) - yes_infer = to_datetime( - s_as_dt_strings, infer_datetime_format=True, cache=cache - ) + no_infer = to_datetime(s_as_dt_strings, cache=cache) + yes_infer = to_datetime(s_as_dt_strings, cache=cache) # Whether the format is explicitly passed, it is inferred, or # it is not inferred, the results should all be the same @@ -2223,7 +2185,7 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): def test_infer_datetime_format_zero_tz(self, ts, zero_tz): # GH 41047 ser = Series([ts + zero_tz]) - result = to_datetime(ser, infer_datetime_format=True) + result = to_datetime(ser) tz = pytz.utc if zero_tz == "Z" else None expected = Series([Timestamp(ts, tz=tz)]) tm.assert_series_equal(result, expected) @@ -2782,9 +2744,9 @@ def test_empty_string_datetime_coerce_format(): with 
pytest.raises(ValueError, match="does not match format"): to_datetime(td, format=format, errors="raise") - # don't raise an exception in case no format is given - result = to_datetime(td, errors="raise") - tm.assert_series_equal(result, expected) + # still raise an exception in case no format is given + with pytest.raises(ValueError, match="does not match format"): + to_datetime(td, errors="raise") def test_empty_string_datetime_coerce__unit(): diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index a4e12315d34e0..49d83a8fa5c56 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -235,19 +235,19 @@ def test_guess_datetime_format_wrong_type_inputs(invalid_type_dt): @pytest.mark.parametrize( - "string,fmt", + "string,fmt,dayfirst", [ - ("2011-1-1", "%Y-%m-%d"), - ("1/1/2011", "%m/%d/%Y"), - ("30-1-2011", "%d-%m-%Y"), - ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"), - ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S"), - ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S"), + ("2011-1-1", "%Y-%m-%d", False), + ("1/1/2011", "%m/%d/%Y", False), + ("30-1-2011", "%d-%m-%Y", True), + ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S", False), + ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S", False), + ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S", False), ], ) -def test_guess_datetime_format_no_padding(string, fmt): +def test_guess_datetime_format_no_padding(string, fmt, dayfirst): # see gh-11142 - result = parsing.guess_datetime_format(string) + result = parsing.guess_datetime_format(string, dayfirst=dayfirst) assert result == fmt From 632ea9d73fb67c27d98d97c080f7c5e6c35c9919 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 09:16:00 +0100 Subject: [PATCH 04/12] :rotating_light: add warning if format cant be guessed --- pandas/core/tools/datetimes.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 
5760952ba7324..09729c2aab22c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -132,7 +132,16 @@ def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str if (first_non_null := tslib.first_non_null(arr)) != -1: if type(first_non_nan_element := arr[first_non_null]) is str: # GH#32264 np.str_ object - return guess_datetime_format(first_non_nan_element, dayfirst=dayfirst) + guessed_format = guess_datetime_format( + first_non_nan_element, dayfirst=dayfirst + ) + if guessed_format is not None: + return guessed_format + warnings.warn( + "Could not infer format - " + "to ensure consistent parsing, specify a format.", + stacklevel=find_stack_level(), + ) return None From dadb44b9e47e64db09b443d5388550cfddfc981a Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 09:22:00 +0100 Subject: [PATCH 05/12] :goal_net: catch warnings --- pandas/core/tools/datetimes.py | 2 +- pandas/tests/frame/methods/test_to_csv.py | 5 +- pandas/tests/groupby/test_function.py | 3 +- .../tests/groupby/transform/test_transform.py | 3 +- pandas/tests/io/excel/test_readers.py | 13 +- pandas/tests/io/parser/test_parse_dates.py | 98 ++++- .../io/parser/usecols/test_parse_dates.py | 8 +- pandas/tests/test_algos.py | 3 +- pandas/tests/tools/test_to_datetime.py | 391 +++++++++--------- 9 files changed, 309 insertions(+), 217 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 09729c2aab22c..41feb153978d4 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1002,7 +1002,7 @@ def to_datetime( are constant: >>> from datetime import datetime - >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)]) + >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)]) DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) diff --git 
a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 3b4dec8bff7f1..3985bd40daac5 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -514,7 +514,10 @@ def test_to_csv_multiindex(self, float_frame, datetime_frame): tsframe.index = MultiIndex.from_arrays(new_index) tsframe.to_csv(path, index_label=["time", "foo"]) - recons = self.read_csv(path, index_col=[0, 1]) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + recons = self.read_csv(path, index_col=[0, 1], parse_dates=True) # TODO to_csv drops column name tm.assert_frame_equal(tsframe, recons, check_names=False) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index cdbb121819c5e..ed63d41a74ae6 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -717,7 +717,8 @@ def test_max_nan_bug(): -05-06,2013-05-06 00:00:00,,log.log -05-07,2013-05-07 00:00:00,OE,xlsx""" - df = pd.read_csv(StringIO(raw), parse_dates=[0]) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + df = pd.read_csv(StringIO(raw), parse_dates=[0]) gb = df.groupby("Date") r = gb[["File"]].max() e = gb["File"].max().to_frame() diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 8a2bd64a3deb0..d52de4d0658ef 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1070,7 +1070,8 @@ def demean_rename(x): @pytest.mark.parametrize("func", [min, max, np.min, np.max, "first", "last"]) def test_groupby_transform_timezone_column(func): # GH 24198 - ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") result = DataFrame({"end_time": 
[ts], "id": [1]}) result["max_end_time"] = result.groupby("id").end_time.transform(func) expected = DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"]) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index fa1d6bbfd5a7e..8f937ad6b401a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -888,11 +888,18 @@ def test_reader_seconds(self, request, engine, read_ext): ] } ) - - actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1") + if engine == "odf": + # odf recognises cell type as time (from its attribute) + # so tries to parse it. + warning = UserWarning + else: + warning = None + with tm.assert_produces_warning(warning, match="Could not infer format"): + actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) - actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") + with tm.assert_produces_warning(warning, match="Could not infer format"): + actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex(self, request, read_ext): diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index b8d515a67b7fe..c3feb03936686 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -826,7 +826,13 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): 090331,0830,5,6 """ parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=0, parse_dates=parse_dates) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + index_col=0, + parse_dates=parse_dates, + ) index = DatetimeIndex( [ datetime(2009, 1, 31, 0, 10, 0), @@ -899,7 +905,13 @@ def test_multi_index_parse_dates(all_parsers, index_col): columns=["A", "B", "C"], index=index, ) - result = 
parser.read_csv(StringIO(data), index_col=index_col, parse_dates=True) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + index_col=index_col, + parse_dates=True, + ) tm.assert_frame_equal(result, expected) @@ -1232,19 +1244,55 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): @pytest.mark.parametrize("cache_dates", [True, False]) -@pytest.mark.parametrize("value", ["nan", "0", ""]) +@pytest.mark.parametrize("value", ["nan", ""]) def test_bad_date_parse(all_parsers, cache_dates, value): # if we have an invalid date make sure that we handle this with # and w/o the cache properly parser = all_parsers s = StringIO((f"{value},\n") * 50000) - parser.read_csv( + if parser.engine == "pyarrow": + # None in input gets converted to 'None', for which + # pandas tries to guess the datetime format, triggering + # the warning. TODO: parse dates directly in pyarrow, see + # https://github.com/pandas-dev/pandas/issues/48017 + warn = UserWarning + else: + warn = None + parser.read_csv_check_warnings( + warn, + "Could not infer format", + s, + header=None, + names=["foo", "bar"], + parse_dates=["foo"], + cache_dates=cache_dates, + ) + + +@pytest.mark.parametrize("cache_dates", [True, False]) +@pytest.mark.parametrize("value", ["0"]) +def test_bad_date_parse_with_warning(all_parsers, cache_dates, value): + # if we have an invalid date make sure that we handle this with + # and w/o the cache properly. 
+ parser = all_parsers + s = StringIO((f"{value},\n") * 50000) + + if parser.engine == "pyarrow": + # pyarrow reads "0" as 0 (of type int64), and so + # pandas doesn't try to guess the datetime format + # TODO: parse dates directly in pyarrow, see + # https://github.com/pandas-dev/pandas/issues/48017 + warn = None + else: + warn = UserWarning + parser.read_csv_check_warnings( + warn, + "Could not infer format", s, header=None, names=["foo", "bar"], parse_dates=["foo"], - infer_datetime_format=False, cache_dates=cache_dates, ) @@ -1262,6 +1310,19 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) +def test_parse_dates_infer_datetime_format_warning(all_parsers): + # GH 49024 + parser = all_parsers + data = "Date,test\n2012-01-01,1\n,2" + parser.read_csv_check_warnings( + UserWarning, + "The argument 'infer_datetime_format' is deprecated", + StringIO(data), + parse_dates=["Date"], + infer_datetime_format=True, + ) + + @xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", @@ -1635,7 +1696,13 @@ def test_parse_timezone(all_parsers): def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers expected = DataFrame({0: [date_string]}, dtype="object") - result = parser.read_csv(StringIO(date_string), header=None, parse_dates=[0]) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(date_string), + header=None, + parse_dates=[0], + ) tm.assert_frame_equal(result, expected) @@ -1786,7 +1853,13 @@ def test_date_parser_and_names(all_parsers): # GH#33699 parser = all_parsers data = StringIO("""x,y\n1,2""") - result = parser.read_csv(data, parse_dates=["B"], names=["B"]) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + data, + parse_dates=["B"], + names=["B"], + ) expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"]) tm.assert_frame_equal(result, expected) @@ -1833,7 +1906,9 @@ def 
test_date_parser_usecols_thousands(all_parsers): """ parser = all_parsers - result = parser.read_csv( + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", StringIO(data), parse_dates=[1], usecols=[1, 2], @@ -1947,7 +2022,12 @@ def test_infer_first_column_as_index(all_parsers): # GH#11019 parser = all_parsers data = "a,b,c\n1970-01-01,2,3,4" - result = parser.read_csv(StringIO(data), parse_dates=["a"]) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + parse_dates=["a"], + ) expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 6d40435a4107e..4823df1da9959 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -124,7 +124,13 @@ def test_usecols_with_parse_dates4(all_parsers): } expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + usecols=usecols, + parse_dates=parse_dates, + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 80271c13cd35d..b3f0f40be2d78 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1212,7 +1212,8 @@ def test_value_counts_datetime_outofbounds(self): tm.assert_series_equal(res, exp) # GH 12424 - res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") exp = Series(["2362-01-01", np.nan], dtype=object) tm.assert_series_equal(res, exp) diff --git 
a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 286036440073f..a2871e79dc7d9 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -228,6 +228,13 @@ def test_to_datetime_with_NA(self, data, format, expected): result = to_datetime(data, format=format) tm.assert_index_equal(result, expected) + def test_to_datetime_with_NA_with_warning(self): + # GH#42957 + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result = to_datetime(["201010", pd.NA]) + expected = DatetimeIndex(["2010-10-20", "NaT"]) + tm.assert_index_equal(result, expected) + def test_to_datetime_format_integer(self, cache): # GH 10178 ser = Series([2000, 2001, 2002]) @@ -345,7 +352,6 @@ def test_to_datetime_with_non_exact(self, cache): ], ) def test_parse_nanoseconds_with_formula(self, cache, arg): - # GH8989 # truncating the nanoseconds when a format was provided expected = to_datetime(arg, cache=cache) @@ -619,15 +625,16 @@ def test_to_datetime_YYYYMMDD(self): def test_to_datetime_unparsable_ignore(self): # unparsable ser = "Month 1, 1999" - assert to_datetime(ser, errors="ignore") == ser + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + assert to_datetime(ser, errors="ignore") == ser @td.skip_if_windows # `tm.set_timezone` does not work in windows def test_to_datetime_now(self): # See GH#18666 with tm.set_timezone("US/Eastern"): - msg = "The parsing of 'now' in pd.to_datetime" + msg = "The parsing of 'now' in pd.to_datetime|Could not infer format" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + (FutureWarning, UserWarning), match=msg, check_stacklevel=False ): # checking stacklevel is tricky because we go through cython code # GH#18705 @@ -654,8 +661,11 @@ def test_to_datetime_today(self, tz): # so this test will not detect the regression introduced in #18666. 
with tm.set_timezone(tz): nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) - pdtoday = to_datetime("today") - pdtoday2 = to_datetime(["today"])[0] + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + pdtoday = to_datetime("today") + pdtoday2 = to_datetime(["today"])[0] tstoday = Timestamp("today") tstoday2 = Timestamp.today() @@ -672,8 +682,8 @@ def test_to_datetime_today(self, tz): @pytest.mark.parametrize("arg", ["now", "today"]) def test_to_datetime_today_now_unicode_bytes(self, arg): - warn = FutureWarning if arg == "now" else None - msg = "The parsing of 'now' in pd.to_datetime" + warn = (FutureWarning, UserWarning) if arg == "now" else UserWarning + msg = "The parsing of 'now' in pd.to_datetime|Could not infer format" with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): # checking stacklevel is tricky because we go through cython code # GH#18705 @@ -946,18 +956,17 @@ def test_datetime_invalid_datatype(self, arg): to_datetime(arg) @pytest.mark.parametrize("value", ["a", "00:01:99"]) - @pytest.mark.parametrize("infer", [True, False]) - @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) - def test_datetime_invalid_scalar(self, value, format, infer): + @pytest.mark.parametrize( + "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + ) + def test_datetime_invalid_scalar(self, value, format, warning): # GH24763 - res = to_datetime( - value, errors="ignore", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="ignore", format=format) assert res == value - res = to_datetime( - value, errors="coerce", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="coerce", format=format) assert res is NaT msg = ( @@ -966,51 +975,46 @@ def test_datetime_invalid_scalar(self, value, 
format, infer): f"Given date string {value} not likely a datetime" ) with pytest.raises(ValueError, match=msg): - to_datetime( - value, errors="raise", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + to_datetime(value, errors="raise", format=format) @pytest.mark.parametrize("value", ["3000/12/11 00:00:00"]) - @pytest.mark.parametrize("infer", [True, False]) - @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) - def test_datetime_outofbounds_scalar(self, value, format, infer): + @pytest.mark.parametrize( + "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + ) + def test_datetime_outofbounds_scalar(self, value, format, warning): # GH24763 - res = to_datetime( - value, errors="ignore", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="ignore", format=format) assert res == value - res = to_datetime( - value, errors="coerce", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="coerce", format=format) assert res is NaT if format is not None: msg = "is a bad directive in format|Out of bounds .* present at position 0" with pytest.raises(ValueError, match=msg): - to_datetime( - value, errors="raise", format=format, infer_datetime_format=infer - ) + to_datetime(value, errors="raise", format=format) else: msg = "Out of bounds .* present at position 0" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime( - value, errors="raise", format=format, infer_datetime_format=infer - ) + with pytest.raises( + OutOfBoundsDatetime, match=msg + ), tm.assert_produces_warning(warning, match="Could not infer format"): + to_datetime(value, errors="raise", format=format) @pytest.mark.parametrize("values", [["a"], ["00:01:99"], ["a", "b", "99:00:00"]]) - 
@pytest.mark.parametrize("infer", [True, False]) - @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) - def test_datetime_invalid_index(self, values, format, infer): + @pytest.mark.parametrize( + "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + ) + def test_datetime_invalid_index(self, values, format, warning): # GH24763 - res = to_datetime( - values, errors="ignore", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(values, errors="ignore", format=format) tm.assert_index_equal(res, Index(values)) - res = to_datetime( - values, errors="coerce", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(values, errors="coerce", format=format) tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values))) msg = ( @@ -1019,9 +1023,8 @@ def test_datetime_invalid_index(self, values, format, infer): "second must be in 0..59" ) with pytest.raises(ValueError, match=msg): - to_datetime( - values, errors="raise", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + to_datetime(values, errors="raise", format=format) @pytest.mark.parametrize("utc", [True, None]) @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) @@ -1161,28 +1164,28 @@ def test_to_datetime_coerce(self): ("ignore", Index(["200622-12-31", "111111-24-11"])), ], ) - def test_to_datetime_malformed_no_raise( - self, errors, expected, infer_datetime_format - ): + def test_to_datetime_malformed_no_raise(self, errors, expected): # GH 28299 # GH 48633 ts_strings = ["200622-12-31", "111111-24-11"] - result = to_datetime( - ts_strings, errors=errors, infer_datetime_format=infer_datetime_format - ) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result = to_datetime(ts_strings, errors=errors) 
tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("infer_datetime_format", [True, False]) - def test_to_datetime_malformed_raise(self, infer_datetime_format): + def test_to_datetime_malformed_raise(self): # GH 48633 ts_strings = ["200622-12-31", "111111-24-11"] with pytest.raises( ValueError, match=r"^hour must be in 0\.\.23: 111111-24-11 present at position 1$", ): - to_datetime( - ts_strings, errors="raise", infer_datetime_format=infer_datetime_format - ) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime( + ts_strings, + errors="raise", + ) def test_iso_8601_strings_with_same_offset(self): # GH 17697, 11736 @@ -1283,7 +1286,10 @@ def test_mixed_offsets_with_native_datetime_raises(self): tm.assert_series_equal(mixed, expected) with pytest.raises(ValueError, match="Tz-aware datetime.datetime"): - to_datetime(mixed) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(mixed) def test_non_iso_strings_with_tz_offset(self): result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) @@ -1409,23 +1415,26 @@ def test_unit_with_numeric(self, cache, errors, dtype): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "exp, arr", + "exp, arr, warning", [ [ ["NaT", "2015-06-19 05:33:20", "2015-05-27 22:33:20"], ["foo", 1.434692e18, 1.432766e18], + UserWarning, ], [ ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"], [1.434692e18, 1.432766e18, "foo", "NaT"], + None, ], ], ) - def test_unit_with_numeric_coerce(self, cache, exp, arr): + def test_unit_with_numeric_coerce(self, cache, exp, arr, warning): # but we want to make sure that we are coercing # if we have ints/strings expected = DatetimeIndex(exp) - result = to_datetime(arr, errors="coerce", cache=cache) + with tm.assert_produces_warning(warning, match="Could not infer format"): + result = to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) 
@pytest.mark.parametrize( @@ -1741,7 +1750,10 @@ def test_to_datetime_barely_out_of_bounds(self): msg = "Out of bounds .* present at position 0" with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(arr) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(arr) @pytest.mark.parametrize( "arg, exp_str", @@ -1925,15 +1937,22 @@ def test_string_na_nat_conversion_malformed(self, cache): # GH 10636, default is now 'raise' msg = r"Unknown string format:|day is out of range for month" with pytest.raises(ValueError, match=msg): - to_datetime(malformed, errors="raise", cache=cache) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(malformed, errors="raise", cache=cache) - result = to_datetime(malformed, errors="ignore", cache=cache) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result = to_datetime(malformed, errors="ignore", cache=cache) # GH 21864 expected = Index(malformed) tm.assert_index_equal(result, expected) with pytest.raises(ValueError, match=msg): - to_datetime(malformed, errors="raise", cache=cache) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(malformed, errors="raise", cache=cache) def test_string_na_nat_conversion_with_name(self, cache): idx = ["a", "b", "c", "d", "e"] @@ -2114,60 +2133,14 @@ def test_to_datetime_infer_datetime_format_consistent_format( tm.assert_series_equal(no_infer, yes_infer) @pytest.mark.parametrize( - "data", - [ - ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"], - ["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"], - ], + "tz_name, offset, warning", + [("UTC", 0, None), ("UTC-3", 180, UserWarning), ("UTC+3", -180, UserWarning)], ) - def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache, data): - ser = Series(np.array(data)) - - # When the format is inconsistent, infer_datetime_format should just - # 
fallback to the default parsing - tm.assert_series_equal( - to_datetime(ser, infer_datetime_format=False, cache=cache), - to_datetime(ser, infer_datetime_format=True, cache=cache), - ) - - def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): - ser = Series( - np.array( - ["01/01/2011 00:00:00", np.nan, "01/03/2011 00:00:00", np.nan], - dtype=object, - ) - ) - tm.assert_series_equal( - to_datetime(ser, infer_datetime_format=False, cache=cache), - to_datetime(ser, infer_datetime_format=True, cache=cache), - ) - - def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): - ser = Series( - np.array( - [ - np.nan, - np.nan, - "01/01/2011 00:00:00", - "01/02/2011 00:00:00", - "01/03/2011 00:00:00", - ], - dtype=object, - ) - ) - - tm.assert_series_equal( - to_datetime(ser, infer_datetime_format=False, cache=cache), - to_datetime(ser, infer_datetime_format=True, cache=cache), - ) - - @pytest.mark.parametrize( - "tz_name, offset", [("UTC", 0), ("UTC-3", 180), ("UTC+3", -180)] - ) - def test_infer_datetime_format_tz_name(self, tz_name, offset): + def test_infer_datetime_format_tz_name(self, tz_name, offset, warning): # GH 33133 ser = Series([f"2019-02-02 08:07:13 {tz_name}"]) - result = to_datetime(ser, infer_datetime_format=True) + with tm.assert_produces_warning(warning, match="Could not infer format"): + result = to_datetime(ser) expected = Series( [Timestamp("2019-02-02 08:07:13").tz_localize(pytz.FixedOffset(offset))] ) @@ -2203,26 +2176,38 @@ def test_to_datetime_iso8601_noleading_0s(self, cache, format): ) tm.assert_series_equal(to_datetime(ser, format=format, cache=cache), expected) + def test_parse_dates_infer_datetime_format_warning(self): + # GH 49024 + with tm.assert_produces_warning( + UserWarning, + match="The argument 'infer_datetime_format' is deprecated", + ): + to_datetime(["10-10-2000"], infer_datetime_format=True) + class TestDaysInMonth: # tests for issue #10154 @pytest.mark.parametrize( - "arg, format", + 
"arg, format, warning", [ - ["2015-02-29", None], - ["2015-02-29", "%Y-%m-%d"], - ["2015-02-32", "%Y-%m-%d"], - ["2015-04-31", "%Y-%m-%d"], + ["2015-02-29", None, UserWarning], + ["2015-02-29", "%Y-%m-%d", None], + ["2015-02-32", "%Y-%m-%d", None], + ["2015-04-31", "%Y-%m-%d", None], ], ) - def test_day_not_in_month_coerce(self, cache, arg, format): - assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache)) + def test_day_not_in_month_coerce(self, cache, arg, format, warning): + with tm.assert_produces_warning(warning, match="Could not infer format"): + assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache)) def test_day_not_in_month_raise(self, cache): msg = "day is out of range for month" with pytest.raises(ValueError, match=msg): - to_datetime("2015-02-29", errors="raise", cache=cache) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime("2015-02-29", errors="raise", cache=cache) @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) def test_day_not_in_month_raise_value(self, cache, arg): @@ -2231,85 +2216,85 @@ def test_day_not_in_month_raise_value(self, cache, arg): to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache) @pytest.mark.parametrize( - "expected, format", + "expected, format, warning", [ - ["2015-02-29", None], - ["2015-02-29", "%Y-%m-%d"], - ["2015-02-29", "%Y-%m-%d"], - ["2015-04-31", "%Y-%m-%d"], + ["2015-02-29", None, UserWarning], + ["2015-02-29", "%Y-%m-%d", None], + ["2015-02-29", "%Y-%m-%d", None], + ["2015-04-31", "%Y-%m-%d", None], ], ) - def test_day_not_in_month_ignore(self, cache, expected, format): - result = to_datetime(expected, errors="ignore", format=format, cache=cache) + def test_day_not_in_month_ignore(self, cache, expected, format, warning): + with tm.assert_produces_warning(warning, match="Could not infer format"): + result = to_datetime(expected, errors="ignore", format=format, cache=cache) assert result == 
expected class TestDatetimeParsingWrappers: @pytest.mark.parametrize( - "date_str,expected", - list( - { - "2011-01-01": datetime(2011, 1, 1), - "2Q2005": datetime(2005, 4, 1), - "2Q05": datetime(2005, 4, 1), - "2005Q1": datetime(2005, 1, 1), - "05Q1": datetime(2005, 1, 1), - "2011Q3": datetime(2011, 7, 1), - "11Q3": datetime(2011, 7, 1), - "3Q2011": datetime(2011, 7, 1), - "3Q11": datetime(2011, 7, 1), - # quarterly without space - "2000Q4": datetime(2000, 10, 1), - "00Q4": datetime(2000, 10, 1), - "4Q2000": datetime(2000, 10, 1), - "4Q00": datetime(2000, 10, 1), - "2000q4": datetime(2000, 10, 1), - "2000-Q4": datetime(2000, 10, 1), - "00-Q4": datetime(2000, 10, 1), - "4Q-2000": datetime(2000, 10, 1), - "4Q-00": datetime(2000, 10, 1), - "00q4": datetime(2000, 10, 1), - "2005": datetime(2005, 1, 1), - "2005-11": datetime(2005, 11, 1), - "2005 11": datetime(2005, 11, 1), - "11-2005": datetime(2005, 11, 1), - "11 2005": datetime(2005, 11, 1), - "200511": datetime(2020, 5, 11), - "20051109": datetime(2005, 11, 9), - "20051109 10:15": datetime(2005, 11, 9, 10, 15), - "20051109 08H": datetime(2005, 11, 9, 8, 0), - "2005-11-09 10:15": datetime(2005, 11, 9, 10, 15), - "2005-11-09 08H": datetime(2005, 11, 9, 8, 0), - "2005/11/09 10:15": datetime(2005, 11, 9, 10, 15), - "2005/11/09 08H": datetime(2005, 11, 9, 8, 0), - "Thu Sep 25 10:36:28 2003": datetime(2003, 9, 25, 10, 36, 28), - "Thu Sep 25 2003": datetime(2003, 9, 25), - "Sep 25 2003": datetime(2003, 9, 25), - "January 1 2014": datetime(2014, 1, 1), - # GHE10537 - "2014-06": datetime(2014, 6, 1), - "06-2014": datetime(2014, 6, 1), - "2014-6": datetime(2014, 6, 1), - "6-2014": datetime(2014, 6, 1), - "20010101 12": datetime(2001, 1, 1, 12), - "20010101 1234": datetime(2001, 1, 1, 12, 34), - "20010101 123456": datetime(2001, 1, 1, 12, 34, 56), - }.items() - ), + "date_str, expected, warning", + [ + ("2011-01-01", datetime(2011, 1, 1), None), + ("2Q2005", datetime(2005, 4, 1), UserWarning), + ("2Q05", datetime(2005, 4, 1), 
UserWarning), + ("2005Q1", datetime(2005, 1, 1), UserWarning), + ("05Q1", datetime(2005, 1, 1), UserWarning), + ("2011Q3", datetime(2011, 7, 1), UserWarning), + ("11Q3", datetime(2011, 7, 1), UserWarning), + ("3Q2011", datetime(2011, 7, 1), UserWarning), + ("3Q11", datetime(2011, 7, 1), UserWarning), + # quarterly without space + ("2000Q4", datetime(2000, 10, 1), UserWarning), + ("00Q4", datetime(2000, 10, 1), UserWarning), + ("4Q2000", datetime(2000, 10, 1), UserWarning), + ("4Q00", datetime(2000, 10, 1), UserWarning), + ("2000q4", datetime(2000, 10, 1), UserWarning), + ("2000-Q4", datetime(2000, 10, 1), UserWarning), + ("00-Q4", datetime(2000, 10, 1), UserWarning), + ("4Q-2000", datetime(2000, 10, 1), UserWarning), + ("4Q-00", datetime(2000, 10, 1), UserWarning), + ("00q4", datetime(2000, 10, 1), UserWarning), + ("2005", datetime(2005, 1, 1), None), + ("2005-11", datetime(2005, 11, 1), UserWarning), + ("2005 11", datetime(2005, 11, 1), UserWarning), + ("11-2005", datetime(2005, 11, 1), UserWarning), + ("11 2005", datetime(2005, 11, 1), UserWarning), + ("200511", datetime(2020, 5, 11), UserWarning), + ("20051109", datetime(2005, 11, 9), None), + ("20051109 10:15", datetime(2005, 11, 9, 10, 15), None), + ("20051109 08H", datetime(2005, 11, 9, 8, 0), None), + ("2005-11-09 10:15", datetime(2005, 11, 9, 10, 15), None), + ("2005-11-09 08H", datetime(2005, 11, 9, 8, 0), None), + ("2005/11/09 10:15", datetime(2005, 11, 9, 10, 15), None), + ("2005/11/09 08H", datetime(2005, 11, 9, 8, 0), None), + ("Thu Sep 25 10:36:28 2003", datetime(2003, 9, 25, 10, 36, 28), None), + ("Thu Sep 25 2003", datetime(2003, 9, 25), None), + ("Sep 25 2003", datetime(2003, 9, 25), None), + ("January 1 2014", datetime(2014, 1, 1), None), + # GHE10537 + ("2014-06", datetime(2014, 6, 1), UserWarning), + ("06-2014", datetime(2014, 6, 1), UserWarning), + ("2014-6", datetime(2014, 6, 1), UserWarning), + ("6-2014", datetime(2014, 6, 1), UserWarning), + ("20010101 12", datetime(2001, 1, 1, 12), None), + 
("20010101 1234", datetime(2001, 1, 1, 12, 34), UserWarning), + ("20010101 123456", datetime(2001, 1, 1, 12, 34, 56), UserWarning), + ], ) - def test_parsers(self, date_str, expected, cache): + def test_parsers(self, date_str, expected, warning, cache): # dateutil >= 2.5.0 defaults to yearfirst=True # https://github.com/dateutil/dateutil/issues/217 yearfirst = True result1, _ = parsing.parse_time_string(date_str, yearfirst=yearfirst) - result2 = to_datetime(date_str, yearfirst=yearfirst) - result3 = to_datetime([date_str], yearfirst=yearfirst) - # result5 is used below - result4 = to_datetime( - np.array([date_str], dtype=object), yearfirst=yearfirst, cache=cache - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + result2 = to_datetime(date_str, yearfirst=yearfirst) + result3 = to_datetime([date_str], yearfirst=yearfirst) + # result5 is used below + result4 = to_datetime( + np.array([date_str], dtype=object), yearfirst=yearfirst, cache=cache + ) result6 = DatetimeIndex([date_str], yearfirst=yearfirst) # result7 is used below result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) @@ -2418,9 +2403,10 @@ def test_parsers_dayfirst_yearfirst( result2 = Timestamp(date_str) assert result2 == expected - result3 = to_datetime( - date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache - ) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result3 = to_datetime( + date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache + ) result4 = DatetimeIndex([date_str], dayfirst=dayfirst, yearfirst=yearfirst)[0] @@ -2437,8 +2423,9 @@ def test_parsers_timestring(self, date_str, exp_def): exp_now = parse(date_str) result1, _ = parsing.parse_time_string(date_str) - result2 = to_datetime(date_str) - result3 = to_datetime([date_str]) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result2 = to_datetime(date_str) + result3 = to_datetime([date_str]) result4 = Timestamp(date_str) 
result5 = DatetimeIndex([date_str])[0] # parse time string return time string based on default date @@ -2602,17 +2589,23 @@ def test_incorrect_value_exception(self): with pytest.raises( ValueError, match="Unknown string format: yesterday present at position 1" ): - to_datetime(["today", "yesterday"]) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(["today", "yesterday"]) - @pytest.mark.parametrize("format", [None, "%Y-%m-%d %H:%M:%S"]) - def test_to_datetime_out_of_bounds_with_format_arg(self, format): + @pytest.mark.parametrize( + "format, warning", [(None, UserWarning), ("%Y-%m-%d %H:%M:%S", None)] + ) + def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 msg = ( "Out of bounds nanosecond timestamp: 2417-10-27 00:00:00 " "present at position 0" ) with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime("2417-10-27 00:00:00", format=format) + with tm.assert_produces_warning(warning, match="Could not infer format"): + to_datetime("2417-10-27 00:00:00", format=format) @pytest.mark.parametrize( "arg, origin, expected_str", From d67bd3576215ee65108f913375b3e353adc1a720 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 09:35:49 +0100 Subject: [PATCH 06/12] :memo: update docs --- doc/source/user_guide/basics.rst | 2 ++ doc/source/user_guide/io.rst | 31 +++++--------------------- doc/source/user_guide/timeseries.rst | 27 +++++++---------------- doc/source/whatsnew/v2.0.0.rst | 33 ++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 44 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index a34d4891b9d77..92fae28d3bdb3 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -2313,6 +2313,7 @@ useful if you are reading in data which is mostly of the desired dtype (e.g. num non-conforming elements intermixed that you want to represent as missing: .. 
ipython:: python + :okwarning: import datetime @@ -2329,6 +2330,7 @@ The ``errors`` parameter has a third option of ``errors='ignore'``, which will s encounters any errors with the conversion to a desired data type: .. ipython:: python + :okwarning: import datetime diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 63e6b007f77a8..3f7abac3e5582 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1009,41 +1009,22 @@ To parse the mixed-timezone values as a datetime column, pass a partially-applie Inferring datetime format +++++++++++++++++++++++++ -If you have ``parse_dates`` enabled for some or all of your columns, and your -datetime strings are all formatted the same way, you may get a large speed -up by setting ``infer_datetime_format=True``. If set, pandas will attempt -to guess the format of your datetime strings, and then use a faster means -of parsing the strings. 5-10x parsing speeds have been observed. pandas -will fallback to the usual parsing if either the format cannot be guessed -or the format that was guessed cannot properly parse the entire column -of strings. So in general, ``infer_datetime_format`` should not have any -negative consequences if enabled. - -Here are some examples of datetime strings that can be guessed (All -representing December 30th, 2011 at 00:00:00): - -* "20111230" -* "2011/12/30" -* "20111230 00:00:00" -* "12/30/2011 00:00:00" -* "30/Dec/2011 00:00:00" -* "30/December/2011 00:00:00" - -Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With -``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With -``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th. +If you try to parse a column of date strings, pandas will attempt to guess the format +from the first non-NaN element, and will then parse the rest of the column with that +format. .. 
ipython:: python - # Try to infer the format for the index column df = pd.read_csv( "foo.csv", index_col=0, parse_dates=True, - infer_datetime_format=True, ) df +In the case that you have mixed datetime formats within the same column, you'll need to +first read it in the file, and then apply :func:`to_datetime` to each element. + .. ipython:: python :suppress: diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 474068e43a4d4..2710a22ec6161 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -13,17 +13,6 @@ a tremendous amount of new functionality for manipulating time series data. For example, pandas supports: -Parsing time series information from various sources and formats - -.. ipython:: python - - import datetime - - dti = pd.to_datetime( - ["1/1/2018", np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1)] - ) - dti - Generate sequences of fixed-frequency dates and time spans .. ipython:: python @@ -132,6 +121,8 @@ time. .. ipython:: python + import datetime + pd.Timestamp(datetime.datetime(2012, 5, 1)) pd.Timestamp("2012-05-01") pd.Timestamp(2012, 5, 1) @@ -196,26 +187,24 @@ is converted to a ``DatetimeIndex``: .. ipython:: python - pd.to_datetime(pd.Series(["Jul 31, 2009", "2010-01-10", None])) + pd.to_datetime(pd.Series(["Jul 31, 2009", "Jan 10, 2010", None])) - pd.to_datetime(["2005/11/23", "2010.12.31"]) + pd.to_datetime(["2005/11/23", "2010/12/31"]) If you use dates which start with the day first (i.e. European style), you can pass the ``dayfirst`` flag: .. ipython:: python - :okwarning: + :okwarning: pd.to_datetime(["04-01-2012 10:00"], dayfirst=True) - - pd.to_datetime(["14-01-2012", "01-14-2012"], dayfirst=True) + pd.to_datetime(["04-14-2012 10:00"], dayfirst=True) .. warning:: You see in the above example that ``dayfirst`` isn't strict. 
If a date can't be parsed with the day being first it will be parsed as if - ``dayfirst`` were False, and in the case of parsing delimited date strings - (e.g. ``31-12-2012``) then a warning will also be raised. + ``dayfirst`` were False and a warning will also be raised. If you pass a single string to ``to_datetime``, it returns a single ``Timestamp``. ``Timestamp`` can also accept string input, but it doesn't accept string parsing @@ -768,7 +757,7 @@ partially matching dates: rng2 = pd.date_range("2011-01-01", "2012-01-01", freq="W") ts2 = pd.Series(np.random.randn(len(rng2)), index=rng2) - ts2.truncate(before="2011-11", after="2011-12") + ts2.truncate(before="2011-11-01", after="2011-12-01") ts2["2011-11":"2011-12"] Even complicated fancy indexing that breaks the ``DatetimeIndex`` frequency diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 80c8ad9a8b2eb..840c29ec8b09e 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -114,6 +114,39 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. +Datetimes are now parsed with a consistent format +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`to_datetime` now parses dates with a consistent format, which is guessed from the first non-NA value +(unless ``format`` is specified). Previously, it would've guessed the format for each element individually. + +*Old behavior*: + + .. code-block:: ipython + + In [1]: ser = pd.Series(['13-01-2000', '12-01-2000']) + In [2]: pd.to_datetime(ser) + Out[2]: + 0 2000-01-13 + 1 2000-12-01 + dtype: datetime64[ns] + +*New behavior*: + + .. ipython:: python + :okwarning: + + ser = pd.Series(['13-01-2000', '12-01-2000']) + pd.to_datetime(ser) + +Note that this affects :func:`read_csv` as well. 
+ +If you still need to parse dates with inconsistent formats, you'll need to apply :func:`to_datetime` +to each element individually, e.g. :: + + ser = pd.Series(['13-01-2000', '12 January 2000']) + ser.apply(pd.to_datetime) + .. _whatsnew_200.api_breaking.other: Other API changes From 98db2b5c7e5e4206d15b0348f4f37d429e0b1c78 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 19 Oct 2022 10:15:01 +0100 Subject: [PATCH 07/12] :memo: add example of reading csv file with mixed formats --- doc/source/user_guide/io.rst | 9 ++++++++- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/_libs/tslibs/parsing.pyx | 4 ++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 3f7abac3e5582..c2992236381c7 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1023,7 +1023,14 @@ format. df In the case that you have mixed datetime formats within the same column, you'll need to -first read it in the file, and then apply :func:`to_datetime` to each element. +first read it in as an object dtype and then apply :func:`to_datetime` to each element. + +.. ipython:: python + + data = io.StringIO("date\n12 Jan 2000\n2000-01-13\n") + df = pd.read_csv(data) + df['date'] = df['date'].apply(pd.to_datetime) + df .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 840c29ec8b09e..49f99592e69a8 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -118,7 +118,7 @@ Datetimes are now parsed with a consistent format ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`to_datetime` now parses dates with a consistent format, which is guessed from the first non-NA value -(unless ``format`` is specified). Previously, it would've guessed the format for each element individually. +(unless ``format`` is specified). Previously, it would have guessed the format for each element individually. 
*Old behavior*: diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 74de3502b73de..c9df9146240da 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1117,13 +1117,13 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst): if (day_index > month_index) and dayfirst: warnings.warn( f"Parsing dates in {format} format when dayfirst=True was specified. " - f"Pass `dayfirst=False` or specify a format to silence this warning.", + "Pass `dayfirst=False` or specify a format to silence this warning.", stacklevel=find_stack_level(), ) if (day_index < month_index) and not dayfirst: warnings.warn( f"Parsing dates in {format} format when dayfirst=False was specified. " - f"Pass `dayfirst=True` or specify a format to silence this warning.", + "Pass `dayfirst=True` or specify a format to silence this warning.", stacklevel=find_stack_level(), ) From cc307ab27b36c323e8ef11b78988b3df059d0f46 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 19 Oct 2022 10:24:42 +0100 Subject: [PATCH 08/12] :wastebasket: removed now outdated tests / clean inputs --- .../indexes/datetimes/test_constructors.py | 22 ++-------- pandas/tests/tools/test_to_datetime.py | 43 +------------------ 2 files changed, 6 insertions(+), 59 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index c1039728f5b5e..a9491f90e80f0 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1042,27 +1042,13 @@ def test_datetimeindex_constructor_misc(self): arr = np.array(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"], dtype="O") idx4 = DatetimeIndex(arr) - # Can't be parsed consistently, need to parse each element individually - arr = [ - to_datetime(date_string) - for date_string in ["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"] - ] - idx5 = DatetimeIndex(arr) - - # Can't be parsed 
consistently, need to parse each element individually - arr = [ - to_datetime(date_string) - for date_string in ["1/1/2005", "1/2/2005", "Jan 3, 2005", "2005-01-04"] - ] - idx6 = DatetimeIndex(arr) - - idx7 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) - idx8 = DatetimeIndex( + idx5 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) + idx6 = DatetimeIndex( ["2007/05/12", "2008/01/25"], dayfirst=False, yearfirst=True ) - tm.assert_index_equal(idx7, idx8) + tm.assert_index_equal(idx5, idx6) - for other in [idx2, idx3, idx4, idx5, idx6]: + for other in [idx2, idx3, idx4]: assert (idx1.values == other.values).all() sdate = datetime(1999, 12, 25) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index a2871e79dc7d9..e3b9e30e1923c 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1225,40 +1225,6 @@ def test_iso_8601_strings_with_different_offsets_utc(self): ) tm.assert_index_equal(result, expected) - def test_iso8601_strings_mixed_offsets_with_naive(self): - # GH 24992 - # Can't parse consistently, need to parse each element in loop. - result = DatetimeIndex( - [ - to_datetime(string, utc=True) - for string in [ - "2018-11-28T00:00:00", - "2018-11-28T00:00:00+12:00", - "2018-11-28T00:00:00", - "2018-11-28T00:00:00+06:00", - "2018-11-28T00:00:00", - ] - ] - ) - expected = to_datetime( - [ - "2018-11-28T00:00:00", - "2018-11-27T12:00:00", - "2018-11-28T00:00:00", - "2018-11-27T18:00:00", - "2018-11-28T00:00:00", - ], - utc=True, - ) - tm.assert_index_equal(result, expected) - - def test_iso8601_strings_mixed_offsets_with_naive_reversed(self): - items = ["2018-11-28T00:00:00+12:00", "2018-11-28T00:00:00"] - # Can't parse consistently, need to parse each element in loop. 
- result = [to_datetime(item, utc=True) for item in items] - expected = [to_datetime(item, utc=True) for item in list(reversed(items))][::-1] - assert result == expected - def test_mixed_offsets_with_native_datetime_raises(self): # GH 25978 @@ -1910,9 +1876,7 @@ def test_to_datetime_overflow(self): def test_string_na_nat_conversion(self, cache): # GH #999, #858 - strings = np.array( - ["1/1/2000", "1/2/2000", np.nan, "1/4/2000, 12:34:56"], dtype=object - ) + strings = np.array(["1/1/2000", "1/2/2000", np.nan, "1/4/2000"], dtype=object) expected = np.empty(4, dtype="M8[ns]") for i, val in enumerate(strings): @@ -1924,10 +1888,7 @@ def test_string_na_nat_conversion(self, cache): result = tslib.array_to_datetime(strings)[0] tm.assert_almost_equal(result, expected) - # Can't parse in consistent format, so need to convert each individually. - result2 = DatetimeIndex( - [to_datetime(string, cache=cache) for string in strings] - ) + result2 = to_datetime(strings, cache=cache) assert isinstance(result2, DatetimeIndex) tm.assert_numpy_array_equal(result, result2.values) From fc419d526ad1f7cf2a1174dae6f9385734e7283e Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 19 Oct 2022 15:21:11 +0100 Subject: [PATCH 09/12] make iso8601 fastpath respect exact --- f.py | 92 +++++++++++++++++++ pandas/_libs/tslib.pyx | 50 +++++++++- pandas/_libs/tslibs/conversion.pyx | 13 ++- pandas/_libs/tslibs/np_datetime.pxd | 11 +++ pandas/_libs/tslibs/np_datetime.pyx | 44 ++++++++- pandas/_libs/tslibs/parsing.pyx | 13 ++- .../tslibs/src/datetime/np_datetime_strings.c | 55 ++++++++++- .../tslibs/src/datetime/np_datetime_strings.h | 13 ++- pandas/core/arrays/datetimes.py | 8 ++ pandas/core/tools/datetimes.py | 27 ++++-- pandas/tests/tools/test_to_datetime.py | 92 ++++++++++++++++++- 11 files changed, 396 insertions(+), 22 deletions(-) create mode 100644 f.py diff --git a/f.py b/f.py new file mode 100644 index 0000000000000..e2e151a6271d9 --- /dev/null +++ b/f.py @@ -0,0 +1,92 @@ +from typing 
import NamedTuple + + +class ISO8601Info(NamedTuple): + format: str = b"" + date_sep: str = b"" + time_sep: str = b"" + micro_or_tz: str = b"" + year: bool = False + month: bool = False + day: bool = False + hour: bool = False + minute: bool = False + second: bool = False + + +def format_is_iso(f: str): + """ + Does format match the iso8601 set that can be handled by the C parser? + Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different + but must be consistent. Leading 0s in dates and times are optional. + + no, needs doing in c. eff... + """ + excluded_formats = ["%Y%m%d", "%Y%m", "%Y"] + + if f in excluded_formats: + return ISO8601Info() + for date_sep in [" ", "/", "\\", "-", ".", ""]: + for time_sep in [" ", "T"]: + for micro_or_tz in ["", "%z", "%Z", ".%f", ".%f%z", ".%f%Z"]: + if f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}" == f: + return ISO8601Info( + format=f.encode("utf-8"), + date_sep=date_sep.encode("utf-8"), + time_sep=time_sep.encode("utf-8"), + micro_or_tz=micro_or_tz.encode("utf-8"), + year=True, + month=True, + day=True, + hour=True, + minute=True, + second=True, + ) + elif f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M" == f: + return ISO8601Info( + format=f.encode("utf-8"), + date_sep=date_sep.encode("utf-8"), + time_sep=time_sep.encode("utf-8"), + micro_or_tz=micro_or_tz.encode("utf-8"), + year=True, + month=True, + day=True, + hour=True, + minute=True, + ) + elif f"%Y{date_sep}%m{date_sep}%d{time_sep}%H" == f: + return ISO8601Info( + format=f.encode("utf-8"), + date_sep=date_sep.encode("utf-8"), + time_sep=time_sep.encode("utf-8"), + micro_or_tz=micro_or_tz.encode("utf-8"), + year=True, + month=True, + day=True, + hour=True, + ) + elif f"%Y{date_sep}%m{date_sep}%d" == f: + return ISO8601Info( + format=f.encode("utf-8"), + date_sep=date_sep.encode("utf-8"), + time_sep=time_sep.encode("utf-8"), + micro_or_tz=micro_or_tz.encode("utf-8"), + year=True, + month=True, + day=True, + ) + elif f"%Y{date_sep}%m" == f: + 
return ISO8601Info( + format=f.encode("utf-8"), + date_sep=date_sep.encode("utf-8"), + time_sep=time_sep.encode("utf-8"), + micro_or_tz=micro_or_tz.encode("utf-8"), + year=True, + month=True, + ) + return ISO8601Info() + + +if __name__ == "__main__": + print(format_is_iso("%Y-%m-%d %H:%M:%S%z")) +# print(format_is_iso('%Y%m%d %H')) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 03331f54db892..a4036023843ef 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -93,7 +93,19 @@ def _test_parse_iso8601(ts: str): elif ts == 'today': return Timestamp.now().normalize() - string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True) + string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, + format='', + date_sep='', + time_sep='', + micro_or_tz='', + year=False, + month=False, + day=False, + hour=False, + minute=False, + second=False, + exact=False, + ) obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts) check_dts_bounds(&obj.dts) if out_local == 1: @@ -449,6 +461,17 @@ cpdef array_to_datetime( bint utc=False, bint require_iso8601=False, bint allow_mixed=False, + const char *format='', + const char *date_sep='', + const char *time_sep='', + const char *micro_or_tz='', + bint year=False, + bint month=False, + bint day=False, + bint hour=False, + bint minute=False, + bint second=False, + bint exact=False, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -568,6 +591,16 @@ cpdef array_to_datetime( iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) elif is_integer_object(val) or is_float_object(val): + if require_iso8601: + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise ValueError( + f"time data \"{val}\" at position {i} doesn't match format {format.decode('utf-8')}" + ) + return values, tz_out + # these must be ns unit by-definition seen_integer = True @@ -598,7 +631,18 @@ cpdef array_to_datetime( string_to_dts_failed = string_to_dts( val, 
&dts, &out_bestunit, &out_local, - &out_tzoffset, False + &out_tzoffset, False, + format, + date_sep=date_sep, + time_sep=time_sep, + micro_or_tz=micro_or_tz, + year=year, + month=month, + day=day, + hour=hour, + minute=minute, + second=second, + exact=exact, ) if string_to_dts_failed: # An error at this point is a _parsing_ error @@ -613,7 +657,7 @@ cpdef array_to_datetime( continue elif is_raise: raise ValueError( - f"time data \"{val}\" at position {i} doesn't match format specified" + f"time data \"{val}\" at position {i} doesn't match format {format.decode('utf-8')}" ) return values, tz_out diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 923dfa3c54d26..a4d10703f4865 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -488,7 +488,18 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, else: string_to_dts_failed = string_to_dts( ts, &dts, &out_bestunit, &out_local, - &out_tzoffset, False + &out_tzoffset, False, + '', + date_sep='', + time_sep='', + micro_or_tz='', + year=False, + month=False, + day=False, + hour=False, + minute=False, + second=False, + exact=False, ) if not string_to_dts_failed: try: diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index e51bbd4e074e1..82363ef79e29b 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -95,6 +95,17 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, + const char *format, + const char *date_sep, + const char *time_sep, + const char *micro_or_tz, + bint year, + bint month, + bint day, + bint hour, + bint minute, + bint second, + bint exact, ) except? 
-1 cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 07872050dc822..7749c77d13f3b 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -52,7 +52,19 @@ cdef extern from "src/datetime/np_datetime_strings.h": int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, - int *out_local, int *out_tzoffset) + int *out_local, int *out_tzoffset, + int format, + const char *date_sep, + const char *time_sep, + const char *micro_or_tz, + int year, + int month, + int day, + int hour, + int minute, + int second, + int exact + ) # ---------------------------------------------------------------------- @@ -273,14 +285,40 @@ cdef inline int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, + const char *format, + const char *date_sep, + const char *time_sep, + const char *micro_or_tz, + bint year, + bint month, + bint day, + bint hour, + bint minute, + bint second, + bint exact, ) except? -1: cdef: Py_ssize_t length + Py_ssize_t format_length const char* buf buf = get_c_string_buf_and_size(val, &length) - return parse_iso_8601_datetime(buf, length, want_exc, - dts, out_bestunit, out_local, out_tzoffset) + format_length = len(format) + result = parse_iso_8601_datetime(buf, length, want_exc, + dts, out_bestunit, out_local, out_tzoffset, + format_length, + date_sep, + time_sep, + micro_or_tz, + year, + month, + day, + hour, + minute, + second, + exact, + ) + return result cpdef ndarray astype_overflowsafe( diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index c9df9146240da..30f934ef9129c 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -409,7 +409,18 @@ cdef parse_datetime_string_with_reso( # TODO: does this render some/all of parse_delimited_date redundant? 
string_to_dts_failed = string_to_dts( date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False + &out_tzoffset, False, + '', + '', + '', + '', + False, + False, + False, + False, + False, + False, + False, ) if not string_to_dts_failed: if dts.ps != 0 or out_local: diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index cfbaed01b57c9..f5977b8066ef0 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -69,7 +69,18 @@ This file implements string parsing and creation for NumPy datetime. int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, - int *out_local, int *out_tzoffset) { + int *out_local, int *out_tzoffset, + int format_length, + const char *date_sep, + const char *time_sep, + const char *micro_or_tz, + int year, + int month, + int day, + int hour, + int minute, + int second, + int exact) { int year_leap = 0; int i, numdigits; const char *substr; @@ -134,6 +145,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* Check whether it's a leap-year */ year_leap = is_leapyear(out->year); + /* If the format contains month but we're + already at the end of the string, error */ + if ((format_length > 0) && month && (sublen == 0)) { + goto parse_error; + } /* Next character must be a separator, start of month, or end of string */ if (sublen == 0) { if (out_local != NULL) { @@ -154,6 +170,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } has_ymd_sep = 1; ymd_sep = valid_ymd_sep[i]; + if ((format_length > 0) && (ymd_sep != *date_sep)) { + goto parse_error; + } ++substr; --sublen; /* Cannot have trailing separator */ @@ -163,6 +182,12 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MONTH */ + + /* If the format doesn't contain month, and there's still some 
+ string to be parsed, and we're checking for an exact match, error*/ + if ((format_length > 0) && !month && exact) { + goto parse_error; + } /* First digit required */ out->month = (*substr - '0'); ++substr; @@ -183,6 +208,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto error; } + if ((format_length > 0) && day && (sublen == 0)) { + goto parse_error; + } /* Next character must be the separator, start of day, or end of string */ if (sublen == 0) { bestunit = NPY_FR_M; @@ -206,6 +234,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE DAY */ + if ((format_length > 0) && !day && exact) { + goto parse_error; + } /* First digit required */ if (!isdigit(*substr)) { goto parse_error; @@ -230,6 +261,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto error; } + if ((format_length > 0) && hour && (sublen == 0)) { + goto parse_error; + } /* Next character must be a 'T', ' ', or end of string */ if (sublen == 0) { if (out_local != NULL) { @@ -239,13 +273,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto finish; } - if ((*substr != 'T' && *substr != ' ') || sublen == 1) { + if ((format_length > 0) && (*substr != *time_sep)) { + goto parse_error; + } else if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } ++substr; --sublen; /* PARSE THE HOURS */ + if ((format_length > 0) && !hour && exact) { + goto parse_error; + } /* First digit required */ if (!isdigit(*substr)) { goto parse_error; @@ -269,6 +308,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } } + if ((format_length > 0) && minute && (sublen == 0)) { + goto parse_error; + } /* Next character must be a ':' or the end of the string */ if (sublen == 0) { if (!hour_was_2_digits) { @@ -294,6 +336,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MINUTES */ + if ((format_length > 0) && !minute && exact) { + goto
parse_error; + } /* First digit required */ out->min = (*substr - '0'); ++substr; @@ -315,6 +360,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto parse_error; } + if ((format_length > 0) && second && (sublen == 0)) { + goto parse_error; + } if (sublen == 0) { bestunit = NPY_FR_m; goto finish; @@ -335,6 +383,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE SECONDS */ + if ((format_length > 0) && !second && exact) { + goto parse_error; + } /* First digit required */ out->sec = (*substr - '0'); ++substr; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 511d9a401fed2..7ebf3e981a787 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -58,7 +58,18 @@ parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, - int *out_tzoffset); + int *out_tzoffset, + int format_length, + const char *date_sep, + const char *time_sep, + const char *micro_or_tz, + int year, + int month, + int day, + int hour, + int minute, + int second, + int exact); /* * Provides a string length to use for converting datetime diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index ca0a745c180e9..61e8dcadf7dc9 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -78,6 +78,7 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna +from f import ISO8601Info from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com @@ -2180,6 +2181,8 @@ def objects_to_datetime64ns( require_iso8601: bool = False, allow_object: bool = False, allow_mixed: bool = False, + iso_info=ISO8601Info(), + exact: bool = False, ): """ Convert 
data to array of timestamps. @@ -2193,11 +2196,14 @@ def objects_to_datetime64ns( Whether to convert timezone-aware timestamps to UTC. errors : {'raise', 'ignore', 'coerce'} require_iso8601 : bool, default False + If True, then only try parsing in ISO8601 format, and skip other formats. allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. allow_mixed : bool, default False Interpret integers as timestamps when datetime objects are also present. + iso_info : ISO8601Info + Info about how to parse the ISO8601-formatted string. Returns ------- @@ -2227,6 +2233,8 @@ def objects_to_datetime64ns( yearfirst=yearfirst, require_iso8601=require_iso8601, allow_mixed=allow_mixed, + **iso_info._asdict(), + exact=exact, ) result = result.reshape(data.shape, order=order) except OverflowError as err: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 41feb153978d4..7607f637cabc6 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -32,9 +32,8 @@ parsing, timezones, ) -from pandas._libs.tslibs.parsing import ( +from pandas._libs.tslibs.parsing import ( # format_is_iso, DateParseError, - format_is_iso, guess_datetime_format, ) from pandas._libs.tslibs.strptime import array_strptime @@ -65,6 +64,10 @@ ) from pandas.core.dtypes.missing import notna +from f import ( + ISO8601Info, + format_is_iso, +) from pandas.arrays import ( DatetimeArray, IntegerArray, @@ -424,19 +427,21 @@ def _convert_listlike_datetimes( raise arg = ensure_object(arg) - require_iso8601 = False if format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - # There is a special fast-path for iso8601 formatted - # datetime strings, so in those cases don't use the inferred - # format because this path makes process slower in this - # special case - if format is not None and format_is_iso(format): - require_iso8601 = True - format = None if format is not None: +
iso_info = format_is_iso(format) + require_iso8601 = True + else: + iso_info = ISO8601Info() + require_iso8601 = False + if format is not None and not iso_info.format: + # There is a special fast-path for iso8601 formatted + # datetime strings, so in those cases don't use the inferred + # format because this path makes process slower in this + # special case return _to_datetime_with_format(arg, orig_arg, name, tz, format, exact, errors) utc = tz == "utc" @@ -448,6 +453,8 @@ def _convert_listlike_datetimes( errors=errors, require_iso8601=require_iso8601, allow_object=True, + iso_info=iso_info, + exact=exact, ) if tz_parsed is not None: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index e3b9e30e1923c..77946f0570256 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1733,6 +1733,96 @@ def test_to_datetime_iso8601(self, cache, arg, exp_str): exp = Timestamp(exp_str) assert result[0] == exp + @pytest.mark.parametrize( + "input, format", + [ + ("2012", "%Y-%m"), + ("2012-01", "%Y-%m-%d"), + ("2012-01-01", "%Y-%m-%d %H"), + ("2012-01-01 10", "%Y-%m-%d %H:%M"), + ("2012-01-01 10:00", "%Y-%m-%d %H:%M:%S"), + (0, "%Y-%m-%d"), + ], + ) + @pytest.mark.parametrize("exact", [True, False]) + def test_to_datetime_iso8601_fails(self, input, format, exact): + with pytest.raises( + ValueError, + match=rf"time data \"{input}\" at position 0 doesn't match format {format}", + ): + to_datetime(input, format=format, exact=exact) + + @pytest.mark.parametrize( + "input, format", + [ + ("2012-01-01", "%Y-%m"), + ("2012-01-01 10", "%Y-%m-%d"), + ("2012-01-01 10:00", "%Y-%m-%d %H"), + ("2012-01-01 10:00:00", "%Y-%m-%d %H:%M"), + (0, "%Y-%m-%d"), + ], + ) + def test_to_datetime_iso8601_exact_fails(self, input, format): + with pytest.raises( + ValueError, + match=rf"time data \"{input}\" at position 0 doesn't match format {format}", + ): + to_datetime(input, format=format) + + 
@pytest.mark.parametrize( + "input, format", + [ + ("2012-01-01", "%Y-%m"), + ("2012-01-01 10", "%Y-%m-%d"), + ("2012-01-01 10:00", "%Y-%m-%d %H"), + ("2012-01-01 10:00:00", "%Y-%m-%d %H:%M"), + ], + ) + def test_to_datetime_iso8601_non_exact(self, input, format): + to_datetime(input, format=format, exact=False) + + @pytest.mark.parametrize( + "input, format", + [ + ("2020-01", "%Y/%m"), + ("2020-01-01", "%Y/%m/%d"), + ("2020-01-01 00", "%Y/%m/%dT%H"), + ("2020-01-01T00", "%Y/%m/%d %H"), + ("2020-01-01 00:00", "%Y/%m/%dT%H:%M"), + ("2020-01-01T00:00", "%Y/%m/%d %H:%M"), + ("2020-01-01 00:00:00", "%Y/%m/%dT%H:%M:%S"), + ("2020-01-01T00:00:00", "%Y/%m/%d %H:%M:%S"), + ], + ) + def test_to_datetime_iso8601_separator(self, input, format): + with pytest.raises( + ValueError, + match=( + rf"time data \"{input}\" at position 0 doesn\'t match format {format}" + ), + ): + to_datetime(input, format=format) + + @pytest.mark.parametrize( + "input, format", + [ + ("2020-01", "%Y-%m"), + ("2020-01-01", "%Y-%m-%d"), + ("2020-01-01 00", "%Y-%m-%d %H"), + ("2020-01-01T00", "%Y-%m-%dT%H"), + ("2020-01-01 00:00", "%Y-%m-%d %H:%M"), + ("2020-01-01T00:00", "%Y-%m-%dT%H:%M"), + ("2020-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"), + ("2020-01-01T00:00:00", "%Y-%m-%dT%H:%M:%S"), + ("2020-01-01T00:00:00.000", "%Y-%m-%dT%H:%M:%S"), + ("2020-01-01T00:00:00.000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-01-01T00:00:00.000000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-01-01T00:00:00.000000000", "%Y-%m-%dT%H:%M:%S.%f"), + ], + ) + def test_to_datetime_iso8601_valid(self, input, format): + to_datetime(input, format=format) + def test_to_datetime_default(self, cache): rs = to_datetime("2001", cache=cache) xp = datetime(2001, 1, 1) @@ -2172,7 +2262,7 @@ def test_day_not_in_month_raise(self, cache): @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) def test_day_not_in_month_raise_value(self, cache, arg): - msg = f'time data "{arg}" at position 0 doesn\'t match format specified' + msg = f'time 
data "{arg}" at position 0 doesn\'t match format %Y-%m-%d' with pytest.raises(ValueError, match=msg): to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache) From de10e598feff0f21d7d6d86bb71efaa0cf520bf7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 19 Oct 2022 19:57:48 +0100 Subject: [PATCH 10/12] fixup test --- pandas/tests/io/parser/dtypes/test_categorical.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index 3b8c520004f12..cc23c95433148 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -263,7 +263,15 @@ def test_categorical_coerces_timestamp(all_parsers): dtype = {"b": CategoricalDtype([Timestamp("2014")])} data = "b\n2014-01-01\n2014-01-01T00:00:00" - expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) + if parser.engine == "pyarrow": + # pyarrow parses the data, and then + # converts to the dtypes + expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) + else: + # pandas parses the data as the dtype as it reads it, + # but the second row doesn't respect the format inferred + # from the first row (%Y-%m-%d) + expected = DataFrame({"b": Categorical([Timestamp("2014"), pd.NaT])}) result = parser.read_csv(StringIO(data), dtype=dtype) tm.assert_frame_equal(result, expected) From 594f3d49bbb1762e589a5d55219d979174e03850 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 20 Oct 2022 15:14:28 +0100 Subject: [PATCH 11/12] use struct --- pandas/_libs/tslib.pxd | 20 +++ pandas/_libs/tslib.pyx | 52 +++---- pandas/_libs/tslibs/conversion.pxd | 15 ++ pandas/_libs/tslibs/conversion.pyx | 29 ++-- pandas/_libs/tslibs/np_datetime.pxd | 28 ++-- pandas/_libs/tslibs/np_datetime.pyx | 53 ++----- pandas/_libs/tslibs/parsing.pxd | 21 +++ pandas/_libs/tslibs/parsing.pyx | 146 ++++++++++++++---- 
.../tslibs/src/datetime/np_datetime_strings.c | 36 ++--- .../tslibs/src/datetime/np_datetime_strings.h | 28 ++-- pandas/core/arrays/datetimes.py | 10 +- pandas/core/tools/datetimes.py | 12 +- 12 files changed, 280 insertions(+), 170 deletions(-) create mode 100644 pandas/_libs/tslib.pxd diff --git a/pandas/_libs/tslib.pxd b/pandas/_libs/tslib.pxd new file mode 100644 index 0000000000000..69ad8a0e10f4c --- /dev/null +++ b/pandas/_libs/tslib.pxd @@ -0,0 +1,20 @@ +from pandas._libs.tslibs.np_datetime cimport ( + NPY_DATETIMEUNIT, + npy_datetimestruct, +) + + +cdef extern from "src/datetime/np_datetime_strings.h": + ctypedef struct ISOInfo: + const char *format + int format_len + const char *date_sep + const char *time_sep + const char *micro_or_tz + int year + int month + int day + int hour + int minute + int second + int exact diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index a4036023843ef..5c150c5c7a958 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -85,16 +85,11 @@ def _test_parse_iso8601(ts: str): _TSObject obj int out_local = 0, out_tzoffset = 0 NPY_DATETIMEUNIT out_bestunit + ISOInfo iso_info - obj = _TSObject() - - if ts == 'now': - return Timestamp.utcnow() - elif ts == 'today': - return Timestamp.now().normalize() - - string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, + iso_info = ISOInfo( format='', + format_len=0, date_sep='', time_sep='', micro_or_tz='', @@ -105,7 +100,16 @@ def _test_parse_iso8601(ts: str): minute=False, second=False, exact=False, - ) + ) + + obj = _TSObject() + + if ts == 'now': + return Timestamp.utcnow() + elif ts == 'today': + return Timestamp.now().normalize() + + string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True, &iso_info) obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts) check_dts_bounds(&obj.dts) if out_local == 1: @@ -455,23 +459,13 @@ def first_non_null(values: ndarray) -> int: @cython.boundscheck(False) cpdef 
array_to_datetime( ndarray[object] values, + ISOInfo iso_info, str errors='raise', bint dayfirst=False, bint yearfirst=False, bint utc=False, bint require_iso8601=False, bint allow_mixed=False, - const char *format='', - const char *date_sep='', - const char *time_sep='', - const char *micro_or_tz='', - bint year=False, - bint month=False, - bint day=False, - bint hour=False, - bint minute=False, - bint second=False, - bint exact=False, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -533,6 +527,7 @@ cpdef array_to_datetime( tzinfo tz_out = None bint found_tz = False, found_naive = False + # specify error conditions assert is_raise or is_ignore or is_coerce @@ -597,7 +592,7 @@ cpdef array_to_datetime( continue elif is_raise: raise ValueError( - f"time data \"{val}\" at position {i} doesn't match format {format.decode('utf-8')}" + f"time data \"{val}\" at position {i} doesn't match format {iso_info.format.decode('utf-8')}" ) return values, tz_out @@ -631,18 +626,7 @@ cpdef array_to_datetime( string_to_dts_failed = string_to_dts( val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, - format, - date_sep=date_sep, - time_sep=time_sep, - micro_or_tz=micro_or_tz, - year=year, - month=month, - day=day, - hour=hour, - minute=minute, - second=second, - exact=exact, + &out_tzoffset, False, &iso_info, ) if string_to_dts_failed: # An error at this point is a _parsing_ error @@ -657,7 +641,7 @@ cpdef array_to_datetime( continue elif is_raise: raise ValueError( - f"time data \"{val}\" at position {i} doesn't match format {format.decode('utf-8')}" + f"time data \"{val}\" at position {i} doesn't match format {iso_info.format.decode('utf-8')}" ) return values, tz_out diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index c285b248f7a5b..e4db12a412523 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -40,3 +40,18 @@ cdef int64_t cast_from_unit(object ts, str unit) 
except? -1 cpdef (int64_t, int) precision_from_unit(str unit) cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso) + +cdef extern from "src/datetime/np_datetime_strings.h": + ctypedef struct ISOInfo: + const char *format + int format_len + const char *date_sep + const char *time_sep + const char *micro_or_tz + int year + int month + int day + int hour + int minute + int second + int exact diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index a4d10703f4865..acf1dc005cb9b 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -469,6 +469,22 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, datetime dt int64_t ival NPY_DATETIMEUNIT out_bestunit + ISOInfo iso_info + + iso_info = ISOInfo( + format='', + format_len=0, + date_sep='', + time_sep='', + micro_or_tz='', + year=False, + month=False, + day=False, + hour=False, + minute=False, + second=False, + exact=False, + ) if len(ts) == 0 or ts in nat_strings: ts = NaT @@ -488,18 +504,7 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, else: string_to_dts_failed = string_to_dts( ts, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, - '', - date_sep='', - time_sep='', - micro_or_tz='', - year=False, - month=False, - day=False, - hour=False, - minute=False, - second=False, - exact=False, + &out_tzoffset, False, &iso_info, ) if not string_to_dts_failed: try: diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 82363ef79e29b..bc67720897d54 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -95,17 +95,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - const char *format, - const char *date_sep, - const char *time_sep, - const char *micro_or_tz, - bint year, - bint month, - bint day, - bint hour, - bint minute, - bint second, - bint exact, + ISOInfo* iso_info, ) except? 
-1 cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) @@ -129,3 +119,19 @@ cdef int64_t convert_reso( NPY_DATETIMEUNIT to_reso, bint round_ok, ) except? -1 + +cdef extern from "src/datetime/np_datetime_strings.h": + + ctypedef struct ISOInfo: + const char *format + int format_len + const char *date_sep + const char *time_sep + const char *micro_or_tz + int year + int month + int day + int hour + int minute + int second + int exact diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 7749c77d13f3b..f68be361e5d35 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -48,23 +48,6 @@ cdef extern from "src/datetime/np_datetime.h": PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(cnp.PyArray_Descr *dtype); -cdef extern from "src/datetime/np_datetime_strings.h": - int parse_iso_8601_datetime(const char *str, int len, int want_exc, - npy_datetimestruct *out, - NPY_DATETIMEUNIT *out_bestunit, - int *out_local, int *out_tzoffset, - int format, - const char *date_sep, - const char *time_sep, - const char *micro_or_tz, - int year, - int month, - int day, - int hour, - int minute, - int second, - int exact - ) # ---------------------------------------------------------------------- @@ -285,38 +268,16 @@ cdef inline int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - const char *format, - const char *date_sep, - const char *time_sep, - const char *micro_or_tz, - bint year, - bint month, - bint day, - bint hour, - bint minute, - bint second, - bint exact, + ISOInfo* iso_info, ) except? 
-1: cdef: Py_ssize_t length - Py_ssize_t format_length const char* buf + buf = get_c_string_buf_and_size(val, &length) - format_length = len(format) result = parse_iso_8601_datetime(buf, length, want_exc, - dts, out_bestunit, out_local, out_tzoffset, - format_length, - date_sep, - time_sep, - micro_or_tz, - year, - month, - day, - hour, - minute, - second, - exact, + dts, out_bestunit, out_local, out_tzoffset, iso_info ) return result @@ -640,3 +601,11 @@ cdef int64_t _convert_reso_with_dtstruct( pandas_datetime_to_datetimestruct(value, from_unit, &dts) check_dts_bounds(&dts, to_unit) return npy_datetimestruct_to_datetime(to_unit, &dts) + +cdef extern from "src/datetime/np_datetime_strings.h": + int parse_iso_8601_datetime(const char *str, int len, int want_exc, + npy_datetimestruct *out, + NPY_DATETIMEUNIT *out_bestunit, + int *out_local, int *out_tzoffset, + ISOInfo *iso_info + ) diff --git a/pandas/_libs/tslibs/parsing.pxd b/pandas/_libs/tslibs/parsing.pxd index 25667f00e42b5..a84cfa2523379 100644 --- a/pandas/_libs/tslibs/parsing.pxd +++ b/pandas/_libs/tslibs/parsing.pxd @@ -1,3 +1,24 @@ cpdef str get_rule_month(str source) cpdef quarter_to_myear(int year, int quarter, str freq) +cpdef ISOInfo null_iso_info() +from pandas._libs.tslibs.np_datetime cimport ( + NPY_DATETIMEUNIT, + npy_datetimestruct, +) + + +cdef extern from "src/datetime/np_datetime_strings.h": + ctypedef struct ISOInfo: + const char *format + int format_len + const char *date_sep + const char *time_sep + const char *micro_or_tz + int year + int month + int day + int hour + int minute + int second + int exact diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 30f934ef9129c..c76646b8b1f46 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -397,6 +397,22 @@ cdef parse_datetime_string_with_reso( NPY_DATETIMEUNIT out_bestunit int out_local int out_tzoffset + ISOInfo iso_info + + iso_info = ISOInfo( + format='', + format_len=0, + 
date_sep='', + time_sep='', + micro_or_tz='', + year=False, + month=False, + day=False, + hour=False, + minute=False, + second=False, + exact=False, + ) if not _does_string_look_like_datetime(date_string): raise ValueError(f'Given date string {date_string} not likely a datetime') @@ -409,18 +425,7 @@ cdef parse_datetime_string_with_reso( # TODO: does this render some/all of parse_delimited_date redundant? string_to_dts_failed = string_to_dts( date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, - '', - '', - '', - '', - False, - False, - False, - False, - False, - False, - False, + &out_tzoffset, False, &iso_info, ) if not string_to_dts_failed: if dts.ps != 0 or out_local: @@ -933,26 +938,115 @@ class _timelex: _DATEUTIL_LEXER_SPLIT = _timelex.split +cpdef ISOInfo null_iso_info(): + return ISOInfo( + format=''.encode('utf-8'), + format_len=0, + date_sep=''.encode('utf-8'), + time_sep=''.encode('utf-8'), + micro_or_tz=''.encode('utf-8'), + year=False, + month=False, + day=False, + hour=False, + minute=False, + second=False, + exact=False, + ) -def format_is_iso(f: str) -> bint: +def format_is_iso(f: str, bint exact) -> ISOInfo: """ Does format match the iso8601 set that can be handled by the C parser? Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different but must be consistent. Leading 0s in dates and times are optional. 
""" - iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}'.format - excluded_formats = ['%Y%m%d', '%Y%m', '%Y'] - - for date_sep in [' ', '/', '\\', '-', '.', '']: - for time_sep in [' ', 'T']: - for micro_or_tz in ['', '%z', '%Z', '.%f', '.%f%z', '.%f%Z']: - if (iso_template(date_sep=date_sep, - time_sep=time_sep, - micro_or_tz=micro_or_tz, - ).startswith(f) and f not in excluded_formats): - return True - return False - + excluded_formats = ["%Y%m%d", "%Y%m", "%Y"] + + cdef ISOInfo null_info + + if f in excluded_formats: + return null_iso_info() + + for date_sep in [" ", "/", "\\", "-", ".", ""]: + for time_sep in [" ", "T"]: + for micro_or_tz in ["", "%z", "%Z", ".%f", ".%f%z", ".%f%Z"]: + if f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}" == f: + return ISOInfo( + format=f.encode("utf-8"), + format_len=len(f), + date_sep=date_sep.encode("utf-8"), + time_sep=time_sep.encode("utf-8"), + micro_or_tz=micro_or_tz.encode("utf-8"), + year=True, + month=True, + day=True, + hour=True, + minute=True, + second=True, + exact=exact, + ) + elif f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M" == f: + return ISOInfo( + format=f.encode("utf-8"), + format_len=len(f), + date_sep=date_sep.encode("utf-8"), + time_sep=time_sep.encode("utf-8"), + micro_or_tz=micro_or_tz.encode("utf-8"), + year=True, + month=True, + day=True, + hour=True, + minute=True, + second=False, + exact=exact, + ) + elif f"%Y{date_sep}%m{date_sep}%d{time_sep}%H" == f: + return ISOInfo( + format=f.encode("utf-8"), + format_len=len(f), + date_sep=date_sep.encode("utf-8"), + time_sep=time_sep.encode("utf-8"), + micro_or_tz=micro_or_tz.encode("utf-8"), + year=True, + month=True, + day=True, + hour=True, + minute=False, + second=False, + exact=exact, + ) + elif f"%Y{date_sep}%m{date_sep}%d" == f: + return ISOInfo( + format=f.encode("utf-8"), + format_len=len(f), + date_sep=date_sep.encode("utf-8"), + time_sep=time_sep.encode("utf-8"), + micro_or_tz=micro_or_tz.encode("utf-8"), + 
year=True, + month=True, + day=True, + hour=False, + minute=False, + second=False, + exact=exact, + ) + elif f"%Y{date_sep}%m" == f: + return ISOInfo( + format=f.encode("utf-8"), + format_len=len(f), + date_sep=date_sep.encode("utf-8"), + time_sep=time_sep.encode("utf-8"), + micro_or_tz=micro_or_tz.encode("utf-8"), + year=True, + month=True, + day=False, + hour=False, + minute=False, + second=False, + exact=exact, + ) + + return null_iso_info() def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: """ diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index f5977b8066ef0..60063e3432d4d 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -70,17 +70,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, - int format_length, - const char *date_sep, - const char *time_sep, - const char *micro_or_tz, - int year, - int month, - int day, - int hour, - int minute, - int second, - int exact) { + ISOInfo *iso_info) { int year_leap = 0; int i, numdigits; const char *substr; @@ -147,7 +137,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* If the format contains month but we're already at the end of the string, error */ - if ((format_length > 0) && month && (sublen == 0)) { + if ((iso_info->format_len > 0) && iso_info->month && (sublen == 0)) { goto parse_error; } /* Next character must be a separator, start of month, or end of string */ @@ -170,7 +160,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } has_ymd_sep = 1; ymd_sep = valid_ymd_sep[i]; - if ((format_length > 0) && (ymd_sep != *date_sep)) { + if ((iso_info->format_len > 0) && (ymd_sep != *iso_info->date_sep)) { goto parse_error; } ++substr; @@ -185,7 +175,7 @@ int 
parse_iso_8601_datetime(const char *str, int len, int want_exc, /* If the format doesn't contain month, and there's still some string to be parsed, and we're not checking for an exact match, error*/ - if ((format_length > 0) && !month && exact) { + if ((iso_info->format_len > 0) && !iso_info->month && iso_info->exact) { goto parse_error; } /* First digit required */ @@ -208,7 +198,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto error; } - if ((format_length > 0) && day && (sublen == 0)) { + if ((iso_info->format_len > 0) && iso_info->day && (sublen == 0)) { goto parse_error; } /* Next character must be the separator, start of day, or end of string */ @@ -234,7 +224,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE DAY */ - if ((format_length > 0) && !day && exact) { + if ((iso_info->format_len > 0) && !iso_info->day && iso_info->exact) { goto parse_error; } /* First digit required */ @@ -261,7 +251,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto error; } - if ((format_length > 0) && hour && (sublen == 0)) { + if ((iso_info->format_len > 0) && iso_info->hour && (sublen == 0)) { goto parse_error; } /* Next character must be a 'T', ' ', or end of string */ @@ -273,7 +263,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto finish; } - if ((format_length > 0) && (*substr != *time_sep)) { + if ((iso_info->format_len > 0) && (*substr != *iso_info->time_sep)) { goto parse_error; } else if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; @@ -282,7 +272,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; /* PARSE THE HOURS */ - if ((format_length > 0) && !hour && exact) { + if ((iso_info->format_len > 0) && !iso_info->hour && iso_info->exact) { goto parse_error; } /* First digit required */ @@ -308,7 +298,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } } - if 
((format_length > 0) && minute && (sublen == 0)) { + if ((iso_info->format_len > 0) && iso_info->minute && (sublen == 0)) { goto parse_error; } /* Next character must be a ':' or the end of the string */ @@ -336,7 +326,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MINUTES */ - if ((format_length > 0) && !minute && exact) { + if ((iso_info->format_len > 0) && !iso_info->minute && iso_info->exact) { goto parse_error; } /* First digit required */ @@ -360,7 +350,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, goto parse_error; } - if ((format_length > 0) && second && (sublen == 0)) { + if ((iso_info->format_len > 0) && iso_info->second && (sublen == 0)) { goto parse_error; } if (sublen == 0) { @@ -383,7 +373,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE SECONDS */ - if ((format_length > 0) && !second && exact) { + if ((iso_info->format_len > 0) && !iso_info->second && iso_info->exact) { goto parse_error; } /* First digit required */ diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 7ebf3e981a787..0e9ad256e0707 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -53,23 +53,29 @@ This file implements string parsing and creation for NumPy datetime. * * Returns 0 on success, -1 on failure. 
*/ + +typedef struct { + const char *format; + int format_len; + const char *date_sep; + const char *time_sep; + const char *micro_or_tz; + int year; + int month; + int day; + int hour; + int minute; + int second; + int exact; +} ISOInfo; + int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, - int format_length, - const char *date_sep, - const char *time_sep, - const char *micro_or_tz, - int year, - int month, - int day, - int hour, - int minute, - int second, - int exact); + ISOInfo *iso_info); /* * Provides a string length to use for converting datetime diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 61e8dcadf7dc9..ac76c80a43da1 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -42,6 +42,7 @@ tz_convert_from_utc, tzconversion, ) +from pandas._libs.tslibs.parsing import null_iso_info from pandas._typing import ( DateTimeErrorChoices, IntervalClosedType, @@ -78,7 +79,6 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna -from f import ISO8601Info from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com @@ -2181,7 +2181,7 @@ def objects_to_datetime64ns( require_iso8601: bool = False, allow_object: bool = False, allow_mixed: bool = False, - iso_info=ISO8601Info(), + iso_info=None, exact: bool = False, ): """ @@ -2219,6 +2219,9 @@ def objects_to_datetime64ns( """ assert errors in ["raise", "ignore", "coerce"] + if iso_info is None: + iso_info = null_iso_info() + # if str-dtype, convert data = np.array(data, copy=False, dtype=np.object_) @@ -2227,14 +2230,13 @@ def objects_to_datetime64ns( try: result, tz_parsed = tslib.array_to_datetime( data.ravel("K"), + iso_info=iso_info, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, 
require_iso8601=require_iso8601, allow_mixed=allow_mixed, - **iso_info._asdict(), - exact=exact, ) result = result.reshape(data.shape, order=order) except OverflowError as err: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7607f637cabc6..3d177797eab33 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -34,7 +34,9 @@ ) from pandas._libs.tslibs.parsing import ( # format_is_iso, DateParseError, + format_is_iso, guess_datetime_format, + null_iso_info, ) from pandas._libs.tslibs.strptime import array_strptime from pandas._typing import ( @@ -64,10 +66,6 @@ ) from pandas.core.dtypes.missing import notna -from f import ( - ISO8601Info, - format_is_iso, -) from pandas.arrays import ( DatetimeArray, IntegerArray, @@ -432,12 +430,12 @@ def _convert_listlike_datetimes( format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: - iso_info = format_is_iso(format) + iso_info = format_is_iso(format, exact=exact) require_iso8601 = True else: - iso_info = ISO8601Info() + iso_info = null_iso_info() require_iso8601 = False - if format is not None and not iso_info.format: + if format is not None and not iso_info["format"]: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this From 7ea1acbb9abe8c29830bf6ea6f063fdbc9579b8e Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 20 Oct 2022 15:16:05 +0100 Subject: [PATCH 12/12] remove f.py --- f.py | 92 ------------------------------------------------------------ 1 file changed, 92 deletions(-) delete mode 100644 f.py diff --git a/f.py b/f.py deleted file mode 100644 index e2e151a6271d9..0000000000000 --- a/f.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import NamedTuple - - -class ISO8601Info(NamedTuple): - format: str = b"" - date_sep: str = b"" - time_sep: str = b"" - micro_or_tz: str = b"" - year: bool = False - 
month: bool = False - day: bool = False - hour: bool = False - minute: bool = False - second: bool = False - - -def format_is_iso(f: str): - """ - Does format match the iso8601 set that can be handled by the C parser? - Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different - but must be consistent. Leading 0s in dates and times are optional. - - no, needs doing in c. eff... - """ - excluded_formats = ["%Y%m%d", "%Y%m", "%Y"] - - if f in excluded_formats: - return ISO8601Info() - for date_sep in [" ", "/", "\\", "-", ".", ""]: - for time_sep in [" ", "T"]: - for micro_or_tz in ["", "%z", "%Z", ".%f", ".%f%z", ".%f%Z"]: - if f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}" == f: - return ISO8601Info( - format=f.encode("utf-8"), - date_sep=date_sep.encode("utf-8"), - time_sep=time_sep.encode("utf-8"), - micro_or_tz=micro_or_tz.encode("utf-8"), - year=True, - month=True, - day=True, - hour=True, - minute=True, - second=True, - ) - elif f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M" == f: - return ISO8601Info( - format=f.encode("utf-8"), - date_sep=date_sep.encode("utf-8"), - time_sep=time_sep.encode("utf-8"), - micro_or_tz=micro_or_tz.encode("utf-8"), - year=True, - month=True, - day=True, - hour=True, - minute=True, - ) - elif f"%Y{date_sep}%m{date_sep}%d{time_sep}%H" == f: - return ISO8601Info( - format=f.encode("utf-8"), - date_sep=date_sep.encode("utf-8"), - time_sep=time_sep.encode("utf-8"), - micro_or_tz=micro_or_tz.encode("utf-8"), - year=True, - month=True, - day=True, - hour=True, - ) - elif f"%Y{date_sep}%m{date_sep}%d" == f: - return ISO8601Info( - format=f.encode("utf-8"), - date_sep=date_sep.encode("utf-8"), - time_sep=time_sep.encode("utf-8"), - micro_or_tz=micro_or_tz.encode("utf-8"), - year=True, - month=True, - day=True, - ) - elif f"%Y{date_sep}%m" == f: - return ISO8601Info( - format=f.encode("utf-8"), - date_sep=date_sep.encode("utf-8"), - time_sep=time_sep.encode("utf-8"), - micro_or_tz=micro_or_tz.encode("utf-8"), - 
year=True, - month=True, - ) - return ISO8601Info() - - -if __name__ == "__main__": - print(format_is_iso("%Y-%m-%d %H:%M:%S%z")) -# print(format_is_iso('%Y%m%d %H'))