From ea796690dc3bce718cd3dafb78e7bf6bb2612149 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 08:41:08 +0100 Subject: [PATCH 01/34] :wastebasket: deprecate infer_datetime_format, make strict --- pandas/core/tools/datetimes.py | 92 ++++++++++++-------------------- pandas/io/parsers/base_parser.py | 5 -- pandas/io/parsers/readers.py | 39 +++++++------- 3 files changed, 53 insertions(+), 83 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7791ea804a52a..5760952ba7324 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -19,7 +19,10 @@ import numpy as np -from pandas._libs import tslib +from pandas._libs import ( + lib, + tslib, +) from pandas._libs.tslibs import ( OutOfBoundsDatetime, Timedelta, @@ -331,7 +334,6 @@ def _convert_listlike_datetimes( tz: Timezone | None = None, unit: str | None = None, errors: DateTimeErrorChoices = "raise", - infer_datetime_format: bool = False, dayfirst: bool | None = None, yearfirst: bool | None = None, exact: bool = True, @@ -415,27 +417,19 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) require_iso8601 = False - if infer_datetime_format and format is None: + if format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) + # There is a special fast-path for iso8601 formatted + # datetime strings, so in those cases don't use the inferred + # format because this path makes process slower in this + # special case + if format is not None and format_is_iso(format): + require_iso8601 = True + format = None if format is not None: - # There is a special fast-path for iso8601 formatted - # datetime strings, so in those cases don't use the inferred - # format because this path makes process slower in this - # special case - format_is_iso8601 = format_is_iso(format) - if format_is_iso8601: - require_iso8601 = not infer_datetime_format - format = None - - if format is not None: - res = _to_datetime_with_format( - arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format - ) - if res is not None: - return res + return _to_datetime_with_format(arg, orig_arg, name, tz, format, exact, errors) - assert format is None or infer_datetime_format utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( arg, @@ -464,8 +458,7 @@ def _array_strptime_with_fallback( fmt: str, exact: bool, errors: str, - infer_datetime_format: bool, -) -> Index | None: +) -> Index: """ Call array_strptime, with fallback behavior depending on 'errors'. """ @@ -486,18 +479,14 @@ def _array_strptime_with_fallback( # if fmt was inferred, try falling back # to array_to_datetime - terminate here # for specified formats - if not infer_datetime_format: - if errors == "raise": - raise - elif errors == "coerce": - result = np.empty(arg.shape, dtype="M8[ns]") - iresult = result.view("i8") - iresult.fill(iNaT) - else: - result = arg + if errors == "raise": + raise + elif errors == "coerce": + result = np.empty(arg.shape, dtype="M8[ns]") + iresult = result.view("i8") + iresult.fill(iNaT) else: - # Indicates to the caller to fallback to objects_to_datetime64ns - return None + result = arg else: if "%Z" in fmt or "%z" in fmt: return _return_parsed_timezone_results(result, timezones, tz, name) @@ -513,10 +502,9 @@ def _to_datetime_with_format( fmt: str, exact: bool, errors: str, - infer_datetime_format: bool, -) -> Index | None: +) -> Index: """ - Try parsing with the given format, returning None on failure. + Try parsing with the given format. """ result = None @@ -537,9 +525,7 @@ def _to_datetime_with_format( return _box_as_indexlike(result, utc=utc, name=name) # fallback - res = _array_strptime_with_fallback( - arg, name, tz, fmt, exact, errors, infer_datetime_format - ) + res = _array_strptime_with_fallback(arg, name, tz, fmt, exact, errors) return res @@ -713,7 +699,7 @@ def to_datetime( format: str | None = None, exact: bool = True, unit: str | None = None, - infer_datetime_format: bool = False, + infer_datetime_format: lib.NoDefault | bool = lib.no_default, origin: str = "unix", cache: bool = True, ) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None: @@ -926,24 +912,6 @@ def to_datetime( 1 2016-03-05 dtype: datetime64[ns] - Passing ``infer_datetime_format=True`` can often-times speedup a parsing - if its not an ISO8601 format exactly, but in a regular format. - - >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000) - >>> s.head() - 0 3/11/2000 - 1 3/12/2000 - 2 3/13/2000 - 3 3/11/2000 - 4 3/12/2000 - dtype: object - - >>> %timeit pd.to_datetime(s, infer_datetime_format=True) # doctest: +SKIP - 100 loops, best of 3: 10.4 ms per loop - - >>> %timeit pd.to_datetime(s, infer_datetime_format=False) # doctest: +SKIP - 1 loop, best of 3: 471 ms per loop - Using a unix epoch time >>> pd.to_datetime(1490195805, unit='s') @@ -1060,6 +1028,15 @@ def to_datetime( '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) """ + if infer_datetime_format is not lib.no_default: + warnings.warn( + "The argument 'infer_datetime_format' is deprecated and will " + "be removed in a future version. " + "A strict version of it is now the default, see " + "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " + "You can safely remove this argument.", + stacklevel=find_stack_level(), + ) if arg is None: return None @@ -1075,7 +1052,6 @@ def to_datetime( yearfirst=yearfirst, errors=errors, exact=exact, - infer_datetime_format=infer_datetime_format, ) result: Timestamp | NaTType | Series | Index diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 45f6469a31f4f..5080c15153ced 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -122,13 +122,11 @@ def __init__(self, kwds) -> None: self.true_values = kwds.get("true_values") self.false_values = kwds.get("false_values") self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True) - self.infer_datetime_format = kwds.pop("infer_datetime_format", False) self.cache_dates = kwds.pop("cache_dates", True) self._date_conv = _make_date_converter( date_parser=self.date_parser, dayfirst=self.dayfirst, - infer_datetime_format=self.infer_datetime_format, cache_dates=self.cache_dates, ) @@ -1105,7 +1103,6 @@ def _get_empty_meta( def _make_date_converter( date_parser=None, dayfirst: bool = False, - infer_datetime_format: bool = False, cache_dates: bool = True, ): def converter(*date_cols): @@ -1118,7 +1115,6 @@ def converter(*date_cols): utc=None, dayfirst=dayfirst, errors="ignore", - infer_datetime_format=infer_datetime_format, cache=cache_dates, ).to_numpy() @@ -1188,7 +1184,6 @@ def converter(*date_cols): "squeeze": None, "compression": None, "mangle_dupe_cols": True, - "infer_datetime_format": False, "skip_blank_lines": True, "encoding_errors": "strict", "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR, diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c1698c68ce465..6ed73bd1de1e8 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -262,11 +262,6 @@ :ref:`io.csv.mixed_timezones` for more. Note: A fast-path exists for iso8601-formatted dates. -infer_datetime_format : bool, default False - If True and `parse_dates` is enabled, pandas will attempt to infer the - format of the datetime strings in the columns, and if it can be inferred, - switch to a faster method of parsing them. In some cases this can increase - the parsing speed by 5-10x. keep_date_col : bool, default False If True and `parse_dates` specifies combining multiple columns then keep the original columns. @@ -483,7 +478,6 @@ "decimal", "iterator", "dayfirst", - "infer_datetime_format", "verbose", "skipinitialspace", "low_memory", @@ -648,7 +642,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -709,7 +703,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -770,7 +764,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -831,7 +825,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -905,7 +899,7 @@ def read_csv( skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, - infer_datetime_format: bool = False, + infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = False, date_parser=None, dayfirst: bool = False, @@ -940,6 +934,15 @@ def read_csv( storage_options: StorageOptions = None, use_nullable_dtypes: bool = False, ) -> DataFrame | TextFileReader: + if infer_datetime_format is not lib.no_default: + warnings.warn( + "The argument 'infer_datetime_format' is deprecated and will " + "be removed in a future version. " + "A strict version of it is now the default, see " + "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " + "You can safely remove this argument.", + stacklevel=find_stack_level(), + ) # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -992,7 +995,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -1053,7 +1056,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -1114,7 +1117,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -1175,7 +1178,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -1249,7 +1252,7 @@ def read_table( skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] = False, - infer_datetime_format: bool = False, + infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = False, date_parser=None, dayfirst: bool = False, @@ -1883,10 +1886,6 @@ def TextParser(*args, **kwds) -> TextFileReader: Encoding to use for UTF when reading/writing (ex. 'utf-8') squeeze : bool, default False returns Series if only one column. - infer_datetime_format: bool, default False - If True and `parse_dates` is True for a column, try to infer the - datetime format based on the first datetime string. If the format - can be inferred, there often will be a large parsing speed-up. float_precision : str, optional Specifies which converter the C engine should use for floating-point values. The options are `None` or `high` for the ordinary converter, From bb68cc3526abbb60009e5fcaab23897e28769376 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 08:43:02 +0100 Subject: [PATCH 02/34] :rotating_light: add warning about dayfirst --- pandas/_libs/tslibs/parsing.pyx | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 5c93edfee79f2..74de3502b73de 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1088,6 +1088,7 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: # rebuild string, capturing any inferred padding dt_str = ''.join(tokens) if parsed_datetime.strftime(guessed_format) == dt_str: + _maybe_warn_about_dayfirst(guessed_format, dayfirst) return guessed_format else: return None @@ -1106,6 +1107,26 @@ cdef str _fill_token(token: str, padding: int): token_filled = f'{seconds}.{nanoseconds}' return token_filled +cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst): + """Warn if guessed datetime format doesn't respect dayfirst argument.""" + cdef: + int day_index = format.find('%d') + int month_index = format.find('%m') + + if (day_index != -1) and (month_index != -1): + if (day_index > month_index) and dayfirst: + warnings.warn( + f"Parsing dates in {format} format when dayfirst=True was specified. " + f"Pass `dayfirst=False` or specify a format to silence this warning.", + stacklevel=find_stack_level(), + ) + if (day_index < month_index) and not dayfirst: + warnings.warn( + f"Parsing dates in {format} format when dayfirst=False was specified. " + f"Pass `dayfirst=True` or specify a format to silence this warning.", + stacklevel=find_stack_level(), + ) + @cython.wraparound(False) @cython.boundscheck(False) cdef inline object convert_to_unicode(object item, bint keep_trivial_numbers): From 82266f45d2014cb85d1ff08e53e1de4dcdac32a7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 08:48:37 +0100 Subject: [PATCH 03/34] :white_check_mark: add/update tests --- pandas/tests/apply/test_frame_apply.py | 3 +- pandas/tests/frame/methods/test_drop.py | 10 +- pandas/tests/frame/methods/test_to_csv.py | 10 +- .../indexes/datetimes/test_constructors.py | 12 +- pandas/tests/indexes/test_base.py | 12 +- .../io/parser/common/test_common_basic.py | 4 +- pandas/tests/io/parser/test_parse_dates.py | 101 ++++-------- .../io/parser/usecols/test_parse_dates.py | 8 +- pandas/tests/io/test_sql.py | 4 +- pandas/tests/io/xml/test_xml_dtypes.py | 2 +- pandas/tests/plotting/test_converter.py | 4 +- pandas/tests/series/methods/test_to_csv.py | 6 +- pandas/tests/tools/test_to_datetime.py | 148 +++++++----------- pandas/tests/tslibs/test_parsing.py | 18 +-- 14 files changed, 138 insertions(+), 204 deletions(-) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3bcb7d964fad1..28a9871b76985 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -836,7 +836,8 @@ def test_with_dictlike_columns_with_datetime(): df["author"] = ["X", "Y", "Z"] df["publisher"] = ["BBC", "NBC", "N24"] df["date"] = pd.to_datetime( - ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"] + ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"], + dayfirst=True, ) result = df.apply(lambda x: {}, axis=1) expected = Series([{}, {}, {}]) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 6e5b97af7c297..1b295fd10c9d5 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -405,11 +405,11 @@ def test_drop_level_nonunique_datetime(self): idx = Index([2, 3, 4, 4, 5], name="id") idxdt = pd.to_datetime( [ - "201603231400", - "201603231500", - "201603231600", - "201603231600", - "201603231700", + "2016-03-23 14:00", + "2016-03-23 15:00", + "2016-03-23 16:00", + "2016-03-23 16:00", + "2016-03-23 17:00", ] ) df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 1933278efb443..3b4dec8bff7f1 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -27,7 +27,7 @@ class TestDataFrameToCSV: def read_csv(self, path, **kwargs): - params = {"index_col": 0, "parse_dates": True} + params = {"index_col": 0} params.update(**kwargs) return read_csv(path, **params) @@ -46,17 +46,17 @@ def test_to_csv_from_csv1(self, float_frame, datetime_frame): # freq does not roundtrip datetime_frame.index = datetime_frame.index._with_freq(None) datetime_frame.to_csv(path) - recons = self.read_csv(path) + recons = self.read_csv(path, parse_dates=True) tm.assert_frame_equal(datetime_frame, recons) datetime_frame.to_csv(path, index_label="index") - recons = self.read_csv(path, index_col=None) + recons = self.read_csv(path, index_col=None, parse_dates=True) assert len(recons.columns) == len(datetime_frame.columns) + 1 # no index datetime_frame.to_csv(path, index=False) - recons = self.read_csv(path, index_col=None) + recons = self.read_csv(path, index_col=None, parse_dates=True) tm.assert_almost_equal(datetime_frame.values, recons.values) # corner case @@ -1056,7 +1056,7 @@ def test_to_csv_date_format(self, datetime_frame): # test NaTs nat_index = to_datetime( - ["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"] + ["NaT"] * 10 + ["2000-01-01", "2000-01-01", "2000-01-01"] ) nat_frame = DataFrame({"A": nat_index}, index=nat_index) nat_frame.to_csv(path, date_format="%Y-%m-%d") diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 9914f4357cee4..c1039728f5b5e 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1042,10 +1042,18 @@ def test_datetimeindex_constructor_misc(self): arr = np.array(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"], dtype="O") idx4 = DatetimeIndex(arr) - arr = to_datetime(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"]) + # Can't be parsed consistently, need to parse each element individually + arr = [ + to_datetime(date_string) + for date_string in ["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"] + ] idx5 = DatetimeIndex(arr) - arr = to_datetime(["1/1/2005", "1/2/2005", "Jan 3, 2005", "2005-01-04"]) + # Can't be parsed consistently, need to parse each element individually + arr = [ + to_datetime(date_string) + for date_string in ["1/1/2005", "1/2/2005", "Jan 3, 2005", "2005-01-04"] + ] idx6 = DatetimeIndex(arr) idx7 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 4b0821a50e09b..56ef410b4d94e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1185,10 +1185,16 @@ def test_equals_op_index_vs_mi_same_length(self): expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("dt_conv", [pd.to_datetime, pd.to_timedelta]) - def test_dt_conversion_preserves_name(self, dt_conv): + @pytest.mark.parametrize( + "dt_conv, arg", + [ + (pd.to_datetime, ["2000-01-01", "2000-01-02"]), + (pd.to_timedelta, ["01:02:03", "01:02:04"]), + ], + ) + def test_dt_conversion_preserves_name(self, dt_conv, arg): # GH 10875 - index = Index(["01:02:03", "01:02:04"], name="label") + index = Index(arg, name="label") assert index.name == dt_conv(index).name def test_cached_properties_not_settable(self): diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 52d8abe76ecbc..e7c4066b13640 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -58,8 +58,8 @@ def _set_noconvert_columns(self): return CParserWrapper._set_noconvert_columns(self) data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" +0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" parse_dates = [[1, 2]] cols = { diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 9c8809b6099f9..b8d515a67b7fe 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1666,9 +1666,9 @@ def test_parse_delimited_date_swap_no_warning( @pytest.mark.parametrize( "date_string,dayfirst,expected", [ - # %d/%m/%Y; month > 12 thus replacement + # %d/%m/%Y; month > 12 ("13/02/2019", False, datetime(2019, 2, 13)), - # %m/%d/%Y; day > 12 thus there will be no replacement + # %m/%d/%Y; day > 12 ("02/13/2019", True, datetime(2019, 2, 13)), ], ) @@ -1677,7 +1677,10 @@ def test_parse_delimited_date_swap_with_warning( ): parser = all_parsers expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") - warning_msg = "Specify a format to ensure consistent parsing" + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." + ) result = parser.read_csv_check_warnings( UserWarning, warning_msg, @@ -1691,13 +1694,11 @@ def test_parse_delimited_date_swap_with_warning( def test_parse_multiple_delimited_dates_with_swap_warnings(): # GH46210 - warning_msg = "Specify a format to ensure consistent parsing" - with tm.assert_produces_warning(UserWarning, match=warning_msg) as record: + with pytest.raises( + ValueError, + match=r"^time data '31/05/2000' does not match format '%m/%d/%Y' \(match\)$", + ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) - assert len({str(warning.message) for warning in record}) == 1 - # Using set(record) as repetitions of the same warning are suppressed - # https://docs.python.org/3/library/warnings.html - # and here we care to check that the warning is only shows once to users. def _helper_hypothesis_delimited_date(call, date_string, **kwargs): @@ -1860,97 +1861,51 @@ def test_parse_dates_and_keep_orgin_column(all_parsers): def test_dayfirst_warnings(): # GH 12585 - warning_msg_day_first = ( - r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) was " - r"specified. This may lead to inconsistently parsed dates! Specify a format " - r"to ensure consistent parsing." - ) - warning_msg_month_first = ( - "Parsing dates in MM/DD/YYYY format when dayfirst=True was " - "specified. This may lead to inconsistently parsed dates! Specify a format " - "to ensure consistent parsing." - ) # CASE 1: valid input input = "date\n31/12/2014\n10/03/2011" - expected_consistent = DatetimeIndex( + expected = DatetimeIndex( ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" ) - expected_inconsistent = DatetimeIndex( - ["2014-12-31", "2011-10-03"], dtype="datetime64[ns]", freq=None, name="date" + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." ) # A. dayfirst arg correct, no warning res1 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" ).index - tm.assert_index_equal(expected_consistent, res1) + tm.assert_index_equal(expected, res1) - # B. dayfirst arg incorrect, warning + incorrect output - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + # B. dayfirst arg incorrect, warning + with tm.assert_produces_warning(UserWarning, match=warning_msg): res2 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" ).index - tm.assert_index_equal(expected_inconsistent, res2) - - # C. dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res3 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" - ).index - tm.assert_index_equal(expected_inconsistent, res3) - - # D. infer_datetime_format=True overrides dayfirst default - # no warning + correct result - res4 = read_csv( - StringIO(input), - parse_dates=["date"], - infer_datetime_format=True, - index_col="date", - ).index - tm.assert_index_equal(expected_consistent, res4) + tm.assert_index_equal(expected, res2) # CASE 2: invalid input # cannot consistently process with single format - # warnings *always* raised + # return to user unaltered # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" - expected = DatetimeIndex( - ["2014-12-31", "2011-03-30"], dtype="datetime64[ns]", freq=None, name="date" - ) + expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") # A. use dayfirst=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_month_first): - res5 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" - ).index + res5 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" + ).index tm.assert_index_equal(expected, res5) # B. use dayfirst=False - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + with tm.assert_produces_warning(UserWarning, match=warning_msg): res6 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" ).index tm.assert_index_equal(expected, res6) - # C. use dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res7 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" - ).index - tm.assert_index_equal(expected, res7) - - # D. use infer_datetime_format=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res8 = read_csv( - StringIO(input), - parse_dates=["date"], - infer_datetime_format=True, - index_col="date", - ).index - tm.assert_index_equal(expected, res8) - @pytest.mark.parametrize( "date_string, dayfirst", @@ -1973,9 +1928,11 @@ def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): expected = DatetimeIndex( ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" ) - with tm.assert_produces_warning( - UserWarning, match=r"may lead to inconsistently parsed dates" - ): + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." + ) + with tm.assert_produces_warning(UserWarning, match=warning_msg): res = read_csv( StringIO(initial_value), parse_dates=["date"], diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 50000dab8a7aa..6d40435a4107e 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -31,8 +31,8 @@ def test_usecols_with_parse_dates(all_parsers, usecols): # see gh-9755 data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" +0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" parser = all_parsers parse_dates = [[1, 2]] @@ -138,8 +138,8 @@ def test_usecols_with_parse_dates4(all_parsers): ) def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): # see gh-9755 - s = """0,1,20140101,0900,4 -0,1,20140102,1000,4""" + s = """0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" parse_dates = [[1, 2]] parser = all_parsers diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9adada8afb2c2..129d6f89fd019 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1386,7 +1386,7 @@ def test_sqlalchemy_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) df = DataFrame( - {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} ) db = sql.SQLDatabase(self.conn) table = sql.SQLTable("test_type", db, frame=df) @@ -1595,7 +1595,7 @@ def test_sqlite_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) df = DataFrame( - {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} ) db = sql.SQLiteDatabase(self.conn) table = sql.SQLiteTable("test_type", db, frame=df) diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 5629830767c3c..7b2ffbc7cda5e 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -457,7 +457,7 @@ def test_day_first_parse_dates(parser): ) with tm.assert_produces_warning( - UserWarning, match="Parsing dates in DD/MM/YYYY format" + UserWarning, match="Parsing dates in %d/%m/%Y format" ): df_result = read_xml(xml, parse_dates=["date"], parser=parser) df_iter = read_xml_iterparse( diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 9a6fed1afad1f..87d5aaf0c3205 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -161,8 +161,8 @@ def dtc(self): return converter.DatetimeConverter() def test_convert_accepts_unicode(self, dtc): - r1 = dtc.convert("12:22", None, None) - r2 = dtc.convert("12:22", None, None) + r1 = dtc.convert("2000-01-01 12:22", None, None) + r2 = dtc.convert("2000-01-01 12:22", None, None) assert r1 == r2, "DatetimeConverter.convert should accept unicode" def test_conversion(self, dtc): diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index 28519fc9b529f..7827483644634 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -13,7 +13,7 @@ class TestSeriesToCSV: def read_csv(self, path, **kwargs): - params = {"index_col": 0, "header": None, "parse_dates": True} + params = {"index_col": 0, "header": None} params.update(**kwargs) header = params.get("header") @@ -30,7 +30,7 @@ def test_from_csv(self, datetime_series, string_series): with tm.ensure_clean() as path: datetime_series.to_csv(path, header=False) - ts = self.read_csv(path) + ts = self.read_csv(path, parse_dates=True) tm.assert_series_equal(datetime_series, ts, check_names=False) assert ts.name is None @@ -55,7 +55,7 @@ def test_from_csv(self, datetime_series, string_series): with open(path, "w") as outfile: outfile.write("1998-01-01|1.0\n1999-01-01|2.0") - series = self.read_csv(path, sep="|") + series = self.read_csv(path, sep="|", parse_dates=True) check_series = Series( {datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0} ) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f524bc18793d8..286036440073f 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -219,7 +219,6 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected): ), (["201010", pd.NA], "%y%m%d", DatetimeIndex(["2020-10-10", "NaT"])), (["201010", pd.NA], "%d%m%y", DatetimeIndex(["2010-10-20", "NaT"])), - (["201010", pd.NA], None, DatetimeIndex(["2010-10-20", "NaT"])), ([None, np.nan, pd.NA], None, DatetimeIndex(["NaT", "NaT", "NaT"])), ([None, np.nan, pd.NA], "%Y%m%d", DatetimeIndex(["NaT", "NaT", "NaT"])), ], @@ -463,14 +462,14 @@ def test_to_datetime_parse_timezone_keeps_name(self): class TestToDatetime: def test_to_datetime_mixed_datetime_and_string(self): # GH#47018 adapted old doctest with new behavior - d1 = datetime(2020, 1, 1, 17, tzinfo=timezone(-timedelta(hours=1))) d2 = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) - res = to_datetime(["2020-01-01 17:00 -0100", d2]) - expected = to_datetime([d1, d2]).tz_convert(pytz.FixedOffset(-60)) - tm.assert_index_equal(res, expected) + with pytest.raises( + ValueError, + match=r"time data '.*' does not match format '%Y-%m-%d %H:%M %z' \(match\)", + ): + to_datetime(["2020-01-01 17:00 -0100", d2]) - @pytest.mark.parametrize("infer_datetime_format", [True, False]) - def test_to_datetime_np_str(self, infer_datetime_format): + def test_to_datetime_np_str(self): # GH#32264 # GH#48969 value = np.str_("2019-02-04 10:18:46.297000+0000") @@ -482,11 +481,11 @@ def test_to_datetime_np_str(self, infer_datetime_format): assert to_datetime(value) == exp assert to_datetime(ser.iloc[0]) == exp - res = to_datetime([value], infer_datetime_format=infer_datetime_format) + res = to_datetime([value]) expected = Index([exp]) tm.assert_index_equal(res, expected) - res = to_datetime(ser, infer_datetime_format=infer_datetime_format) + res = to_datetime(ser) expected = Series(expected) tm.assert_series_equal(res, expected) @@ -927,7 +926,10 @@ def test_datetime_bool_arrays_mixed(self, cache): msg = f"{type(cache)} is not convertible to datetime" with pytest.raises(TypeError, match=msg): to_datetime([False, datetime.today()], cache=cache) - with pytest.raises(TypeError, match=msg): + with pytest.raises( + ValueError, + match=r"^time data 'True' does not match format '%Y%m%d' \(match\)$", + ): to_datetime(["20130101", True], cache=cache) tm.assert_index_equal( to_datetime([0, False, NaT, 0.0], errors="coerce", cache=cache), @@ -1071,8 +1073,7 @@ def test_to_datetime_cache_scalar(self): (None,) + (NaT,) * start_caching_at + ("2012 July 26", Timestamp("2012-07-26")), - (NaT,) * (start_caching_at + 1) - + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), + (NaT,) * (start_caching_at + 1) + (Timestamp("2012-07-26"), NaT), ), ), ) @@ -1153,7 +1154,6 @@ def test_to_datetime_coerce(self): ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("infer_datetime_format", [True, False]) @pytest.mark.parametrize( "errors, expected", [ @@ -1224,15 +1224,18 @@ def test_iso_8601_strings_with_different_offsets_utc(self): def test_iso8601_strings_mixed_offsets_with_naive(self): # GH 24992 - result = to_datetime( + # Can't parse consistently, need to parse each element in loop. + result = DatetimeIndex( [ - "2018-11-28T00:00:00", - "2018-11-28T00:00:00+12:00", - "2018-11-28T00:00:00", - "2018-11-28T00:00:00+06:00", - "2018-11-28T00:00:00", - ], - utc=True, + to_datetime(string, utc=True) + for string in [ + "2018-11-28T00:00:00", + "2018-11-28T00:00:00+12:00", + "2018-11-28T00:00:00", + "2018-11-28T00:00:00+06:00", + "2018-11-28T00:00:00", + ] + ] ) expected = to_datetime( [ @@ -1248,9 +1251,10 @@ def test_iso8601_strings_mixed_offsets_with_naive(self): def test_iso8601_strings_mixed_offsets_with_naive_reversed(self): items = ["2018-11-28T00:00:00+12:00", "2018-11-28T00:00:00"] - result = to_datetime(items, utc=True) - expected = to_datetime(list(reversed(items)), utc=True)[::-1] - tm.assert_index_equal(result, expected) + # Can't parse consistently, need to parse each element in loop. + result = [to_datetime(item, utc=True) for item in items] + expected = [to_datetime(item, utc=True) for item in list(reversed(items))][::-1] + assert result == expected def test_mixed_offsets_with_native_datetime_raises(self): # GH 25978 @@ -1778,7 +1782,7 @@ def test_to_datetime_on_datetime64_series(self, cache): def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) - msg = r"(\(')?String does not contain a date(:', ' '\))?" + msg = r"^time data ' ' does not match format '%m/%d/%Y' \(match\)$" with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) @@ -1838,7 +1842,7 @@ def test_to_datetime_strings(self, cache): def test_to_datetime_strings_variation(self, cache): array = ["2012", "20120101", "20120101 12:01:01"] - expected = list(to_datetime(array, cache=cache)) + expected = [to_datetime(dt_str, cache=cache) for dt_str in array] result = [Timestamp(date_str) for date_str in array] tm.assert_almost_equal(result, expected) @@ -1908,7 +1912,10 @@ def test_string_na_nat_conversion(self, cache): result = tslib.array_to_datetime(strings)[0] tm.assert_almost_equal(result, expected) - result2 = to_datetime(strings, cache=cache) + # Can't parse in consistent format, so need to convert each individually. + result2 = DatetimeIndex( + [to_datetime(string, cache=cache) for string in strings] + ) assert isinstance(result2, DatetimeIndex) tm.assert_numpy_array_equal(result, result2.values) @@ -2011,80 +2018,39 @@ def test_dayfirst(self, cache): def test_dayfirst_warnings_valid_input(self): # GH 12585 - warning_msg_day_first = ( - r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) " - "was specified. This may lead to inconsistently parsed dates! Specify a " - "format to ensure consistent parsing." + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." ) # CASE 1: valid input arr = ["31/12/2014", "10/03/2011"] - expected_consistent = DatetimeIndex( + expected = DatetimeIndex( ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None ) - expected_inconsistent = DatetimeIndex( - ["2014-12-31", "2011-10-03"], dtype="datetime64[ns]", freq=None - ) # A. dayfirst arg correct, no warning res1 = to_datetime(arr, dayfirst=True) - tm.assert_index_equal(expected_consistent, res1) + tm.assert_index_equal(expected, res1) - # B. dayfirst arg incorrect, warning + incorrect output - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + # B. dayfirst arg incorrect, warning + with tm.assert_produces_warning(UserWarning, match=warning_msg): res2 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected_inconsistent, res2) - - # C. dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res3 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected_inconsistent, res3) - - # D. infer_datetime_format=True overrides dayfirst default - # no warning + correct result - res4 = to_datetime(arr, infer_datetime_format=True) - tm.assert_index_equal(expected_consistent, res4) + tm.assert_index_equal(expected, res2) def test_dayfirst_warnings_invalid_input(self): # CASE 2: invalid input # cannot consistently process with single format - # warnings *always* raised - warning_msg_day_first = ( - r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) " - "was specified. This may lead to inconsistently parsed dates! Specify a " - "format to ensure consistent parsing." - ) - warning_msg_month_first = ( - r"Parsing dates in MM/DD/YYYY format when dayfirst=True " - "was specified. This may lead to inconsistently parsed dates! Specify a " - "format to ensure consistent parsing." - ) + # ValueError *always* raised - arr = ["31/12/2014", "03/30/2011"] # first in DD/MM/YYYY, second in MM/DD/YYYY - expected = DatetimeIndex( - ["2014-12-31", "2011-03-30"], dtype="datetime64[ns]", freq=None - ) - - # A. use dayfirst=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_month_first): - res5 = to_datetime(arr, dayfirst=True) - tm.assert_index_equal(expected, res5) - - # B. use dayfirst=False - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res6 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected, res6) - - # C. use dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res7 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected, res7) + arr = ["31/12/2014", "03/30/2011"] - # D. use infer_datetime_format=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res8 = to_datetime(arr, infer_datetime_format=True) - tm.assert_index_equal(expected, res8) + with pytest.raises( + ValueError, + match=r"time data '03/30/2011' does not match format '%d/%m/%Y' \(match\)$", + ): + to_datetime(arr, dayfirst=True) @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray]) def test_to_datetime_dta_tz(self, klass): @@ -2139,12 +2105,8 @@ def test_to_datetime_infer_datetime_format_consistent_format( s_as_dt_strings = ser.apply(lambda x: x.strftime(test_format)) with_format = to_datetime(s_as_dt_strings, format=test_format, cache=cache) - no_infer = to_datetime( - s_as_dt_strings, infer_datetime_format=False, cache=cache - ) - yes_infer = to_datetime( - s_as_dt_strings, infer_datetime_format=True, cache=cache - ) + no_infer = to_datetime(s_as_dt_strings, cache=cache) + yes_infer = to_datetime(s_as_dt_strings, cache=cache) # Whether the format is explicitly passed, it is inferred, or # it is not inferred, the results should all be the same @@ -2223,7 +2185,7 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): def test_infer_datetime_format_zero_tz(self, ts, zero_tz): # GH 41047 ser = Series([ts + zero_tz]) - result = to_datetime(ser, infer_datetime_format=True) + result = to_datetime(ser) tz = pytz.utc if zero_tz == "Z" else None expected = Series([Timestamp(ts, tz=tz)]) tm.assert_series_equal(result, expected) @@ -2782,9 +2744,9 @@ def test_empty_string_datetime_coerce_format(): with pytest.raises(ValueError, match="does not match format"): to_datetime(td, format=format, errors="raise") - # don't raise an exception in case no format is given - result = to_datetime(td, errors="raise") - tm.assert_series_equal(result, expected) + # still raise an exception in case no format is given + with pytest.raises(ValueError, match="does not match format"): + to_datetime(td, errors="raise") def test_empty_string_datetime_coerce__unit(): diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index a4e12315d34e0..49d83a8fa5c56 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -235,19 +235,19 @@ def test_guess_datetime_format_wrong_type_inputs(invalid_type_dt): @pytest.mark.parametrize( - "string,fmt", + "string,fmt,dayfirst", [ - ("2011-1-1", "%Y-%m-%d"), - ("1/1/2011", "%m/%d/%Y"), - ("30-1-2011", "%d-%m-%Y"), - ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"), - ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S"), - ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S"), + ("2011-1-1", "%Y-%m-%d", False), + ("1/1/2011", "%m/%d/%Y", False), + ("30-1-2011", "%d-%m-%Y", True), + ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S", False), + ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S", False), + ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S", False), ], ) -def test_guess_datetime_format_no_padding(string, fmt): +def test_guess_datetime_format_no_padding(string, fmt, dayfirst): # see gh-11142 - result = parsing.guess_datetime_format(string) + result = parsing.guess_datetime_format(string, dayfirst=dayfirst) assert result == fmt From 4a6f19856f674dfdd3b5cc8248548ab121786801 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 09:16:00 +0100 Subject: [PATCH 04/34] :rotating_light: add warning if format cant be guessed --- pandas/core/tools/datetimes.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 5760952ba7324..09729c2aab22c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -132,7 +132,16 @@ def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str if (first_non_null := tslib.first_non_null(arr)) != -1: if type(first_non_nan_element := arr[first_non_null]) is str: # GH#32264 np.str_ object - return guess_datetime_format(first_non_nan_element, dayfirst=dayfirst) + guessed_format = guess_datetime_format( + first_non_nan_element, dayfirst=dayfirst + ) + if guessed_format is not None: + return guessed_format + warnings.warn( + "Could not infer format - " + "to ensure consistent parsing, specify a format.", + stacklevel=find_stack_level(), + ) return None From 5568dca44d1f7a5267dcbfdcf843a5222f088258 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 09:22:00 +0100 Subject: [PATCH 05/34] :goal_net: catch warnings --- pandas/core/tools/datetimes.py | 2 +- pandas/tests/frame/methods/test_to_csv.py | 5 +- pandas/tests/groupby/test_function.py | 3 +- .../tests/groupby/transform/test_transform.py | 3 +- pandas/tests/io/excel/test_readers.py | 13 +- pandas/tests/io/parser/test_parse_dates.py | 98 ++++- .../io/parser/usecols/test_parse_dates.py | 8 +- pandas/tests/test_algos.py | 3 +- pandas/tests/tools/test_to_datetime.py | 391 +++++++++--------- 9 files changed, 309 insertions(+), 217 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 09729c2aab22c..41feb153978d4 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1002,7 +1002,7 @@ def to_datetime( are constant: >>> from datetime import datetime - >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)]) + >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)]) DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 3b4dec8bff7f1..3985bd40daac5 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -514,7 +514,10 @@ def test_to_csv_multiindex(self, float_frame, datetime_frame): tsframe.index = MultiIndex.from_arrays(new_index) tsframe.to_csv(path, index_label=["time", "foo"]) - recons = self.read_csv(path, index_col=[0, 1]) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + recons = self.read_csv(path, index_col=[0, 1], parse_dates=True) # TODO to_csv drops column name tm.assert_frame_equal(tsframe, recons, check_names=False) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index cdbb121819c5e..ed63d41a74ae6 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -717,7 +717,8 @@ def test_max_nan_bug(): -05-06,2013-05-06 00:00:00,,log.log -05-07,2013-05-07 00:00:00,OE,xlsx""" - df = pd.read_csv(StringIO(raw), parse_dates=[0]) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + df = pd.read_csv(StringIO(raw), parse_dates=[0]) gb = df.groupby("Date") r = gb[["File"]].max() e = gb["File"].max().to_frame() diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 8a2bd64a3deb0..d52de4d0658ef 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1070,7 +1070,8 @@ def demean_rename(x): @pytest.mark.parametrize("func", [min, max, np.min, np.max, "first", "last"]) def test_groupby_transform_timezone_column(func): # GH 24198 - ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") result = DataFrame({"end_time": [ts], "id": [1]}) result["max_end_time"] = result.groupby("id").end_time.transform(func) expected = DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"]) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index fa1d6bbfd5a7e..8f937ad6b401a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -888,11 +888,18 @@ def test_reader_seconds(self, request, engine, read_ext): ] } ) - - actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1") + if engine == "odf": + # odf recognises cell type as time (from its attribute) + # so tries to parse it. + warning = UserWarning + else: + warning = None + with tm.assert_produces_warning(warning, match="Could not infer format"): + actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) - actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") + with tm.assert_produces_warning(warning, match="Could not infer format"): + actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex(self, request, read_ext): diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index b8d515a67b7fe..c3feb03936686 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -826,7 +826,13 @@ def test_yy_format_with_year_first(all_parsers, parse_dates): 090331,0830,5,6 """ parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=0, parse_dates=parse_dates) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + index_col=0, + parse_dates=parse_dates, + ) index = DatetimeIndex( [ datetime(2009, 1, 31, 0, 10, 0), @@ -899,7 +905,13 @@ def test_multi_index_parse_dates(all_parsers, index_col): columns=["A", "B", "C"], index=index, ) - result = parser.read_csv(StringIO(data), index_col=index_col, parse_dates=True) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + index_col=index_col, + parse_dates=True, + ) tm.assert_frame_equal(result, expected) @@ -1232,19 +1244,55 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): @pytest.mark.parametrize("cache_dates", [True, False]) -@pytest.mark.parametrize("value", ["nan", "0", ""]) +@pytest.mark.parametrize("value", ["nan", ""]) def test_bad_date_parse(all_parsers, cache_dates, value): # if we have an invalid date make sure that we handle this with # and w/o the cache properly parser = all_parsers s = StringIO((f"{value},\n") * 50000) - parser.read_csv( + if parser.engine == "pyarrow": + # None in input gets converted to 'None', for which + # pandas tries to guess the datetime format, triggering + # the warning. TODO: parse dates directly in pyarrow, see + # https://github.com/pandas-dev/pandas/issues/48017 + warn = UserWarning + else: + warn = None + parser.read_csv_check_warnings( + warn, + "Could not infer format", + s, + header=None, + names=["foo", "bar"], + parse_dates=["foo"], + cache_dates=cache_dates, + ) + + +@pytest.mark.parametrize("cache_dates", [True, False]) +@pytest.mark.parametrize("value", ["0"]) +def test_bad_date_parse_with_warning(all_parsers, cache_dates, value): + # if we have an invalid date make sure that we handle this with + # and w/o the cache properly. + parser = all_parsers + s = StringIO((f"{value},\n") * 50000) + + if parser.engine == "pyarrow": + # pyarrow reads "0" as 0 (of type int64), and so + # pandas doesn't try to guess the datetime format + # TODO: parse dates directly in pyarrow, see + # https://github.com/pandas-dev/pandas/issues/48017 + warn = None + else: + warn = UserWarning + parser.read_csv_check_warnings( + warn, + "Could not infer format", s, header=None, names=["foo", "bar"], parse_dates=["foo"], - infer_datetime_format=False, cache_dates=cache_dates, ) @@ -1262,6 +1310,19 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) +def test_parse_dates_infer_datetime_format_warning(all_parsers): + # GH 49024 + parser = all_parsers + data = "Date,test\n2012-01-01,1\n,2" + parser.read_csv_check_warnings( + UserWarning, + "The argument 'infer_datetime_format' is deprecated", + StringIO(data), + parse_dates=["Date"], + infer_datetime_format=True, + ) + + @xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", @@ -1635,7 +1696,13 @@ def test_parse_timezone(all_parsers): def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers expected = DataFrame({0: [date_string]}, dtype="object") - result = parser.read_csv(StringIO(date_string), header=None, parse_dates=[0]) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(date_string), + header=None, + parse_dates=[0], + ) tm.assert_frame_equal(result, expected) @@ -1786,7 +1853,13 @@ def test_date_parser_and_names(all_parsers): # GH#33699 parser = all_parsers data = StringIO("""x,y\n1,2""") - result = parser.read_csv(data, parse_dates=["B"], names=["B"]) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + data, + parse_dates=["B"], + names=["B"], + ) expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"]) tm.assert_frame_equal(result, expected) @@ -1833,7 +1906,9 @@ def test_date_parser_usecols_thousands(all_parsers): """ parser = all_parsers - result = parser.read_csv( + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", StringIO(data), parse_dates=[1], usecols=[1, 2], @@ -1947,7 +2022,12 @@ def test_infer_first_column_as_index(all_parsers): # GH#11019 parser = all_parsers data = "a,b,c\n1970-01-01,2,3,4" - result = parser.read_csv(StringIO(data), parse_dates=["a"]) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + parse_dates=["a"], + ) expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 6d40435a4107e..4823df1da9959 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -124,7 +124,13 @@ def test_usecols_with_parse_dates4(all_parsers): } expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + usecols=usecols, + parse_dates=parse_dates, + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 80271c13cd35d..b3f0f40be2d78 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1212,7 +1212,8 @@ def test_value_counts_datetime_outofbounds(self): tm.assert_series_equal(res, exp) # GH 12424 - res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") exp = Series(["2362-01-01", np.nan], dtype=object) tm.assert_series_equal(res, exp) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 286036440073f..a2871e79dc7d9 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -228,6 +228,13 @@ def test_to_datetime_with_NA(self, data, format, expected): result = to_datetime(data, format=format) tm.assert_index_equal(result, expected) + def test_to_datetime_with_NA_with_warning(self): + # GH#42957 + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result = to_datetime(["201010", pd.NA]) + expected = DatetimeIndex(["2010-10-20", "NaT"]) + tm.assert_index_equal(result, expected) + def test_to_datetime_format_integer(self, cache): # GH 10178 ser = Series([2000, 2001, 2002]) @@ -345,7 +352,6 @@ def test_to_datetime_with_non_exact(self, cache): ], ) def test_parse_nanoseconds_with_formula(self, cache, arg): - # GH8989 # truncating the nanoseconds when a format was provided expected = to_datetime(arg, cache=cache) @@ -619,15 +625,16 @@ def test_to_datetime_YYYYMMDD(self): def test_to_datetime_unparsable_ignore(self): # unparsable ser = "Month 1, 1999" - assert to_datetime(ser, errors="ignore") == ser + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + assert to_datetime(ser, errors="ignore") == ser @td.skip_if_windows # `tm.set_timezone` does not work in windows def test_to_datetime_now(self): # See GH#18666 with tm.set_timezone("US/Eastern"): - msg = "The parsing of 'now' in pd.to_datetime" + msg = "The parsing of 'now' in pd.to_datetime|Could not infer format" with tm.assert_produces_warning( - FutureWarning, match=msg, check_stacklevel=False + (FutureWarning, UserWarning), match=msg, check_stacklevel=False ): # checking stacklevel is tricky because we go through cython code # GH#18705 @@ -654,8 +661,11 @@ def test_to_datetime_today(self, tz): # so this test will not detect the regression introduced in #18666. with tm.set_timezone(tz): nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) - pdtoday = to_datetime("today") - pdtoday2 = to_datetime(["today"])[0] + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + pdtoday = to_datetime("today") + pdtoday2 = to_datetime(["today"])[0] tstoday = Timestamp("today") tstoday2 = Timestamp.today() @@ -672,8 +682,8 @@ def test_to_datetime_today(self, tz): @pytest.mark.parametrize("arg", ["now", "today"]) def test_to_datetime_today_now_unicode_bytes(self, arg): - warn = FutureWarning if arg == "now" else None - msg = "The parsing of 'now' in pd.to_datetime" + warn = (FutureWarning, UserWarning) if arg == "now" else UserWarning + msg = "The parsing of 'now' in pd.to_datetime|Could not infer format" with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False): # checking stacklevel is tricky because we go through cython code # GH#18705 @@ -946,18 +956,17 @@ def test_datetime_invalid_datatype(self, arg): to_datetime(arg) @pytest.mark.parametrize("value", ["a", "00:01:99"]) - @pytest.mark.parametrize("infer", [True, False]) - @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) - def test_datetime_invalid_scalar(self, value, format, infer): + @pytest.mark.parametrize( + "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + ) + def test_datetime_invalid_scalar(self, value, format, warning): # GH24763 - res = to_datetime( - value, errors="ignore", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="ignore", format=format) assert res == value - res = to_datetime( - value, errors="coerce", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="coerce", format=format) assert res is NaT msg = ( @@ -966,51 +975,46 @@ def test_datetime_invalid_scalar(self, value, format, infer): f"Given date string {value} not likely a datetime" ) with pytest.raises(ValueError, match=msg): - to_datetime( - value, errors="raise", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + to_datetime(value, errors="raise", format=format) @pytest.mark.parametrize("value", ["3000/12/11 00:00:00"]) - @pytest.mark.parametrize("infer", [True, False]) - @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) - def test_datetime_outofbounds_scalar(self, value, format, infer): + @pytest.mark.parametrize( + "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + ) + def test_datetime_outofbounds_scalar(self, value, format, warning): # GH24763 - res = to_datetime( - value, errors="ignore", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="ignore", format=format) assert res == value - res = to_datetime( - value, errors="coerce", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="coerce", format=format) assert res is NaT if format is not None: msg = "is a bad directive in format|Out of bounds .* present at position 0" with pytest.raises(ValueError, match=msg): - to_datetime( - value, errors="raise", format=format, infer_datetime_format=infer - ) + to_datetime(value, errors="raise", format=format) else: msg = "Out of bounds .* present at position 0" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime( - value, errors="raise", format=format, infer_datetime_format=infer - ) + with pytest.raises( + OutOfBoundsDatetime, match=msg + ), tm.assert_produces_warning(warning, match="Could not infer format"): + to_datetime(value, errors="raise", format=format) @pytest.mark.parametrize("values", [["a"], ["00:01:99"], ["a", "b", "99:00:00"]]) - @pytest.mark.parametrize("infer", [True, False]) - @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) - def test_datetime_invalid_index(self, values, format, infer): + @pytest.mark.parametrize( + "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + ) + def test_datetime_invalid_index(self, values, format, warning): # GH24763 - res = to_datetime( - values, errors="ignore", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(values, errors="ignore", format=format) tm.assert_index_equal(res, Index(values)) - res = to_datetime( - values, errors="coerce", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(values, errors="coerce", format=format) tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values))) msg = ( @@ -1019,9 +1023,8 @@ def test_datetime_invalid_index(self, values, format, infer): "second must be in 0..59" ) with pytest.raises(ValueError, match=msg): - to_datetime( - values, errors="raise", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + to_datetime(values, errors="raise", format=format) @pytest.mark.parametrize("utc", [True, None]) @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) @@ -1161,28 +1164,28 @@ def test_to_datetime_coerce(self): ("ignore", Index(["200622-12-31", "111111-24-11"])), ], ) - def test_to_datetime_malformed_no_raise( - self, errors, expected, infer_datetime_format - ): + def test_to_datetime_malformed_no_raise(self, errors, expected): # GH 28299 # GH 48633 ts_strings = ["200622-12-31", "111111-24-11"] - result = to_datetime( - ts_strings, errors=errors, infer_datetime_format=infer_datetime_format - ) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result = to_datetime(ts_strings, errors=errors) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("infer_datetime_format", [True, False]) - def test_to_datetime_malformed_raise(self, infer_datetime_format): + def test_to_datetime_malformed_raise(self): # GH 48633 ts_strings = ["200622-12-31", "111111-24-11"] with pytest.raises( ValueError, match=r"^hour must be in 0\.\.23: 111111-24-11 present at position 1$", ): - to_datetime( - ts_strings, errors="raise", infer_datetime_format=infer_datetime_format - ) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime( + ts_strings, + errors="raise", + ) def test_iso_8601_strings_with_same_offset(self): # GH 17697, 11736 @@ -1283,7 +1286,10 @@ def test_mixed_offsets_with_native_datetime_raises(self): tm.assert_series_equal(mixed, expected) with pytest.raises(ValueError, match="Tz-aware datetime.datetime"): - to_datetime(mixed) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(mixed) def test_non_iso_strings_with_tz_offset(self): result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) @@ -1409,23 +1415,26 @@ def test_unit_with_numeric(self, cache, errors, dtype): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "exp, arr", + "exp, arr, warning", [ [ ["NaT", "2015-06-19 05:33:20", "2015-05-27 22:33:20"], ["foo", 1.434692e18, 1.432766e18], + UserWarning, ], [ ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"], [1.434692e18, 1.432766e18, "foo", "NaT"], + None, ], ], ) - def test_unit_with_numeric_coerce(self, cache, exp, arr): + def test_unit_with_numeric_coerce(self, cache, exp, arr, warning): # but we want to make sure that we are coercing # if we have ints/strings expected = DatetimeIndex(exp) - result = to_datetime(arr, errors="coerce", cache=cache) + with tm.assert_produces_warning(warning, match="Could not infer format"): + result = to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1741,7 +1750,10 @@ def test_to_datetime_barely_out_of_bounds(self): msg = "Out of bounds .* present at position 0" with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(arr) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(arr) @pytest.mark.parametrize( "arg, exp_str", @@ -1925,15 +1937,22 @@ def test_string_na_nat_conversion_malformed(self, cache): # GH 10636, default is now 'raise' msg = r"Unknown string format:|day is out of range for month" with pytest.raises(ValueError, match=msg): - to_datetime(malformed, errors="raise", cache=cache) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(malformed, errors="raise", cache=cache) - result = to_datetime(malformed, errors="ignore", cache=cache) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result = to_datetime(malformed, errors="ignore", cache=cache) # GH 21864 expected = Index(malformed) tm.assert_index_equal(result, expected) with pytest.raises(ValueError, match=msg): - to_datetime(malformed, errors="raise", cache=cache) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(malformed, errors="raise", cache=cache) def test_string_na_nat_conversion_with_name(self, cache): idx = ["a", "b", "c", "d", "e"] @@ -2114,60 +2133,14 @@ def test_to_datetime_infer_datetime_format_consistent_format( tm.assert_series_equal(no_infer, yes_infer) @pytest.mark.parametrize( - "data", - [ - ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"], - ["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"], - ], + "tz_name, offset, warning", + [("UTC", 0, None), ("UTC-3", 180, UserWarning), ("UTC+3", -180, UserWarning)], ) - def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache, data): - ser = Series(np.array(data)) - - # When the format is inconsistent, infer_datetime_format should just - # fallback to the default parsing - tm.assert_series_equal( - to_datetime(ser, infer_datetime_format=False, cache=cache), - to_datetime(ser, infer_datetime_format=True, cache=cache), - ) - - def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): - ser = Series( - np.array( - ["01/01/2011 00:00:00", np.nan, "01/03/2011 00:00:00", np.nan], - dtype=object, - ) - ) - tm.assert_series_equal( - to_datetime(ser, infer_datetime_format=False, cache=cache), - to_datetime(ser, infer_datetime_format=True, cache=cache), - ) - - def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): - ser = Series( - np.array( - [ - np.nan, - np.nan, - "01/01/2011 00:00:00", - "01/02/2011 00:00:00", - "01/03/2011 00:00:00", - ], - dtype=object, - ) - ) - - tm.assert_series_equal( - to_datetime(ser, infer_datetime_format=False, cache=cache), - to_datetime(ser, infer_datetime_format=True, cache=cache), - ) - - @pytest.mark.parametrize( - "tz_name, offset", [("UTC", 0), ("UTC-3", 180), ("UTC+3", -180)] - ) - def test_infer_datetime_format_tz_name(self, tz_name, offset): + def test_infer_datetime_format_tz_name(self, tz_name, offset, warning): # GH 33133 ser = Series([f"2019-02-02 08:07:13 {tz_name}"]) - result = to_datetime(ser, infer_datetime_format=True) + with tm.assert_produces_warning(warning, match="Could not infer format"): + result = to_datetime(ser) expected = Series( [Timestamp("2019-02-02 08:07:13").tz_localize(pytz.FixedOffset(offset))] ) @@ -2203,26 +2176,38 @@ def test_to_datetime_iso8601_noleading_0s(self, cache, format): ) tm.assert_series_equal(to_datetime(ser, format=format, cache=cache), expected) + def test_parse_dates_infer_datetime_format_warning(self): + # GH 49024 + with tm.assert_produces_warning( + UserWarning, + match="The argument 'infer_datetime_format' is deprecated", + ): + to_datetime(["10-10-2000"], infer_datetime_format=True) + class TestDaysInMonth: # tests for issue #10154 @pytest.mark.parametrize( - "arg, format", + "arg, format, warning", [ - ["2015-02-29", None], - ["2015-02-29", "%Y-%m-%d"], - ["2015-02-32", "%Y-%m-%d"], - ["2015-04-31", "%Y-%m-%d"], + ["2015-02-29", None, UserWarning], + ["2015-02-29", "%Y-%m-%d", None], + ["2015-02-32", "%Y-%m-%d", None], + ["2015-04-31", "%Y-%m-%d", None], ], ) - def test_day_not_in_month_coerce(self, cache, arg, format): - assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache)) + def test_day_not_in_month_coerce(self, cache, arg, format, warning): + with tm.assert_produces_warning(warning, match="Could not infer format"): + assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache)) def test_day_not_in_month_raise(self, cache): msg = "day is out of range for month" with pytest.raises(ValueError, match=msg): - to_datetime("2015-02-29", errors="raise", cache=cache) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime("2015-02-29", errors="raise", cache=cache) @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) def test_day_not_in_month_raise_value(self, cache, arg): @@ -2231,85 +2216,85 @@ def test_day_not_in_month_raise_value(self, cache, arg): to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache) @pytest.mark.parametrize( - "expected, format", + "expected, format, warning", [ - ["2015-02-29", None], - ["2015-02-29", "%Y-%m-%d"], - ["2015-02-29", "%Y-%m-%d"], - ["2015-04-31", "%Y-%m-%d"], + ["2015-02-29", None, UserWarning], + ["2015-02-29", "%Y-%m-%d", None], + ["2015-02-29", "%Y-%m-%d", None], + ["2015-04-31", "%Y-%m-%d", None], ], ) - def test_day_not_in_month_ignore(self, cache, expected, format): - result = to_datetime(expected, errors="ignore", format=format, cache=cache) + def test_day_not_in_month_ignore(self, cache, expected, format, warning): + with tm.assert_produces_warning(warning, match="Could not infer format"): + result = to_datetime(expected, errors="ignore", format=format, cache=cache) assert result == expected class TestDatetimeParsingWrappers: @pytest.mark.parametrize( - "date_str,expected", - list( - { - "2011-01-01": datetime(2011, 1, 1), - "2Q2005": datetime(2005, 4, 1), - "2Q05": datetime(2005, 4, 1), - "2005Q1": datetime(2005, 1, 1), - "05Q1": datetime(2005, 1, 1), - "2011Q3": datetime(2011, 7, 1), - "11Q3": datetime(2011, 7, 1), - "3Q2011": datetime(2011, 7, 1), - "3Q11": datetime(2011, 7, 1), - # quarterly without space - "2000Q4": datetime(2000, 10, 1), - "00Q4": datetime(2000, 10, 1), - "4Q2000": datetime(2000, 10, 1), - "4Q00": datetime(2000, 10, 1), - "2000q4": datetime(2000, 10, 1), - "2000-Q4": datetime(2000, 10, 1), - "00-Q4": datetime(2000, 10, 1), - "4Q-2000": datetime(2000, 10, 1), - "4Q-00": datetime(2000, 10, 1), - "00q4": datetime(2000, 10, 1), - "2005": datetime(2005, 1, 1), - "2005-11": datetime(2005, 11, 1), - "2005 11": datetime(2005, 11, 1), - "11-2005": datetime(2005, 11, 1), - "11 2005": datetime(2005, 11, 1), - "200511": datetime(2020, 5, 11), - "20051109": datetime(2005, 11, 9), - "20051109 10:15": datetime(2005, 11, 9, 10, 15), - "20051109 08H": datetime(2005, 11, 9, 8, 0), - "2005-11-09 10:15": datetime(2005, 11, 9, 10, 15), - "2005-11-09 08H": datetime(2005, 11, 9, 8, 0), - "2005/11/09 10:15": datetime(2005, 11, 9, 10, 15), - "2005/11/09 08H": datetime(2005, 11, 9, 8, 0), - "Thu Sep 25 10:36:28 2003": datetime(2003, 9, 25, 10, 36, 28), - "Thu Sep 25 2003": datetime(2003, 9, 25), - "Sep 25 2003": datetime(2003, 9, 25), - "January 1 2014": datetime(2014, 1, 1), - # GHE10537 - "2014-06": datetime(2014, 6, 1), - "06-2014": datetime(2014, 6, 1), - "2014-6": datetime(2014, 6, 1), - "6-2014": datetime(2014, 6, 1), - "20010101 12": datetime(2001, 1, 1, 12), - "20010101 1234": datetime(2001, 1, 1, 12, 34), - "20010101 123456": datetime(2001, 1, 1, 12, 34, 56), - }.items() - ), + "date_str, expected, warning", + [ + ("2011-01-01", datetime(2011, 1, 1), None), + ("2Q2005", datetime(2005, 4, 1), UserWarning), + ("2Q05", datetime(2005, 4, 1), UserWarning), + ("2005Q1", datetime(2005, 1, 1), UserWarning), + ("05Q1", datetime(2005, 1, 1), UserWarning), + ("2011Q3", datetime(2011, 7, 1), UserWarning), + ("11Q3", datetime(2011, 7, 1), UserWarning), + ("3Q2011", datetime(2011, 7, 1), UserWarning), + ("3Q11", datetime(2011, 7, 1), UserWarning), + # quarterly without space + ("2000Q4", datetime(2000, 10, 1), UserWarning), + ("00Q4", datetime(2000, 10, 1), UserWarning), + ("4Q2000", datetime(2000, 10, 1), UserWarning), + ("4Q00", datetime(2000, 10, 1), UserWarning), + ("2000q4", datetime(2000, 10, 1), UserWarning), + ("2000-Q4", datetime(2000, 10, 1), UserWarning), + ("00-Q4", datetime(2000, 10, 1), UserWarning), + ("4Q-2000", datetime(2000, 10, 1), UserWarning), + ("4Q-00", datetime(2000, 10, 1), UserWarning), + ("00q4", datetime(2000, 10, 1), UserWarning), + ("2005", datetime(2005, 1, 1), None), + ("2005-11", datetime(2005, 11, 1), UserWarning), + ("2005 11", datetime(2005, 11, 1), UserWarning), + ("11-2005", datetime(2005, 11, 1), UserWarning), + ("11 2005", datetime(2005, 11, 1), UserWarning), + ("200511", datetime(2020, 5, 11), UserWarning), + ("20051109", datetime(2005, 11, 9), None), + ("20051109 10:15", datetime(2005, 11, 9, 10, 15), None), + ("20051109 08H", datetime(2005, 11, 9, 8, 0), None), + ("2005-11-09 10:15", datetime(2005, 11, 9, 10, 15), None), + ("2005-11-09 08H", datetime(2005, 11, 9, 8, 0), None), + ("2005/11/09 10:15", datetime(2005, 11, 9, 10, 15), None), + ("2005/11/09 08H", datetime(2005, 11, 9, 8, 0), None), + ("Thu Sep 25 10:36:28 2003", datetime(2003, 9, 25, 10, 36, 28), None), + ("Thu Sep 25 2003", datetime(2003, 9, 25), None), + ("Sep 25 2003", datetime(2003, 9, 25), None), + ("January 1 2014", datetime(2014, 1, 1), None), + # GHE10537 + ("2014-06", datetime(2014, 6, 1), UserWarning), + ("06-2014", datetime(2014, 6, 1), UserWarning), + ("2014-6", datetime(2014, 6, 1), UserWarning), + ("6-2014", datetime(2014, 6, 1), UserWarning), + ("20010101 12", datetime(2001, 1, 1, 12), None), + ("20010101 1234", datetime(2001, 1, 1, 12, 34), UserWarning), + ("20010101 123456", datetime(2001, 1, 1, 12, 34, 56), UserWarning), + ], ) - def test_parsers(self, date_str, expected, cache): + def test_parsers(self, date_str, expected, warning, cache): # dateutil >= 2.5.0 defaults to yearfirst=True # https://github.com/dateutil/dateutil/issues/217 yearfirst = True result1, _ = parsing.parse_time_string(date_str, yearfirst=yearfirst) - result2 = to_datetime(date_str, yearfirst=yearfirst) - result3 = to_datetime([date_str], yearfirst=yearfirst) - # result5 is used below - result4 = to_datetime( - np.array([date_str], dtype=object), yearfirst=yearfirst, cache=cache - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + result2 = to_datetime(date_str, yearfirst=yearfirst) + result3 = to_datetime([date_str], yearfirst=yearfirst) + # result5 is used below + result4 = to_datetime( + np.array([date_str], dtype=object), yearfirst=yearfirst, cache=cache + ) result6 = DatetimeIndex([date_str], yearfirst=yearfirst) # result7 is used below result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) @@ -2418,9 +2403,10 @@ def test_parsers_dayfirst_yearfirst( result2 = Timestamp(date_str) assert result2 == expected - result3 = to_datetime( - date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache - ) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result3 = to_datetime( + date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache + ) result4 = DatetimeIndex([date_str], dayfirst=dayfirst, yearfirst=yearfirst)[0] @@ -2437,8 +2423,9 @@ def test_parsers_timestring(self, date_str, exp_def): exp_now = parse(date_str) result1, _ = parsing.parse_time_string(date_str) - result2 = to_datetime(date_str) - result3 = to_datetime([date_str]) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result2 = to_datetime(date_str) + result3 = to_datetime([date_str]) result4 = Timestamp(date_str) result5 = DatetimeIndex([date_str])[0] # parse time string return time string based on default date @@ -2602,17 +2589,23 @@ def test_incorrect_value_exception(self): with pytest.raises( ValueError, match="Unknown string format: yesterday present at position 1" ): - to_datetime(["today", "yesterday"]) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(["today", "yesterday"]) - @pytest.mark.parametrize("format", [None, "%Y-%m-%d %H:%M:%S"]) - def test_to_datetime_out_of_bounds_with_format_arg(self, format): + @pytest.mark.parametrize( + "format, warning", [(None, UserWarning), ("%Y-%m-%d %H:%M:%S", None)] + ) + def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 msg = ( "Out of bounds nanosecond timestamp: 2417-10-27 00:00:00 " "present at position 0" ) with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime("2417-10-27 00:00:00", format=format) + with tm.assert_produces_warning(warning, match="Could not infer format"): + to_datetime("2417-10-27 00:00:00", format=format) @pytest.mark.parametrize( "arg, origin, expected_str", From bc910b06eb71e4adc103ca7ee2fb952cd68175d0 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 18 Oct 2022 09:35:49 +0100 Subject: [PATCH 06/34] :memo: update docs --- doc/source/user_guide/basics.rst | 2 ++ doc/source/user_guide/io.rst | 31 +++++--------------------- doc/source/user_guide/timeseries.rst | 27 +++++++---------------- doc/source/whatsnew/v2.0.0.rst | 33 ++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 44 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index a34d4891b9d77..92fae28d3bdb3 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -2313,6 +2313,7 @@ useful if you are reading in data which is mostly of the desired dtype (e.g. num non-conforming elements intermixed that you want to represent as missing: .. ipython:: python + :okwarning: import datetime @@ -2329,6 +2330,7 @@ The ``errors`` parameter has a third option of ``errors='ignore'``, which will s encounters any errors with the conversion to a desired data type: .. ipython:: python + :okwarning: import datetime diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index cc01270181202..d6e67cd638a7b 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1009,41 +1009,22 @@ To parse the mixed-timezone values as a datetime column, pass a partially-applie Inferring datetime format +++++++++++++++++++++++++ -If you have ``parse_dates`` enabled for some or all of your columns, and your -datetime strings are all formatted the same way, you may get a large speed -up by setting ``infer_datetime_format=True``. If set, pandas will attempt -to guess the format of your datetime strings, and then use a faster means -of parsing the strings. 5-10x parsing speeds have been observed. pandas -will fallback to the usual parsing if either the format cannot be guessed -or the format that was guessed cannot properly parse the entire column -of strings. So in general, ``infer_datetime_format`` should not have any -negative consequences if enabled. - -Here are some examples of datetime strings that can be guessed (All -representing December 30th, 2011 at 00:00:00): - -* "20111230" -* "2011/12/30" -* "20111230 00:00:00" -* "12/30/2011 00:00:00" -* "30/Dec/2011 00:00:00" -* "30/December/2011 00:00:00" - -Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With -``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With -``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th. +If you try to parse a column of date strings, pandas will attempt to guess the format +from the first non-NaN element, and will then parse the rest of the column with that +format. .. ipython:: python - # Try to infer the format for the index column df = pd.read_csv( "foo.csv", index_col=0, parse_dates=True, - infer_datetime_format=True, ) df +In the case that you have mixed datetime formats within the same column, you'll need to +first read it in the file, and then apply :func:`to_datetime` to each element. + .. ipython:: python :suppress: diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 474068e43a4d4..2710a22ec6161 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -13,17 +13,6 @@ a tremendous amount of new functionality for manipulating time series data. For example, pandas supports: -Parsing time series information from various sources and formats - -.. ipython:: python - - import datetime - - dti = pd.to_datetime( - ["1/1/2018", np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1)] - ) - dti - Generate sequences of fixed-frequency dates and time spans .. ipython:: python @@ -132,6 +121,8 @@ time. .. ipython:: python + import datetime + pd.Timestamp(datetime.datetime(2012, 5, 1)) pd.Timestamp("2012-05-01") pd.Timestamp(2012, 5, 1) @@ -196,26 +187,24 @@ is converted to a ``DatetimeIndex``: .. ipython:: python - pd.to_datetime(pd.Series(["Jul 31, 2009", "2010-01-10", None])) + pd.to_datetime(pd.Series(["Jul 31, 2009", "Jan 10, 2010", None])) - pd.to_datetime(["2005/11/23", "2010.12.31"]) + pd.to_datetime(["2005/11/23", "2010/12/31"]) If you use dates which start with the day first (i.e. European style), you can pass the ``dayfirst`` flag: .. ipython:: python - :okwarning: + :okwarning: pd.to_datetime(["04-01-2012 10:00"], dayfirst=True) - - pd.to_datetime(["14-01-2012", "01-14-2012"], dayfirst=True) + pd.to_datetime(["04-14-2012 10:00"], dayfirst=True) .. warning:: You see in the above example that ``dayfirst`` isn't strict. If a date can't be parsed with the day being first it will be parsed as if - ``dayfirst`` were False, and in the case of parsing delimited date strings - (e.g. ``31-12-2012``) then a warning will also be raised. + ``dayfirst`` were False and a warning will also be raised. If you pass a single string to ``to_datetime``, it returns a single ``Timestamp``. ``Timestamp`` can also accept string input, but it doesn't accept string parsing @@ -768,7 +757,7 @@ partially matching dates: rng2 = pd.date_range("2011-01-01", "2012-01-01", freq="W") ts2 = pd.Series(np.random.randn(len(rng2)), index=rng2) - ts2.truncate(before="2011-11", after="2011-12") + ts2.truncate(before="2011-11-01", after="2011-12-01") ts2["2011-11":"2011-12"] Even complicated fancy indexing that breaks the ``DatetimeIndex`` frequency diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index a0a7aa94bd287..915a91d71e9eb 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -114,6 +114,39 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. +Datetimes are now parsed with a consistent format +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`to_datetime` now parses dates with a consistent format, which is guessed from the first non-NA value +(unless ``format`` is specified). Previously, it would've guessed the format for each element individually. + +*Old behavior*: + + .. code-block:: ipython + + In [1]: ser = pd.Series(['13-01-2000', '12-01-2000']) + In [2]: pd.to_datetime(ser) + Out[2]: + 0 2000-01-13 + 1 2000-12-01 + dtype: datetime64[ns] + +*New behavior*: + + .. ipython:: python + :okwarning: + + ser = pd.Series(['13-01-2000', '12-01-2000']) + pd.to_datetime(ser) + +Note that this affects :func:`read_csv` as well. + +If you still need to parse dates with inconsistent formats, you'll need to apply :func:`to_datetime` +to each element individually, e.g. :: + + ser = pd.Series(['13-01-2000', '12 January 2000']) + ser.apply(pd.to_datetime) + .. _whatsnew_200.api_breaking.other: Other API changes From 7d03503198bd45acd63a236b4175603055362cf6 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 19 Oct 2022 10:15:01 +0100 Subject: [PATCH 07/34] :memo: add example of reading csv file with mixed formats --- doc/source/user_guide/io.rst | 9 ++++++++- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/_libs/tslibs/parsing.pyx | 4 ++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d6e67cd638a7b..844cd70f4866c 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1023,7 +1023,14 @@ format. df In the case that you have mixed datetime formats within the same column, you'll need to -first read it in the file, and then apply :func:`to_datetime` to each element. +first read it in as an object dtype and then apply :func:`to_datetime` to each element. + +.. ipython:: python + + data = io.StringIO("date\n12 Jan 2000\n2000-01-13\n") + df = pd.read_csv(data) + df['date'] = df['date'].apply(pd.to_datetime) + df .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 915a91d71e9eb..95a0a93838216 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -118,7 +118,7 @@ Datetimes are now parsed with a consistent format ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`to_datetime` now parses dates with a consistent format, which is guessed from the first non-NA value -(unless ``format`` is specified). Previously, it would've guessed the format for each element individually. +(unless ``format`` is specified). Previously, it would have guessed the format for each element individually. *Old behavior*: diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 74de3502b73de..c9df9146240da 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1117,13 +1117,13 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst): if (day_index > month_index) and dayfirst: warnings.warn( f"Parsing dates in {format} format when dayfirst=True was specified. " - f"Pass `dayfirst=False` or specify a format to silence this warning.", + "Pass `dayfirst=False` or specify a format to silence this warning.", stacklevel=find_stack_level(), ) if (day_index < month_index) and not dayfirst: warnings.warn( f"Parsing dates in {format} format when dayfirst=False was specified. " - f"Pass `dayfirst=True` or specify a format to silence this warning.", + "Pass `dayfirst=True` or specify a format to silence this warning.", stacklevel=find_stack_level(), ) From ac825f5dc33e16873b01110591f89b26d1e8ed8a Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 19 Oct 2022 10:24:42 +0100 Subject: [PATCH 08/34] :wastebasket: removed now outdated tests / clean inputs --- .../indexes/datetimes/test_constructors.py | 22 ++-------- pandas/tests/tools/test_to_datetime.py | 43 +------------------ 2 files changed, 6 insertions(+), 59 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index c1039728f5b5e..a9491f90e80f0 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1042,27 +1042,13 @@ def test_datetimeindex_constructor_misc(self): arr = np.array(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"], dtype="O") idx4 = DatetimeIndex(arr) - # Can't be parsed consistently, need to parse each element individually - arr = [ - to_datetime(date_string) - for date_string in ["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"] - ] - idx5 = DatetimeIndex(arr) - - # Can't be parsed consistently, need to parse each element individually - arr = [ - to_datetime(date_string) - for date_string in ["1/1/2005", "1/2/2005", "Jan 3, 2005", "2005-01-04"] - ] - idx6 = DatetimeIndex(arr) - - idx7 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) - idx8 = DatetimeIndex( + idx5 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) + idx6 = DatetimeIndex( ["2007/05/12", "2008/01/25"], dayfirst=False, yearfirst=True ) - tm.assert_index_equal(idx7, idx8) + tm.assert_index_equal(idx5, idx6) - for other in [idx2, idx3, idx4, idx5, idx6]: + for other in [idx2, idx3, idx4]: assert (idx1.values == other.values).all() sdate = datetime(1999, 12, 25) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index a2871e79dc7d9..e3b9e30e1923c 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1225,40 +1225,6 @@ def test_iso_8601_strings_with_different_offsets_utc(self): ) tm.assert_index_equal(result, expected) - def test_iso8601_strings_mixed_offsets_with_naive(self): - # GH 24992 - # Can't parse consistently, need to parse each element in loop. - result = DatetimeIndex( - [ - to_datetime(string, utc=True) - for string in [ - "2018-11-28T00:00:00", - "2018-11-28T00:00:00+12:00", - "2018-11-28T00:00:00", - "2018-11-28T00:00:00+06:00", - "2018-11-28T00:00:00", - ] - ] - ) - expected = to_datetime( - [ - "2018-11-28T00:00:00", - "2018-11-27T12:00:00", - "2018-11-28T00:00:00", - "2018-11-27T18:00:00", - "2018-11-28T00:00:00", - ], - utc=True, - ) - tm.assert_index_equal(result, expected) - - def test_iso8601_strings_mixed_offsets_with_naive_reversed(self): - items = ["2018-11-28T00:00:00+12:00", "2018-11-28T00:00:00"] - # Can't parse consistently, need to parse each element in loop. - result = [to_datetime(item, utc=True) for item in items] - expected = [to_datetime(item, utc=True) for item in list(reversed(items))][::-1] - assert result == expected - def test_mixed_offsets_with_native_datetime_raises(self): # GH 25978 @@ -1910,9 +1876,7 @@ def test_to_datetime_overflow(self): def test_string_na_nat_conversion(self, cache): # GH #999, #858 - strings = np.array( - ["1/1/2000", "1/2/2000", np.nan, "1/4/2000, 12:34:56"], dtype=object - ) + strings = np.array(["1/1/2000", "1/2/2000", np.nan, "1/4/2000"], dtype=object) expected = np.empty(4, dtype="M8[ns]") for i, val in enumerate(strings): @@ -1924,10 +1888,7 @@ def test_string_na_nat_conversion(self, cache): result = tslib.array_to_datetime(strings)[0] tm.assert_almost_equal(result, expected) - # Can't parse in consistent format, so need to convert each individually. - result2 = DatetimeIndex( - [to_datetime(string, cache=cache) for string in strings] - ) + result2 = to_datetime(strings, cache=cache) assert isinstance(result2, DatetimeIndex) tm.assert_numpy_array_equal(result, result2.values) From 2ffcef67c67e1d53d9d36334eefb730b61416f84 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <> Date: Fri, 21 Oct 2022 19:34:15 +0200 Subject: [PATCH 09/34] :memo: clarify whatsnew and user-guide --- doc/source/user_guide/io.rst | 4 +++- doc/source/whatsnew/v2.0.0.rst | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 844cd70f4866c..2f35feaeffec7 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1011,7 +1011,9 @@ Inferring datetime format If you try to parse a column of date strings, pandas will attempt to guess the format from the first non-NaN element, and will then parse the rest of the column with that -format. +format. If pandas fails to guess the format, then a warning will be raised, and each +row will have its format guessed individually by ``dateutil.parser.parse``. The safest +way to parse dates is to explicitly set ``format=``. .. ipython:: python diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 95a0a93838216..71afc73598988 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -117,8 +117,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Datetimes are now parsed with a consistent format ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:func:`to_datetime` now parses dates with a consistent format, which is guessed from the first non-NA value -(unless ``format`` is specified). Previously, it would have guessed the format for each element individually. +In the past, :func:`to_datetime` guessed the format for each element independently. This was appropriate for some cases where a column had a mixed date format - however, it would regularly cause problems for columns where users expected a consistent format but the function would switch formats row-wise. As of version 2.0.0, this behavior is consistent column-wise, and the format is determined by the first non-NA value in the column (unless the user specifies a format, in which case that is used). *Old behavior*: From b3e32ac646b117f15a9f32c4d95f271926ec5f0e Mon Sep 17 00:00:00 2001 From: Marco Gorelli <> Date: Fri, 28 Oct 2022 10:54:43 +0200 Subject: [PATCH 10/34] :art: --- pandas/tests/tools/test_to_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index c1c6811b82317..1b052f60e0dc5 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -635,7 +635,7 @@ def test_to_datetime_now(self): # GH#18705 now = Timestamp("now") with tm.assert_produces_warning( - UserWarning, match="Could not infer format", + UserWarning, match="Could not infer format" ): pdnow = to_datetime("now") pdnow2 = to_datetime(["now"])[0] From d3adfe5a3aea0cad36b578f65d03c4f559909403 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sat, 29 Oct 2022 13:48:05 +0100 Subject: [PATCH 11/34] guess %Y-%m format --- pandas/_libs/tslibs/parsing.pyx | 5 +++-- pandas/tests/tslibs/test_parsing.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index a0d0fd7bc67b0..a335b0bdefdcc 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1011,10 +1011,11 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: break # Only consider it a valid guess if we have a year, month and day, - # unless it's %Y which is both common and unambiguous. + # unless it's %Y or %Y-%m which conform with ISO8601. Note that we don't + # make an exception for %Y%m because it's explicitly not considered ISO8601. if ( len({'year', 'month', 'day'} & found_attrs) != 3 - and format_guess != ['%Y'] + and format_guess not in (['%Y'], ['%Y', None, '%m']) ): return None diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 49d83a8fa5c56..972bb2d8126e5 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -148,6 +148,7 @@ def test_parsers_month_freq(date_str, expected): ("20111230", "%Y%m%d"), ("2011-12-30", "%Y-%m-%d"), ("2011", "%Y"), + ("2011-01", "%Y-%m"), ("30-12-2011", "%d-%m-%Y"), ("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"), ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"), @@ -215,6 +216,7 @@ def test_guess_datetime_format_with_locale_specific_formats(string, fmt): "this_is_not_a_datetime", "51a", "13/2019", + "202001", # YYYYMM isn't ISO8601 ], ) def test_guess_datetime_format_invalid_inputs(invalid_dt): From affa7f32aa6cf26b05df1cee7a54591015f708aa Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sat, 29 Oct 2022 13:37:16 +0100 Subject: [PATCH 12/34] Detect format from first non-na, but also exclude now and today --- pandas/_libs/tslib.pyx | 2 +- pandas/core/tools/datetimes.py | 3 --- pandas/tests/tools/test_to_datetime.py | 25 +++++++++---------------- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d7c0c91332e02..705b5440b74a0 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -429,7 +429,7 @@ def first_non_null(values: ndarray) -> int: val = values[i] if checknull_with_nat_and_na(val): continue - if isinstance(val, str) and (len(val) == 0 or val in nat_strings): + if isinstance(val, str) and (len(val) == 0 or val in ("now", "today", *nat_strings)): continue return i else: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 27ca210fb0ece..1b7f2c2236a06 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -485,9 +485,6 @@ def _array_strptime_with_fallback( else: result = arg except ValueError: - # if fmt was inferred, try falling back - # to array_to_datetime - terminate here - # for specified formats if errors == "raise": raise elif errors == "coerce": diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 1b052f60e0dc5..90a13f6bc46c3 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -634,11 +634,8 @@ def test_to_datetime_now(self): with tm.set_timezone("US/Eastern"): # GH#18705 now = Timestamp("now") - with tm.assert_produces_warning( - UserWarning, match="Could not infer format" - ): - pdnow = to_datetime("now") - pdnow2 = to_datetime(["now"])[0] + pdnow = to_datetime("now") + pdnow2 = to_datetime(["now"])[0] # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds @@ -659,11 +656,8 @@ def test_to_datetime_today(self, tz): # so this test will not detect the regression introduced in #18666. with tm.set_timezone(tz): nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) - with tm.assert_produces_warning( - UserWarning, match="Could not infer format" - ): - pdtoday = to_datetime("today") - pdtoday2 = to_datetime(["today"])[0] + pdtoday = to_datetime("today") + pdtoday2 = to_datetime(["today"])[0] tstoday = Timestamp("today") tstoday2 = Timestamp.today() @@ -680,8 +674,7 @@ def test_to_datetime_today(self, tz): @pytest.mark.parametrize("arg", ["now", "today"]) def test_to_datetime_today_now_unicode_bytes(self, arg): - with tm.assert_produces_warning(UserWarning, match="Could not infer format"): - to_datetime([arg]) + to_datetime([arg]) @pytest.mark.parametrize( "dt", [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] @@ -2210,8 +2203,8 @@ class TestDatetimeParsingWrappers: ("4Q-00", datetime(2000, 10, 1), UserWarning), ("00q4", datetime(2000, 10, 1), UserWarning), ("2005", datetime(2005, 1, 1), None), - ("2005-11", datetime(2005, 11, 1), UserWarning), - ("2005 11", datetime(2005, 11, 1), UserWarning), + ("2005-11", datetime(2005, 11, 1), None), + ("2005 11", datetime(2005, 11, 1), None), ("11-2005", datetime(2005, 11, 1), UserWarning), ("11 2005", datetime(2005, 11, 1), UserWarning), ("200511", datetime(2020, 5, 11), UserWarning), @@ -2227,9 +2220,9 @@ class TestDatetimeParsingWrappers: ("Sep 25 2003", datetime(2003, 9, 25), None), ("January 1 2014", datetime(2014, 1, 1), None), # GHE10537 - ("2014-06", datetime(2014, 6, 1), UserWarning), + ("2014-06", datetime(2014, 6, 1), None), ("06-2014", datetime(2014, 6, 1), UserWarning), - ("2014-6", datetime(2014, 6, 1), UserWarning), + ("2014-6", datetime(2014, 6, 1), None), ("6-2014", datetime(2014, 6, 1), UserWarning), ("20010101 12", datetime(2001, 1, 1, 12), None), ("20010101 1234", datetime(2001, 1, 1, 12, 34), UserWarning), From 575b215ca339536fdda63bc6d41cafadf6bf926b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sat, 29 Oct 2022 14:41:27 +0100 Subject: [PATCH 13/34] :white_check_mark: fixup tests based on now and today parsing --- pandas/tests/groupby/transform/test_transform.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index b0858cbcf67d5..2b4eba539ec82 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1052,8 +1052,7 @@ def demean_rename(x): @pytest.mark.parametrize("func", [min, max, np.min, np.max, "first", "last"]) def test_groupby_transform_timezone_column(func): # GH 24198 - with tm.assert_produces_warning(UserWarning, match="Could not infer format"): - ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") + ts = pd.to_datetime("now", utc=True).tz_convert("Asia/Singapore") result = DataFrame({"end_time": [ts], "id": [1]}) result["max_end_time"] = result.groupby("id").end_time.transform(func) expected = DataFrame([[ts, 1, ts]], columns=["end_time", "id", "max_end_time"]) From 1d255e07a3278576c3d35eec961a9d8db97d626f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 17 Nov 2022 13:30:18 +0000 Subject: [PATCH 14/34] fixup after merge --- pandas/core/tools/datetimes.py | 9 ++------- pandas/tests/tools/test_to_datetime.py | 2 +- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 070ddbb87a553..02988754450a6 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -429,13 +429,8 @@ def _convert_listlike_datetimes( if format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - # There is a special fast-path for iso8601 formatted - # datetime strings, so in those cases don't use the inferred - # format because this path makes process slower in this - # special case - if format is not None and format_is_iso(format): - require_iso8601 = True - format = None + # There is a special fast-path for iso8601 formatted datetime strings + require_iso8601 = format is not None and format_is_iso(format) if format is not None and not require_iso8601: return _to_datetime_with_format( diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 5b21ee316e5da..99899dacc68df 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2333,7 +2333,7 @@ class TestDatetimeParsingWrappers: ("00q4", datetime(2000, 10, 1), UserWarning), ("2005", datetime(2005, 1, 1), None), ("2005-11", datetime(2005, 11, 1), None), - ("2005 11", datetime(2005, 11, 1), None), + ("2005 11", datetime(2005, 11, 1), UserWarning), ("11-2005", datetime(2005, 11, 1), UserWarning), ("11 2005", datetime(2005, 11, 1), UserWarning), ("200511", datetime(2020, 5, 11), UserWarning), From 285b1ff8dfa93b60be32280183f71811797729de Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 17 Nov 2022 17:58:12 +0000 Subject: [PATCH 15/34] fixup after merge --- pandas/tests/io/excel/test_readers.py | 13 +++---------- pandas/tests/tools/test_to_datetime.py | 24 ------------------------ 2 files changed, 3 insertions(+), 34 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 8e92fa10049e0..bff4c98fe2842 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -943,18 +943,11 @@ def test_reader_seconds(self, request, engine, read_ext): ] } ) - if engine == "odf": - # odf recognises cell type as time (from its attribute) - # so tries to parse it. - warning = UserWarning - else: - warning = None - with tm.assert_produces_warning(warning, match="Could not infer format"): - actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1") + + actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) - with tm.assert_produces_warning(warning, match="Could not infer format"): - actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") + actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1") tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex(self, request, read_ext): diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 0c713d3f4267b..a6737b3b8fb3e 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2898,30 +2898,6 @@ def test_to_datetime_cache_coerce_50_lines_outofbounds(series_length): to_datetime(s, errors="raise", utc=True) -@pytest.mark.parametrize( - "arg", - [ - ["1724-12-20 20:20:20+00:00", "2022-01-01 00:00:00"], - [ - Timestamp("1724-12-20 20:20:20+00:00"), - Timestamp("2022-01-01 00:00:00"), - ], - [datetime(1724, 12, 20, 20, 20, 20, tzinfo=timezone.utc), datetime(2022, 1, 1)], - ], - ids=["string", "pd.Timestamp", "datetime.datetime"], -) -@pytest.mark.parametrize("tz_aware_first", [True, False]) -def test_to_datetime_mixed_tzaware_timestamp_utc_true(arg, tz_aware_first): - # GH 48678 - exp_arg = ["1724-12-20 20:20:20", "2022-01-01 00:00:00"] - if not tz_aware_first: - arg.reverse() - exp_arg.reverse() - result = to_datetime(arg, utc=True) - expected = DatetimeIndex(exp_arg).tz_localize("UTC") - tm.assert_index_equal(result, expected) - - def test_to_datetime_format_f_parse_nanos(): # GH 48767 timestamp = "15/02/2020 02:03:04.123456789" From 963b62bf510b29f4a9f0aff2e011cf3a6b15d943 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 17 Nov 2022 19:30:51 +0000 Subject: [PATCH 16/34] fixup test --- pandas/tests/io/parser/dtypes/test_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index 3b8c520004f12..a0deebecdfff8 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -262,7 +262,7 @@ def test_categorical_coerces_timestamp(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype([Timestamp("2014")])} - data = "b\n2014-01-01\n2014-01-01T00:00:00" + data = "b\n2014-01-01\n2014-01-01" expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) result = parser.read_csv(StringIO(data), dtype=dtype) From c90a8a525f35fd6a4ebee81d319c2fc662a01a68 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 17 Nov 2022 19:34:03 +0000 Subject: [PATCH 17/34] remove outdated doctest --- pandas/core/tools/datetimes.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 02988754450a6..0f4fd77e87e1e 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -964,7 +964,7 @@ def to_datetime( - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`: - >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15']) + >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15']) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None) @@ -1011,19 +1011,6 @@ def to_datetime( ... utc=True) DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) - - - Inputs can contain both naive and aware, string or datetime, the above - rules still apply - - >>> from datetime import timezone, timedelta - >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 12:00 -0530', - ... datetime(2020, 1, 1, 18), - ... datetime(2020, 1, 1, 18, - ... tzinfo=timezone(-timedelta(hours=1)))], - ... utc=True) - DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 17:30:00+00:00', - '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) """ if infer_datetime_format is not lib.no_default: warnings.warn( From cdfa355b1f02adf6c51f78723bb2ce4c3ecbaa6c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sat, 19 Nov 2022 11:25:15 +0000 Subject: [PATCH 18/34] xfail test based on issue 49767 --- pandas/tests/extension/test_arrow.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d094a7731c417..0dfc9cd14562a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -664,6 +664,14 @@ def test_EA_types(self, engine, data, request): reason=f"Parameterized types with tz={pa_dtype.tz} not supported.", ) ) + elif pa.types.is_timestamp(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=ValueError, + strict=False, + reason="https://github.com/pandas-dev/pandas/issues/49767", + ) + ) elif pa.types.is_binary(pa_dtype): request.node.add_marker( pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") From 5755032ea606db1d492cccb3dd25844d3e3ee4df Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 2 Dec 2022 14:47:18 +0000 Subject: [PATCH 19/34] wip --- pandas/tests/tools/test_to_datetime.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f3f487e488848..bd3506a5fece5 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -352,6 +352,7 @@ def test_to_datetime_with_non_exact(self, cache): ], ) def test_parse_nanoseconds_with_formula(self, cache, arg): + # GH8989 # truncating the nanoseconds when a format was provided expected = to_datetime(arg, cache=cache) @@ -468,12 +469,11 @@ def test_to_datetime_parse_timezone_keeps_name(self): class TestToDatetime: def test_to_datetime_mixed_datetime_and_string(self): # GH#47018 adapted old doctest with new behavior - d2 = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) - with pytest.raises( - ValueError, - match=r"time data '.*' does not match format '%Y-%m-%d %H:%M %z' \(match\)", - ): - to_datetime(["2020-01-01 17:00 -0100", d2]) + d1 = datetime(2020, 1, 1, 17, tzinfo=pytz.FixedOffset(-60)) + d2 = datetime(2020, 1, 1, 18, tzinfo=pytz.FixedOffset(-60)) + res = to_datetime(["2020-01-01 17:00 -0100", d2]) + expected = to_datetime([d1, d2]) + tm.assert_index_equal(res, expected) @pytest.mark.parametrize( "fmt", @@ -1145,7 +1145,8 @@ def test_to_datetime_cache_scalar(self): (None,) + (NaT,) * start_caching_at + ("2012 July 26", Timestamp("2012-07-26")), - (NaT,) * (start_caching_at + 1) + (Timestamp("2012-07-26"), NaT), + (NaT,) * (start_caching_at + 1) + + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), ), ), ) From 0a86705c3572d6f72a4532b27b37a7f038c41e8f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sat, 3 Dec 2022 12:50:08 +0000 Subject: [PATCH 20/34] add back examples of formats which can be guessed --- doc/source/user_guide/io.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c38b7d8418d60..330e42e27ea7d 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -970,6 +970,19 @@ To parse the mixed-timezone values as a datetime column, pass a partially-applie Inferring datetime format +++++++++++++++++++++++++ +Here are some examples of datetime strings that can be guessed (all +representing December 30th, 2011 at 00:00:00): +* "20111230" +* "2011/12/30" +* "20111230 00:00:00" +* "12/30/2011 00:00:00" +* "30/Dec/2011 00:00:00" +* "30/December/2011 00:00:00" + +Note that format inference is sensitive to ``dayfirst``. With +``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With +``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th. + If you try to parse a column of date strings, pandas will attempt to guess the format from the first non-NaN element, and will then parse the rest of the column with that format. If pandas fails to guess the format, then a warning will be raised, and each From 86e9bcfe27df1686a4106385da73daf3f6536689 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 6 Dec 2022 09:39:19 +0000 Subject: [PATCH 21/34] start fixing up --- doc/source/user_guide/io.rst | 1 + doc/source/user_guide/timeseries.rst | 11 +++++++++++ pandas/core/tools/datetimes.py | 9 ++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 327fd8cd35956..81e7a7e3e9abf 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -970,6 +970,7 @@ Inferring datetime format Here are some examples of datetime strings that can be guessed (all representing December 30th, 2011 at 00:00:00): + * "20111230" * "2011/12/30" * "20111230 00:00:00" diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 100ed1a889131..197a3e19b94f5 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -13,6 +13,17 @@ a tremendous amount of new functionality for manipulating time series data. For example, pandas supports: +Parsing time series information from various sources and formats + +.. ipython:: python + + import datetime + + dti = pd.to_datetime( + ["1/1/2018", np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1)] + ) + dti + Generate sequences of fixed-frequency dates and time spans .. ipython:: python diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 44118725db84b..d82cb1ff571fd 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -993,7 +993,7 @@ def to_datetime( are constant: >>> from datetime import datetime - >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)]) + >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)]) DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) @@ -1014,6 +1014,13 @@ def to_datetime( ... utc=True) DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) + + - Inputs can contain both string or datetime, the above + rules still apply + >>> from datetime import timezone, timedelta + >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True) + DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], + dtype='datetime64[ns, UTC]', freq=None) """ if infer_datetime_format is not lib.no_default: warnings.warn( From f92a8cb8259f04a57ae178f71d74130e2b8b3b7c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 6 Dec 2022 10:32:52 +0000 Subject: [PATCH 22/34] fixups from reviews --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/tools/datetimes.py | 2 +- pandas/tests/apply/test_frame_apply.py | 3 +-- pandas/tests/extension/test_arrow.py | 3 +-- pandas/tests/tools/test_to_datetime.py | 21 ++++++++------------- 5 files changed, 12 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e90d7b1e61d7b..79078aebceeb4 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -409,7 +409,7 @@ Other API changes Deprecations ~~~~~~~~~~~~ -- +- Deprecated argument ``infer_datetime_format`` in :func:`to_datetime` and :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index d82cb1ff571fd..3064dcd7ac7ad 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -993,7 +993,7 @@ def to_datetime( are constant: >>> from datetime import datetime - >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)]) + >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)]) DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], dtype='datetime64[ns, pytz.FixedOffset(-60)]', freq=None) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index aa65fa5b29034..e7c2618d388c2 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -836,8 +836,7 @@ def test_with_dictlike_columns_with_datetime(): df["author"] = ["X", "Y", "Z"] df["publisher"] = ["BBC", "NBC", "N24"] df["date"] = pd.to_datetime( - ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"], - dayfirst=True, + ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"] ) result = df.apply(lambda x: {}, axis=1) expected = Series([{}, {}, {}]) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3b2ef95a6c320..3d34b304a2588 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -692,11 +692,10 @@ def test_EA_types(self, engine, data, request): reason=f"Parameterized types with tz={pa_dtype.tz} not supported.", ) ) - elif pa.types.is_timestamp(pa_dtype): + elif pa.types.is_timestamp(pa_dtype) and pa_dtype.unit in ("us", "ns"): request.node.add_marker( pytest.mark.xfail( raises=ValueError, - strict=False, reason="https://github.com/pandas-dev/pandas/issues/49767", ) ) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 18eb20de03f03..2ed415b4613ad 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -469,10 +469,10 @@ def test_to_datetime_parse_timezone_keeps_name(self): class TestToDatetime: def test_to_datetime_mixed_datetime_and_string(self): # GH#47018 adapted old doctest with new behavior - d1 = datetime(2020, 1, 1, 17, tzinfo=pytz.FixedOffset(-60)) - d2 = datetime(2020, 1, 1, 18, tzinfo=pytz.FixedOffset(-60)) - res = to_datetime(["2020-01-01 17:00 -0100", d2]) - expected = to_datetime([d1, d2]) + d1 = datetime(2020, 1, 1, 17, tzinfo=timezone(-timedelta(hours=1))) + d2 = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) + res = to_datetime(["2020-01-01 17:00:00-01:00", d2]) + expected = to_datetime([d1, d2]).tz_convert(pytz.FixedOffset(-60)) tm.assert_index_equal(res, expected) @pytest.mark.parametrize( @@ -1335,10 +1335,7 @@ def test_mixed_offsets_with_native_datetime_raises(self): tm.assert_series_equal(mixed, expected) with pytest.raises(ValueError, match="Tz-aware datetime.datetime"): - with tm.assert_produces_warning( - UserWarning, match="Could not infer format" - ): - to_datetime(mixed) + to_datetime(mixed) def test_non_iso_strings_with_tz_offset(self): result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) @@ -2304,13 +2301,11 @@ def test_to_datetime_infer_datetime_format_consistent_format( s_as_dt_strings = ser.apply(lambda x: x.strftime(test_format)) with_format = to_datetime(s_as_dt_strings, format=test_format, cache=cache) - no_infer = to_datetime(s_as_dt_strings, cache=cache) - yes_infer = to_datetime(s_as_dt_strings, cache=cache) + without_format = to_datetime(s_as_dt_strings, cache=cache) - # Whether the format is explicitly passed, it is inferred, or + # Whether the format is explicitly passed, or # it is not inferred, the results should all be the same - tm.assert_series_equal(with_format, no_infer) - tm.assert_series_equal(no_infer, yes_infer) + tm.assert_series_equal(with_format, without_format) @pytest.mark.parametrize( "tz_name, offset, warning", From fd215df755004b3251c8d200b1a85aee526d11fa Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 6 Dec 2022 10:40:21 +0000 Subject: [PATCH 23/34] lint --- pandas/_libs/tslibs/parsing.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index c0efb48562780..c525ed6ba496e 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1075,8 +1075,8 @@ cdef str _fill_token(token: str, padding: int): cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst): """Warn if guessed datetime format doesn't respect dayfirst argument.""" cdef: - int day_index = format.find('%d') - int month_index = format.find('%m') + int day_index = format.find("%d") + int month_index = format.find("%m") if (day_index != -1) and (month_index != -1): if (day_index > month_index) and dayfirst: From 0a5c466381ce5f9748a8ed1401f097c49c260d7f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 6 Dec 2022 10:49:12 +0000 Subject: [PATCH 24/34] put tests back --- pandas/tests/tools/test_to_datetime.py | 48 ++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 2ed415b4613ad..33e45932544d6 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2307,6 +2307,54 @@ def test_to_datetime_infer_datetime_format_consistent_format( # it is not inferred, the results should all be the same tm.assert_series_equal(with_format, without_format) + def test_to_datetime_inconsistent_format(self, cache): + data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] + ser = Series(np.array(data)) + with pytest.raises(ValueError, match="does not match format"): + to_datetime(ser, cache=cache) + + def test_to_datetime_consistent_format(self, cache): + data = ["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"] + ser = Series(np.array(data)) + result = to_datetime(ser, cache=cache) + expected = Series( + ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[ns]" + ) + tm.assert_series_equal(result, expected) + + def test_to_datetime_series_with_nans(self, cache): + ser = Series( + np.array( + ["01/01/2011 00:00:00", np.nan, "01/03/2011 00:00:00", np.nan], + dtype=object, + ) + ) + result = to_datetime(ser, cache=cache) + expected = Series( + ["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[ns]" + ) + tm.assert_series_equal(result, expected) + + def test_to_datetime_series_start_with_nans(self, cache): + ser = Series( + np.array( + [ + np.nan, + np.nan, + "01/01/2011 00:00:00", + "01/02/2011 00:00:00", + "01/03/2011 00:00:00", + ], + dtype=object, + ) + ) + + result = to_datetime(ser, cache=cache) + expected = Series( + [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[ns]" + ) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( "tz_name, offset, warning", [("UTC", 0, None), ("UTC-3", 180, UserWarning), ("UTC+3", -180, UserWarning)], From 772dd6c457e8a656abc928dac447c39cb3b15abd Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 6 Dec 2022 11:28:34 +0000 Subject: [PATCH 25/34] shorten diff --- doc/source/user_guide/timeseries.rst | 2 +- pandas/core/tools/datetimes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 197a3e19b94f5..74536eb975e70 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -766,7 +766,7 @@ partially matching dates: rng2 = pd.date_range("2011-01-01", "2012-01-01", freq="W") ts2 = pd.Series(np.random.randn(len(rng2)), index=rng2) - ts2.truncate(before="2011-11-01", after="2011-12-01") + ts2.truncate(before="2011-11", after="2011-12") ts2["2011-11":"2011-12"] Even complicated fancy indexing that breaks the ``DatetimeIndex`` frequency diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 3064dcd7ac7ad..c85b2f4987834 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -1017,7 +1017,7 @@ def to_datetime( - Inputs can contain both string or datetime, the above rules still apply - >>> from datetime import timezone, timedelta + >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) From b49b7cf186b889030fce5772c9d9bb06a66060ef Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 6 Dec 2022 13:39:39 +0000 Subject: [PATCH 26/34] add example of string which cannot be guessed --- doc/source/user_guide/io.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 81e7a7e3e9abf..0d27dda3bb8ff 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -984,8 +984,9 @@ Note that format inference is sensitive to ``dayfirst``. With If you try to parse a column of date strings, pandas will attempt to guess the format from the first non-NaN element, and will then parse the rest of the column with that -format. If pandas fails to guess the format, then a warning will be raised, and each -row will have its format guessed individually by ``dateutil.parser.parse``. The safest +format. If pandas fails to guess the format (for example if your first string is +``'01 December US/Pacific 2000'``), then a warning will be raised and each +row will be parsed individually by ``dateutil.parser.parse``. The safest way to parse dates is to explicitly set ``format=``. .. ipython:: python From d17d8195fa6a3a5edbe546ed0445b9d3a9089c60 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 6 Dec 2022 16:30:48 +0000 Subject: [PATCH 27/34] add deprecated directive, construct expected explicitly, explicit UserWarning, reword row-wise and column-wise --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/tools/datetimes.py | 8 ++++++-- pandas/io/parsers/readers.py | 9 +++++++++ pandas/tests/tools/test_to_datetime.py | 12 ++++++++---- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index a09828fe24275..30efed2ecdb64 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -345,7 +345,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Datetimes are now parsed with a consistent format ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In the past, :func:`to_datetime` guessed the format for each element independently. This was appropriate for some cases where a column had a mixed date format - however, it would regularly cause problems for columns where users expected a consistent format but the function would switch formats row-wise. As of version 2.0.0, this behavior is consistent column-wise, and the format is determined by the first non-NA value in the column (unless the user specifies a format, in which case that is used). +In the past, :func:`to_datetime` guessed the format for each element independently. This was appropriate for some cases where elements had mixed date formats - however, it would regularly cause problems when users expected a consistent format but the function would switch formats between elements. As of version 2.0.0, parsing will use a consistent format, determined by the first non-NA value (unless the user specifies a format, in which case that is used). *Old behavior*: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index c85b2f4987834..27f58aab93f87 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -139,6 +139,7 @@ def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str warnings.warn( "Could not infer format - " "to ensure consistent parsing, specify a format.", + UserWarning, stacklevel=find_stack_level(), ) return None @@ -371,8 +372,6 @@ def _convert_listlike_datetimes( None or string of the frequency of the passed data errors : str error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' - infer_datetime_format : bool, default False - inferring format behavior from to_datetime dayfirst : bool dayfirst parsing behavior from to_datetime yearfirst : bool @@ -804,6 +803,11 @@ def to_datetime( of the datetime strings based on the first non-NaN element, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. + + .. deprecated:: 2.0.0 + A strict version of this argument is now the default, passing it has + no effect. + origin : scalar, default 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 67c9391704c0d..c73cdcf1f847c 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -242,6 +242,15 @@ :ref:`io.csv.mixed_timezones` for more. Note: A fast-path exists for iso8601-formatted dates. +infer_datetime_format : bool, default False + If True and `parse_dates` is enabled, pandas will attempt to infer the + format of the datetime strings in the columns, and if it can be inferred, + switch to a faster method of parsing them. In some cases this can increase + the parsing speed by 5-10x. + + .. deprecated:: 2.0.0 + A strict version of this argument is now the default, passing it has no effect. + keep_date_col : bool, default False If True and `parse_dates` specifies combining multiple columns then keep the original columns. diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 33e45932544d6..a0d66942de533 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -469,10 +469,14 @@ def test_to_datetime_parse_timezone_keeps_name(self): class TestToDatetime: def test_to_datetime_mixed_datetime_and_string(self): # GH#47018 adapted old doctest with new behavior - d1 = datetime(2020, 1, 1, 17, tzinfo=timezone(-timedelta(hours=1))) - d2 = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) - res = to_datetime(["2020-01-01 17:00:00-01:00", d2]) - expected = to_datetime([d1, d2]).tz_convert(pytz.FixedOffset(-60)) + py_dt = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1))) + res = to_datetime(["2020-01-01 17:00 -0100", py_dt]) + expected = Index( + [ + Timestamp("2020-01-01 17:00:00-0100", tz=pytz.FixedOffset(-60)), + Timestamp("2020-01-01 18:00:00-0100", tz="UTC-01:00"), + ], + ) tm.assert_index_equal(res, expected) @pytest.mark.parametrize( From f4520e9816ebda411a8ec813f450356745fefc4c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 6 Dec 2022 17:26:36 +0000 Subject: [PATCH 28/34] remove redundant example --- doc/source/user_guide/timeseries.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 74536eb975e70..1b7acc12f0dcb 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -208,7 +208,6 @@ you can pass the ``dayfirst`` flag: .. ipython:: python :okwarning: - pd.to_datetime(["04-01-2012 10:00"], dayfirst=True) pd.to_datetime(["04-14-2012 10:00"], dayfirst=True) .. warning:: From fcb515f8daa61c2c501a3de6cd475dd82b5e6d22 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 6 Dec 2022 17:34:33 +0000 Subject: [PATCH 29/34] restore newline --- doc/source/user_guide/timeseries.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 1b7acc12f0dcb..6f9fec0ff81b9 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -208,6 +208,8 @@ you can pass the ``dayfirst`` flag: .. ipython:: python :okwarning: + pd.to_datetime(["04-01-2012 10:00"], dayfirst=True) + pd.to_datetime(["04-14-2012 10:00"], dayfirst=True) .. warning:: From 22156529156f542d8bc8c6fc7545976d3ff24c0a Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 9 Dec 2022 08:45:05 +0000 Subject: [PATCH 30/34] double backticks around False, explicitly raise UserWarning --- doc/source/user_guide/timeseries.rst | 2 +- pandas/_libs/tslibs/parsing.pyx | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 6f9fec0ff81b9..7e1368061322b 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -216,7 +216,7 @@ you can pass the ``dayfirst`` flag: You see in the above example that ``dayfirst`` isn't strict. If a date can't be parsed with the day being first it will be parsed as if - ``dayfirst`` were False and a warning will also be raised. + ``dayfirst`` were ``False`` and a warning will also be raised. If you pass a single string to ``to_datetime``, it returns a single ``Timestamp``. ``Timestamp`` can also accept string input, but it doesn't accept string parsing diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index cefab6256ce5d..1d5916a98922a 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1083,12 +1083,14 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst): warnings.warn( f"Parsing dates in {format} format when dayfirst=True was specified. " "Pass `dayfirst=False` or specify a format to silence this warning.", + UserWarning, stacklevel=find_stack_level(), ) if (day_index < month_index) and not dayfirst: warnings.warn( f"Parsing dates in {format} format when dayfirst=False was specified. " "Pass `dayfirst=True` or specify a format to silence this warning.", + UserWarning, stacklevel=find_stack_level(), ) From 7d11f593b9ea863b5ba4750fb677299ab2aea8ca Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 12 Dec 2022 17:47:18 +0000 Subject: [PATCH 31/34] reword warning --- pandas/core/tools/datetimes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 9138bfed5679a..2b63836119895 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -137,8 +137,9 @@ def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str if guessed_format is not None: return guessed_format warnings.warn( - "Could not infer format - " - "to ensure consistent parsing, specify a format.", + "Could not infer format, so each element will be parsed " + "individually by `dateutil`. To ensure parsing is " + "consistent and as-expected, please specify a format.", UserWarning, stacklevel=find_stack_level(), ) From f0ac4585f623a41299a15ca74c7930563ee08286 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Mon, 12 Dec 2022 18:34:49 +0000 Subject: [PATCH 32/34] test both dayfirst True and False --- pandas/tests/tslibs/test_parsing.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 4b6f899f1fbae..a4c79e77d2eed 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -238,19 +238,30 @@ def test_guess_datetime_format_wrong_type_inputs(invalid_type_dt): @pytest.mark.parametrize( - "string,fmt,dayfirst", + "string,fmt,dayfirst,warning", [ - ("2011-1-1", "%Y-%m-%d", False), - ("1/1/2011", "%m/%d/%Y", False), - ("30-1-2011", "%d-%m-%Y", True), - ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S", False), - ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S", False), - ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S", False), + ("2011-1-1", "%Y-%m-%d", False, None), + ("2011-1-1", "%Y-%d-%m", True, None), + ("1/1/2011", "%m/%d/%Y", False, None), + ("1/1/2011", "%d/%m/%Y", True, None), + ("30-1-2011", "%d-%m-%Y", False, UserWarning), + ("30-1-2011", "%d-%m-%Y", True, None), + ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S", False, None), + ("2011-1-1 0:0:0", "%Y-%d-%m %H:%M:%S", True, None), + ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S", False, None), + ("2011-1-3T00:00:0", "%Y-%d-%mT%H:%M:%S", True, None), + ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S", False, None), + ("2011-1-1 00:00:00", "%Y-%d-%m %H:%M:%S", True, None), ], ) -def test_guess_datetime_format_no_padding(string, fmt, dayfirst): +def test_guess_datetime_format_no_padding(string, fmt, dayfirst, warning): # see gh-11142 - result = parsing.guess_datetime_format(string, dayfirst=dayfirst) + msg = ( + f"Parsing dates in {fmt} format when dayfirst=False was specified. " + "Pass `dayfirst=True` or specify a format to silence this warning." + ) + with tm.assert_produces_warning(warning, match=msg): + result = parsing.guess_datetime_format(string, dayfirst=dayfirst) assert result == fmt From 4a5dd1cdb927fb2279e7e5033a3bb33a6d8fa7ad Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 13 Dec 2022 08:24:08 +0000 Subject: [PATCH 33/34] postmerge fixup --- pandas/io/parsers/readers.py | 2 -- pandas/tests/tools/test_to_datetime.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c73cdcf1f847c..96c2fd08bbc59 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1749,8 +1749,6 @@ def TextParser(*args, **kwds) -> TextFileReader: transformed content. encoding : str, optional Encoding to use for UTF when reading/writing (ex. 'utf-8') - squeeze : bool, default False - returns Series if only one column. float_precision : str, optional Specifies which converter the C engine should use for floating-point values. The options are `None` or `high` for the ordinary converter, diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index f4cf069046b1d..1ad9324e93406 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2387,7 +2387,8 @@ def test_to_datetime_series_start_with_nans(self, cache): def test_infer_datetime_format_tz_name(self, tz_name, offset, warning): # GH 33133 ser = Series([f"2019-02-02 08:07:13 {tz_name}"]) - result = to_datetime(ser) + with tm.assert_produces_warning(warning, match="Could not infer format"): + result = to_datetime(ser) tz = timezone(timedelta(minutes=offset)) expected = Series([Timestamp("2019-02-02 08:07:13").tz_localize(tz)]) tm.assert_series_equal(result, expected) From 917b31b85c2be25b16269a53aa2815ee949ab04c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 13 Dec 2022 09:05:14 +0000 Subject: [PATCH 34/34] unimportant typo to restart CI --- pandas/tests/tools/test_to_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 1ad9324e93406..48844beed30f4 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2329,7 +2329,7 @@ def test_to_datetime_infer_datetime_format_consistent_format( without_format = to_datetime(s_as_dt_strings, cache=cache) # Whether the format is explicitly passed, or - # it is not inferred, the results should all be the same + # it is inferred, the results should all be the same tm.assert_series_equal(with_format, without_format) def test_to_datetime_inconsistent_format(self, cache):