diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 3c7d84bb866f1..f30c66d75b525 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -2312,6 +2312,7 @@ useful if you are reading in data which is mostly of the desired dtype (e.g. num non-conforming elements intermixed that you want to represent as missing: .. ipython:: python + :okwarning: import datetime @@ -2328,6 +2329,7 @@ The ``errors`` parameter has a third option of ``errors='ignore'``, which will s encounters any errors with the conversion to a desired data type: .. ipython:: python + :okwarning: import datetime diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d6a00022efb79..0d27dda3bb8ff 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -968,17 +968,7 @@ To parse the mixed-timezone values as a datetime column, pass a partially-applie Inferring datetime format +++++++++++++++++++++++++ -If you have ``parse_dates`` enabled for some or all of your columns, and your -datetime strings are all formatted the same way, you may get a large speed -up by setting ``infer_datetime_format=True``. If set, pandas will attempt -to guess the format of your datetime strings, and then use a faster means -of parsing the strings. 5-10x parsing speeds have been observed. pandas -will fallback to the usual parsing if either the format cannot be guessed -or the format that was guessed cannot properly parse the entire column -of strings. So in general, ``infer_datetime_format`` should not have any -negative consequences if enabled. 
- -Here are some examples of datetime strings that can be guessed (All +Here are some examples of datetime strings that can be guessed (all representing December 30th, 2011 at 00:00:00): * "20111230" @@ -988,21 +978,36 @@ representing December 30th, 2011 at 00:00:00): * "30/Dec/2011 00:00:00" * "30/December/2011 00:00:00" -Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With +Note that format inference is sensitive to ``dayfirst``. With ``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With ``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th. +If you try to parse a column of date strings, pandas will attempt to guess the format +from the first non-NaN element, and will then parse the rest of the column with that +format. If pandas fails to guess the format (for example if your first string is +``'01 December US/Pacific 2000'``), then a warning will be raised and each +row will be parsed individually by ``dateutil.parser.parse``. The safest +way to parse dates is to explicitly set ``format=``. + .. ipython:: python - # Try to infer the format for the index column df = pd.read_csv( "foo.csv", index_col=0, parse_dates=True, - infer_datetime_format=True, ) df +In the case that you have mixed datetime formats within the same column, you'll need to +first read it in as an object dtype and then apply :func:`to_datetime` to each element. + +.. ipython:: python + + data = io.StringIO("date\n12 Jan 2000\n2000-01-13\n") + df = pd.read_csv(data) + df['date'] = df['date'].apply(pd.to_datetime) + df + .. ipython:: python :suppress: diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 78134872fa4b8..7e1368061322b 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -132,6 +132,8 @@ time. .. 
ipython:: python + import datetime + pd.Timestamp(datetime.datetime(2012, 5, 1)) pd.Timestamp("2012-05-01") pd.Timestamp(2012, 5, 1) @@ -196,26 +198,25 @@ is converted to a ``DatetimeIndex``: .. ipython:: python - pd.to_datetime(pd.Series(["Jul 31, 2009", "2010-01-10", None])) + pd.to_datetime(pd.Series(["Jul 31, 2009", "Jan 10, 2010", None])) - pd.to_datetime(["2005/11/23", "2010.12.31"]) + pd.to_datetime(["2005/11/23", "2010/12/31"]) If you use dates which start with the day first (i.e. European style), you can pass the ``dayfirst`` flag: .. ipython:: python - :okwarning: + :okwarning: pd.to_datetime(["04-01-2012 10:00"], dayfirst=True) - pd.to_datetime(["14-01-2012", "01-14-2012"], dayfirst=True) + pd.to_datetime(["04-14-2012 10:00"], dayfirst=True) .. warning:: You see in the above example that ``dayfirst`` isn't strict. If a date can't be parsed with the day being first it will be parsed as if - ``dayfirst`` were False, and in the case of parsing delimited date strings - (e.g. ``31-12-2012``) then a warning will also be raised. + ``dayfirst`` were ``False`` and a warning will also be raised. If you pass a single string to ``to_datetime``, it returns a single ``Timestamp``. ``Timestamp`` can also accept string input, but it doesn't accept string parsing diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3d0d6bc5d27f2..4d3b2548f5fc5 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -411,6 +411,38 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. +Datetimes are now parsed with a consistent format +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In the past, :func:`to_datetime` guessed the format for each element independently. 
This was appropriate for some cases where elements had mixed date formats - however, it would regularly cause problems when users expected a consistent format but the function would switch formats between elements. As of version 2.0.0, parsing will use a consistent format, determined by the first non-NA value (unless the user specifies a format, in which case that is used). + +*Old behavior*: + + .. code-block:: ipython + + In [1]: ser = pd.Series(['13-01-2000', '12-01-2000']) + In [2]: pd.to_datetime(ser) + Out[2]: + 0 2000-01-13 + 1 2000-12-01 + dtype: datetime64[ns] + +*New behavior*: + + .. ipython:: python + :okwarning: + + ser = pd.Series(['13-01-2000', '12-01-2000']) + pd.to_datetime(ser) + +Note that this affects :func:`read_csv` as well. + +If you still need to parse dates with inconsistent formats, you'll need to apply :func:`to_datetime` +to each element individually, e.g. :: + + ser = pd.Series(['13-01-2000', '12 January 2000']) + ser.apply(pd.to_datetime) + .. _whatsnew_200.api_breaking.other: Other API changes @@ -453,7 +485,7 @@ Other API changes Deprecations ~~~~~~~~~~~~ -- +- Deprecated argument ``infer_datetime_format`` in :func:`to_datetime` and :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 7ae6dad58a1d9..83f03f94d2fad 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1032,6 +1032,7 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: # rebuild string, capturing any inferred padding dt_str = "".join(tokens) if parsed_datetime.strftime(guessed_format) == dt_str: + _maybe_warn_about_dayfirst(guessed_format, dayfirst) return guessed_format else: return None @@ -1051,6 +1052,28 @@ cdef str _fill_token(token: str, padding: int): token_filled = f"{seconds}.{nanoseconds}" return token_filled +cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst): + """Warn if guessed datetime format doesn't respect dayfirst argument.""" + cdef: + int day_index = format.find("%d") + int month_index = format.find("%m") + + if (day_index != -1) and (month_index != -1): + if (day_index > month_index) and dayfirst: + warnings.warn( + f"Parsing dates in {format} format when dayfirst=True was specified. " + "Pass `dayfirst=False` or specify a format to silence this warning.", + UserWarning, + stacklevel=find_stack_level(), + ) + if (day_index < month_index) and not dayfirst: + warnings.warn( + f"Parsing dates in {format} format when dayfirst=False was specified. 
" + "Pass `dayfirst=True` or specify a format to silence this warning.", + UserWarning, + stacklevel=find_stack_level(), + ) + @cython.wraparound(False) @cython.boundscheck(False) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 263881f38b1fa..595d13b95fe12 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -15,10 +15,14 @@ cast, overload, ) +import warnings import numpy as np -from pandas._libs import tslib +from pandas._libs import ( + lib, + tslib, +) from pandas._libs.tslibs import ( OutOfBoundsDatetime, Timedelta, @@ -40,6 +44,7 @@ DateTimeErrorChoices, npt, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( ensure_object, @@ -126,7 +131,18 @@ def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str if (first_non_null := tslib.first_non_null(arr)) != -1: if type(first_non_nan_element := arr[first_non_null]) is str: # GH#32264 np.str_ object - return guess_datetime_format(first_non_nan_element, dayfirst=dayfirst) + guessed_format = guess_datetime_format( + first_non_nan_element, dayfirst=dayfirst + ) + if guessed_format is not None: + return guessed_format + warnings.warn( + "Could not infer format, so each element will be parsed " + "individually by `dateutil`. 
To ensure parsing is " + "consistent and as-expected, please specify a format.", + UserWarning, + stacklevel=find_stack_level(), + ) return None @@ -334,7 +350,6 @@ def _convert_listlike_datetimes( utc: bool = False, unit: str | None = None, errors: DateTimeErrorChoices = "raise", - infer_datetime_format: bool = False, dayfirst: bool | None = None, yearfirst: bool | None = None, exact: bool = True, @@ -355,8 +370,6 @@ def _convert_listlike_datetimes( None or string of the frequency of the passed data errors : str error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' - infer_datetime_format : bool, default False - inferring format behavior from to_datetime dayfirst : bool dayfirst parsing behavior from to_datetime yearfirst : bool @@ -419,24 +432,22 @@ def _convert_listlike_datetimes( arg = ensure_object(arg) require_iso8601 = False - if infer_datetime_format and format is None: + if format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - if format is not None: - # There is a special fast-path for iso8601 formatted - # datetime strings, so in those cases don't use the inferred - # format because this path makes process slower in this - # special case - format_is_iso8601 = format_is_iso(format) - if format_is_iso8601: - require_iso8601 = not infer_datetime_format + # There is a special fast-path for iso8601 formatted datetime strings + require_iso8601 = format is not None and format_is_iso(format) if format is not None and not require_iso8601: - res = _to_datetime_with_format( - arg, orig_arg, name, utc, format, exact, errors, infer_datetime_format + return _to_datetime_with_format( + arg, + orig_arg, + name, + utc, + format, + exact, + errors, ) - if res is not None: - return res result, tz_parsed = objects_to_datetime64ns( arg, @@ -466,8 +477,7 @@ def _array_strptime_with_fallback( fmt: str, exact: bool, errors: str, - infer_datetime_format: bool, -) -> Index | None: +) -> Index: """ Call array_strptime, with fallback 
behavior depending on 'errors'. """ @@ -485,21 +495,14 @@ def _array_strptime_with_fallback( else: result = arg except ValueError: - # if fmt was inferred, try falling back - # to array_to_datetime - terminate here - # for specified formats - if not infer_datetime_format: - if errors == "raise": - raise - if errors == "coerce": - result = np.empty(arg.shape, dtype="M8[ns]") - iresult = result.view("i8") - iresult.fill(iNaT) - else: - result = arg + if errors == "raise": + raise + if errors == "coerce": + result = np.empty(arg.shape, dtype="M8[ns]") + iresult = result.view("i8") + iresult.fill(iNaT) else: - # Indicates to the caller to fallback to objects_to_datetime64ns - return None + result = arg else: if any(tz is not None for tz in timezones): return _return_parsed_timezone_results(result, timezones, utc, name) @@ -515,10 +518,9 @@ def _to_datetime_with_format( fmt: str, exact: bool, errors: str, - infer_datetime_format: bool, -) -> Index | None: +) -> Index: """ - Try parsing with the given format, returning None on failure. + Try parsing with the given format. """ result = None @@ -538,9 +540,7 @@ def _to_datetime_with_format( return _box_as_indexlike(result, utc=utc, name=name) # fallback - res = _array_strptime_with_fallback( - arg, name, utc, fmt, exact, errors, infer_datetime_format - ) + res = _array_strptime_with_fallback(arg, name, utc, fmt, exact, errors) return res @@ -714,7 +714,7 @@ def to_datetime( format: str | None = None, exact: bool = True, unit: str | None = None, - infer_datetime_format: bool = False, + infer_datetime_format: lib.NoDefault | bool = lib.no_default, origin: str = "unix", cache: bool = True, ) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None: @@ -801,6 +801,11 @@ def to_datetime( of the datetime strings based on the first non-NaN element, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. + + .. 
deprecated:: 2.0.0 + A strict version of this argument is now the default; passing it has + no effect. + origin : scalar, default 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. @@ -916,24 +921,6 @@ def to_datetime( 1 2016-03-05 dtype: datetime64[ns] - Passing ``infer_datetime_format=True`` can often-times speedup a parsing - if its not an ISO8601 format exactly, but in a regular format. - - >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000) - >>> s.head() - 0 3/11/2000 - 1 3/12/2000 - 2 3/13/2000 - 3 3/11/2000 - 4 3/12/2000 - dtype: object - - >>> %timeit pd.to_datetime(s, infer_datetime_format=True) # doctest: +SKIP - 100 loops, best of 3: 10.4 ms per loop - - >>> %timeit pd.to_datetime(s, infer_datetime_format=False) # doctest: +SKIP - 1 loop, best of 3: 471 ms per loop - Using a unix epoch time >>> pd.to_datetime(1490195805, unit='s') @@ -982,7 +969,7 @@ def to_datetime( - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`: - >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00:15']) + >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15']) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], dtype='datetime64[ns]', freq=None) @@ -1008,7 +995,7 @@ def to_datetime( are constant: >>> from datetime import datetime - >>> pd.to_datetime(["2020-01-01 01:00 -01:00", datetime(2020, 1, 1, 3, 0)]) + >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)]) DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], dtype='datetime64[ns, UTC-01:00]', freq=None) @@ -1030,19 +1017,22 @@ def to_datetime( DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) - - Inputs can contain both naive and aware, string or datetime, the above + - Inputs can contain both strings and datetimes; the above rules still apply - >>> from datetime import 
timezone, timedelta - >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 12:00 -0530', - ... datetime(2020, 1, 1, 18), - ... datetime(2020, 1, 1, 18, - ... tzinfo=timezone(-timedelta(hours=1)))], - ... utc=True) - DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 17:30:00+00:00', - '2020-01-01 18:00:00+00:00', '2020-01-01 19:00:00+00:00'], + >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True) + DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None) """ + if infer_datetime_format is not lib.no_default: + warnings.warn( + "The argument 'infer_datetime_format' is deprecated and will " + "be removed in a future version. " + "A strict version of it is now the default, see " + "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " + "You can safely remove this argument.", + stacklevel=find_stack_level(), + ) if arg is None: return None @@ -1057,7 +1047,6 @@ def to_datetime( yearfirst=yearfirst, errors=errors, exact=exact, - infer_datetime_format=infer_datetime_format, ) # pylint: disable-next=used-before-assignment result: Timestamp | NaTType | Series | Index diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index ff94502d69ca3..111a827459022 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -130,13 +130,11 @@ def __init__(self, kwds) -> None: self.true_values = kwds.get("true_values") self.false_values = kwds.get("false_values") - self.infer_datetime_format = kwds.pop("infer_datetime_format", False) self.cache_dates = kwds.pop("cache_dates", True) self._date_conv = _make_date_converter( date_parser=self.date_parser, dayfirst=self.dayfirst, - infer_datetime_format=self.infer_datetime_format, cache_dates=self.cache_dates, ) @@ -1115,7 +1113,6 @@ def _get_empty_meta( def _make_date_converter( date_parser=None, dayfirst: bool = False, - infer_datetime_format: bool = False, cache_dates: bool = 
True, ): def converter(*date_cols): @@ -1128,7 +1125,6 @@ def converter(*date_cols): utc=False, dayfirst=dayfirst, errors="ignore", - infer_datetime_format=infer_datetime_format, cache=cache_dates, ).to_numpy() @@ -1192,7 +1188,6 @@ def converter(*date_cols): "verbose": False, "encoding": None, "compression": None, - "infer_datetime_format": False, "skip_blank_lines": True, "encoding_errors": "strict", "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR, diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 23a335ec0b965..96c2fd08bbc59 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -247,6 +247,10 @@ format of the datetime strings in the columns, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by 5-10x. + + .. deprecated:: 2.0.0 + A strict version of this argument is now the default; passing it has no effect. + keep_date_col : bool, default False If True and `parse_dates` specifies combining multiple columns then keep the original columns. 
@@ -449,7 +453,6 @@ "decimal", "iterator", "dayfirst", - "infer_datetime_format", "verbose", "skipinitialspace", "low_memory", @@ -607,7 +610,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -663,7 +666,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -719,7 +722,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -775,7 +778,7 @@ def read_csv( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] | None = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -843,7 +846,7 @@ def read_csv( skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, - infer_datetime_format: bool = False, + infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = False, date_parser=None, dayfirst: bool = False, @@ -874,6 +877,15 @@ def read_csv( storage_options: StorageOptions = None, use_nullable_dtypes: bool = False, ) -> DataFrame | TextFileReader: + if infer_datetime_format is not lib.no_default: + warnings.warn( + "The argument 'infer_datetime_format' is deprecated and will " + "be removed in a future version. 
" + "A strict version of it is now the default, see " + "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. " + "You can safely remove this argument.", + stacklevel=find_stack_level(), + ) # locals() should never be modified kwds = locals().copy() del kwds["filepath_or_buffer"] @@ -920,7 +932,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -976,7 +988,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -1032,7 +1044,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -1088,7 +1100,7 @@ def read_table( verbose: bool = ..., skip_blank_lines: bool = ..., parse_dates: bool | Sequence[Hashable] = ..., - infer_datetime_format: bool = ..., + infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., dayfirst: bool = ..., @@ -1156,7 +1168,7 @@ def read_table( skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] = False, - infer_datetime_format: bool = False, + infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = False, date_parser=None, dayfirst: bool = False, @@ -1737,10 +1749,6 @@ def TextParser(*args, **kwds) -> TextFileReader: transformed content. encoding : str, optional Encoding to use for UTF when reading/writing (ex. 
'utf-8') - infer_datetime_format: bool, default False - If True and `parse_dates` is True for a column, try to infer the - datetime format based on the first datetime string. If the format - can be inferred, there often will be a large parsing speed-up. float_precision : str, optional Specifies which converter the C engine should use for floating-point values. The options are `None` or `high` for the ordinary converter, diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index eb6ad4b575414..557cdd96bf00c 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -688,6 +688,13 @@ def test_EA_types(self, engine, data, request): reason=f"Parameterized types with tz={pa_dtype.tz} not supported.", ) ) + elif pa.types.is_timestamp(pa_dtype) and pa_dtype.unit in ("us", "ns"): + request.node.add_marker( + pytest.mark.xfail( + raises=ValueError, + reason="https://github.com/pandas-dev/pandas/issues/49767", + ) + ) elif pa.types.is_binary(pa_dtype): request.node.add_marker( pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index e6db7ec8ed3d7..776e5b85317ff 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -405,11 +405,11 @@ def test_drop_level_nonunique_datetime(self): idx = Index([2, 3, 4, 4, 5], name="id") idxdt = pd.to_datetime( [ - "201603231400", - "201603231500", - "201603231600", - "201603231600", - "201603231700", + "2016-03-23 14:00", + "2016-03-23 15:00", + "2016-03-23 16:00", + "2016-03-23 16:00", + "2016-03-23 17:00", ] ) df = DataFrame(np.arange(10).reshape(5, 2), columns=list("ab"), index=idx) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 638387452903b..519be89c8793a 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ 
b/pandas/tests/frame/methods/test_to_csv.py @@ -27,7 +27,7 @@ class TestDataFrameToCSV: def read_csv(self, path, **kwargs): - params = {"index_col": 0, "parse_dates": True} + params = {"index_col": 0} params.update(**kwargs) return read_csv(path, **params) @@ -46,17 +46,17 @@ def test_to_csv_from_csv1(self, float_frame, datetime_frame): # freq does not roundtrip datetime_frame.index = datetime_frame.index._with_freq(None) datetime_frame.to_csv(path) - recons = self.read_csv(path) + recons = self.read_csv(path, parse_dates=True) tm.assert_frame_equal(datetime_frame, recons) datetime_frame.to_csv(path, index_label="index") - recons = self.read_csv(path, index_col=None) + recons = self.read_csv(path, index_col=None, parse_dates=True) assert len(recons.columns) == len(datetime_frame.columns) + 1 # no index datetime_frame.to_csv(path, index=False) - recons = self.read_csv(path, index_col=None) + recons = self.read_csv(path, index_col=None, parse_dates=True) tm.assert_almost_equal(datetime_frame.values, recons.values) # corner case @@ -514,7 +514,10 @@ def test_to_csv_multiindex(self, float_frame, datetime_frame): tsframe.index = MultiIndex.from_arrays(new_index) tsframe.to_csv(path, index_label=["time", "foo"]) - recons = self.read_csv(path, index_col=[0, 1]) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + recons = self.read_csv(path, index_col=[0, 1], parse_dates=True) # TODO to_csv drops column name tm.assert_frame_equal(tsframe, recons, check_names=False) @@ -1056,7 +1059,7 @@ def test_to_csv_date_format(self, datetime_frame): # test NaTs nat_index = to_datetime( - ["NaT"] * 10 + ["2000-01-01", "1/1/2000", "1-1-2000"] + ["NaT"] * 10 + ["2000-01-01", "2000-01-01", "2000-01-01"] ) nat_frame = DataFrame({"A": nat_index}, index=nat_index) nat_frame.to_csv(path, date_format="%Y-%m-%d") diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 7e5bfff53054a..2b0c607d6851a 100644 --- 
a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -697,7 +697,8 @@ def test_max_nan_bug(): -05-06,2013-05-06 00:00:00,,log.log -05-07,2013-05-07 00:00:00,OE,xlsx""" - df = pd.read_csv(StringIO(raw), parse_dates=[0]) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + df = pd.read_csv(StringIO(raw), parse_dates=[0]) gb = df.groupby("Date") r = gb[["File"]].max() e = gb["File"].max().to_frame() diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 6a63090dd374f..246de06a04de2 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1060,19 +1060,13 @@ def test_datetimeindex_constructor_misc(self): arr = np.array(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"], dtype="O") idx4 = DatetimeIndex(arr) - arr = to_datetime(["1/1/2005", "1/2/2005", "1/3/2005", "2005-01-04"]) - idx5 = DatetimeIndex(arr) - - arr = to_datetime(["1/1/2005", "1/2/2005", "Jan 3, 2005", "2005-01-04"]) - idx6 = DatetimeIndex(arr) - - idx7 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) - idx8 = DatetimeIndex( + idx5 = DatetimeIndex(["12/05/2007", "25/01/2008"], dayfirst=True) + idx6 = DatetimeIndex( ["2007/05/12", "2008/01/25"], dayfirst=False, yearfirst=True ) - tm.assert_index_equal(idx7, idx8) + tm.assert_index_equal(idx5, idx6) - for other in [idx2, idx3, idx4, idx5, idx6]: + for other in [idx2, idx3, idx4]: assert (idx1.values == other.values).all() sdate = datetime(1999, 12, 25) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 396022488aaf5..c2c1073eef36d 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1214,10 +1214,16 @@ def test_equals_op_index_vs_mi_same_length(self): expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) - 
@pytest.mark.parametrize("dt_conv", [pd.to_datetime, pd.to_timedelta]) - def test_dt_conversion_preserves_name(self, dt_conv): + @pytest.mark.parametrize( + "dt_conv, arg", + [ + (pd.to_datetime, ["2000-01-01", "2000-01-02"]), + (pd.to_timedelta, ["01:02:03", "01:02:04"]), + ], + ) + def test_dt_conversion_preserves_name(self, dt_conv, arg): # GH 10875 - index = Index(["01:02:03", "01:02:04"], name="label") + index = Index(arg, name="label") assert index.name == dt_conv(index).name def test_cached_properties_not_settable(self): diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 24b18c8657546..6656face3be84 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -57,8 +57,8 @@ def _set_noconvert_columns(self): return CParserWrapper._set_noconvert_columns(self) data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" +0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" parse_dates = [[1, 2]] cols = { diff --git a/pandas/tests/io/parser/dtypes/test_categorical.py b/pandas/tests/io/parser/dtypes/test_categorical.py index 3b8c520004f12..a0deebecdfff8 100644 --- a/pandas/tests/io/parser/dtypes/test_categorical.py +++ b/pandas/tests/io/parser/dtypes/test_categorical.py @@ -262,7 +262,7 @@ def test_categorical_coerces_timestamp(all_parsers): parser = all_parsers dtype = {"b": CategoricalDtype([Timestamp("2014")])} - data = "b\n2014-01-01\n2014-01-01T00:00:00" + data = "b\n2014-01-01\n2014-01-01" expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) result = parser.read_csv(StringIO(data), dtype=dtype) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 5b21028ade84c..c366613c2815f 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -803,7 +803,13 @@ def test_yy_format_with_year_first(all_parsers, 
parse_dates): 090331,0830,5,6 """ parser = all_parsers - result = parser.read_csv(StringIO(data), index_col=0, parse_dates=parse_dates) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + index_col=0, + parse_dates=parse_dates, + ) index = DatetimeIndex( [ datetime(2009, 1, 31, 0, 10, 0), @@ -876,7 +882,13 @@ def test_multi_index_parse_dates(all_parsers, index_col): columns=["A", "B", "C"], index=index, ) - result = parser.read_csv(StringIO(data), index_col=index_col, parse_dates=True) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + index_col=index_col, + parse_dates=True, + ) tm.assert_frame_equal(result, expected) @@ -1213,19 +1225,55 @@ def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): @pytest.mark.parametrize("cache_dates", [True, False]) -@pytest.mark.parametrize("value", ["nan", "0", ""]) +@pytest.mark.parametrize("value", ["nan", ""]) def test_bad_date_parse(all_parsers, cache_dates, value): # if we have an invalid date make sure that we handle this with # and w/o the cache properly parser = all_parsers s = StringIO((f"{value},\n") * 50000) - parser.read_csv( + if parser.engine == "pyarrow": + # None in input gets converted to 'None', for which + # pandas tries to guess the datetime format, triggering + # the warning. TODO: parse dates directly in pyarrow, see + # https://github.com/pandas-dev/pandas/issues/48017 + warn = UserWarning + else: + warn = None + parser.read_csv_check_warnings( + warn, + "Could not infer format", + s, + header=None, + names=["foo", "bar"], + parse_dates=["foo"], + cache_dates=cache_dates, + ) + + +@pytest.mark.parametrize("cache_dates", [True, False]) +@pytest.mark.parametrize("value", ["0"]) +def test_bad_date_parse_with_warning(all_parsers, cache_dates, value): + # if we have an invalid date make sure that we handle this with + # and w/o the cache properly. 
+ parser = all_parsers + s = StringIO((f"{value},\n") * 50000) + + if parser.engine == "pyarrow": + # pyarrow reads "0" as 0 (of type int64), and so + # pandas doesn't try to guess the datetime format + # TODO: parse dates directly in pyarrow, see + # https://github.com/pandas-dev/pandas/issues/48017 + warn = None + else: + warn = UserWarning + parser.read_csv_check_warnings( + warn, + "Could not infer format", s, header=None, names=["foo", "bar"], parse_dates=["foo"], - infer_datetime_format=False, cache_dates=cache_dates, ) @@ -1243,6 +1291,19 @@ def test_parse_dates_empty_string(all_parsers): tm.assert_frame_equal(result, expected) +def test_parse_dates_infer_datetime_format_warning(all_parsers): + # GH 49024 + parser = all_parsers + data = "Date,test\n2012-01-01,1\n,2" + parser.read_csv_check_warnings( + UserWarning, + "The argument 'infer_datetime_format' is deprecated", + StringIO(data), + parse_dates=["Date"], + infer_datetime_format=True, + ) + + @xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", @@ -1589,7 +1650,13 @@ def test_parse_timezone(all_parsers): def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers expected = DataFrame({0: [date_string]}, dtype="object") - result = parser.read_csv(StringIO(date_string), header=None, parse_dates=[0]) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(date_string), + header=None, + parse_dates=[0], + ) tm.assert_frame_equal(result, expected) @@ -1620,9 +1687,9 @@ def test_parse_delimited_date_swap_no_warning( @pytest.mark.parametrize( "date_string,dayfirst,expected", [ - # %d/%m/%Y; month > 12 thus replacement + # %d/%m/%Y; month > 12 ("13/02/2019", False, datetime(2019, 2, 13)), - # %m/%d/%Y; day > 12 thus there will be no replacement + # %m/%d/%Y; day > 12 ("02/13/2019", True, datetime(2019, 2, 13)), ], ) @@ -1631,7 +1698,10 @@ def test_parse_delimited_date_swap_with_warning( ): parser = all_parsers expected = 
DataFrame({0: [expected]}, dtype="datetime64[ns]") - warning_msg = "Specify a format to ensure consistent parsing" + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." + ) result = parser.read_csv_check_warnings( UserWarning, warning_msg, @@ -1645,13 +1715,11 @@ def test_parse_delimited_date_swap_with_warning( def test_parse_multiple_delimited_dates_with_swap_warnings(): # GH46210 - warning_msg = "Specify a format to ensure consistent parsing" - with tm.assert_produces_warning(UserWarning, match=warning_msg) as record: + with pytest.raises( + ValueError, + match=r"^time data '31/05/2000' does not match format '%m/%d/%Y' \(match\)$", + ): pd.to_datetime(["01/01/2000", "31/05/2000", "31/05/2001", "01/02/2000"]) - assert len({str(warning.message) for warning in record}) == 1 - # Using set(record) as repetitions of the same warning are suppressed - # https://docs.python.org/3/library/warnings.html - # and here we care to check that the warning is only shows once to users. 
def _helper_hypothesis_delimited_date(call, date_string, **kwargs): @@ -1738,7 +1806,13 @@ def test_date_parser_and_names(all_parsers): # GH#33699 parser = all_parsers data = StringIO("""x,y\n1,2""") - result = parser.read_csv(data, parse_dates=["B"], names=["B"]) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + data, + parse_dates=["B"], + names=["B"], + ) expected = DataFrame({"B": ["y", "2"]}, index=["x", "1"]) tm.assert_frame_equal(result, expected) @@ -1785,7 +1859,9 @@ def test_date_parser_usecols_thousands(all_parsers): """ parser = all_parsers - result = parser.read_csv( + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", StringIO(data), parse_dates=[1], usecols=[1, 2], @@ -1813,97 +1889,51 @@ def test_parse_dates_and_keep_orgin_column(all_parsers): def test_dayfirst_warnings(): # GH 12585 - warning_msg_day_first = ( - r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) was " - r"specified. This may lead to inconsistently parsed dates! Specify a format " - r"to ensure consistent parsing." - ) - warning_msg_month_first = ( - "Parsing dates in MM/DD/YYYY format when dayfirst=True was " - "specified. This may lead to inconsistently parsed dates! Specify a format " - "to ensure consistent parsing." - ) # CASE 1: valid input input = "date\n31/12/2014\n10/03/2011" - expected_consistent = DatetimeIndex( + expected = DatetimeIndex( ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" ) - expected_inconsistent = DatetimeIndex( - ["2014-12-31", "2011-10-03"], dtype="datetime64[ns]", freq=None, name="date" + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." ) # A. 
dayfirst arg correct, no warning res1 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" ).index - tm.assert_index_equal(expected_consistent, res1) + tm.assert_index_equal(expected, res1) - # B. dayfirst arg incorrect, warning + incorrect output - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + # B. dayfirst arg incorrect, warning + with tm.assert_produces_warning(UserWarning, match=warning_msg): res2 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" ).index - tm.assert_index_equal(expected_inconsistent, res2) - - # C. dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res3 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" - ).index - tm.assert_index_equal(expected_inconsistent, res3) - - # D. infer_datetime_format=True overrides dayfirst default - # no warning + correct result - res4 = read_csv( - StringIO(input), - parse_dates=["date"], - infer_datetime_format=True, - index_col="date", - ).index - tm.assert_index_equal(expected_consistent, res4) + tm.assert_index_equal(expected, res2) # CASE 2: invalid input # cannot consistently process with single format - # warnings *always* raised + # return to user unaltered # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" - expected = DatetimeIndex( - ["2014-12-31", "2011-03-30"], dtype="datetime64[ns]", freq=None, name="date" - ) + expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") # A. use dayfirst=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_month_first): - res5 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" - ).index + res5 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" + ).index tm.assert_index_equal(expected, res5) # B. 
use dayfirst=False - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + with tm.assert_produces_warning(UserWarning, match=warning_msg): res6 = read_csv( StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" ).index tm.assert_index_equal(expected, res6) - # C. use dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res7 = read_csv( - StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" - ).index - tm.assert_index_equal(expected, res7) - - # D. use infer_datetime_format=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res8 = read_csv( - StringIO(input), - parse_dates=["date"], - infer_datetime_format=True, - index_col="date", - ).index - tm.assert_index_equal(expected, res8) - @pytest.mark.parametrize( "date_string, dayfirst", @@ -1926,9 +1956,11 @@ def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): expected = DatetimeIndex( ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" ) - with tm.assert_produces_warning( - UserWarning, match=r"may lead to inconsistently parsed dates" - ): + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." 
+ ) + with tm.assert_produces_warning(UserWarning, match=warning_msg): res = read_csv( StringIO(initial_value), parse_dates=["date"], @@ -1943,7 +1975,12 @@ def test_infer_first_column_as_index(all_parsers): # GH#11019 parser = all_parsers data = "a,b,c\n1970-01-01,2,3,4" - result = parser.read_csv(StringIO(data), parse_dates=["a"]) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + parse_dates=["a"], + ) expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 50000dab8a7aa..4823df1da9959 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -31,8 +31,8 @@ def test_usecols_with_parse_dates(all_parsers, usecols): # see gh-9755 data = """a,b,c,d,e -0,1,20140101,0900,4 -0,1,20140102,1000,4""" +0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" parser = all_parsers parse_dates = [[1, 2]] @@ -124,7 +124,13 @@ def test_usecols_with_parse_dates4(all_parsers): } expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) + result = parser.read_csv_check_warnings( + UserWarning, + "Could not infer format", + StringIO(data), + usecols=usecols, + parse_dates=parse_dates, + ) tm.assert_frame_equal(result, expected) @@ -138,8 +144,8 @@ def test_usecols_with_parse_dates4(all_parsers): ) def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): # see gh-9755 - s = """0,1,20140101,0900,4 -0,1,20140102,1000,4""" + s = """0,1,2014-01-01,09:00,4 +0,1,2014-01-02,10:00,4""" parse_dates = [[1, 2]] parser = all_parsers diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index db37b1785af5c..6764ff27674ab 100644 --- a/pandas/tests/io/test_sql.py +++ 
b/pandas/tests/io/test_sql.py @@ -1395,7 +1395,7 @@ def test_sqlalchemy_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) df = DataFrame( - {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} ) db = sql.SQLDatabase(self.conn) table = sql.SQLTable("test_type", db, frame=df) @@ -1577,7 +1577,7 @@ def test_sqlite_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) df = DataFrame( - {"time": to_datetime(["201412120154", "201412110254"], utc=True)} + {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)} ) db = sql.SQLiteDatabase(self.conn) table = sql.SQLiteTable("test_type", db, frame=df) diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 412c8a8dde175..17d1e7e00653b 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -457,7 +457,7 @@ def test_day_first_parse_dates(parser): ) with tm.assert_produces_warning( - UserWarning, match="Parsing dates in DD/MM/YYYY format" + UserWarning, match="Parsing dates in %d/%m/%Y format" ): df_result = read_xml(xml, parse_dates=["date"], parser=parser) df_iter = read_xml_iterparse( diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 8da663b8e9977..d0b3b3c413e7a 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -161,8 +161,8 @@ def dtc(self): return converter.DatetimeConverter() def test_convert_accepts_unicode(self, dtc): - r1 = dtc.convert("12:22", None, None) - r2 = dtc.convert("12:22", None, None) + r1 = dtc.convert("2000-01-01 12:22", None, None) + r2 = dtc.convert("2000-01-01 12:22", None, None) assert r1 == r2, "DatetimeConverter.convert should accept unicode" def test_conversion(self, dtc): diff --git a/pandas/tests/series/methods/test_to_csv.py 
b/pandas/tests/series/methods/test_to_csv.py index 28519fc9b529f..7827483644634 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -13,7 +13,7 @@ class TestSeriesToCSV: def read_csv(self, path, **kwargs): - params = {"index_col": 0, "header": None, "parse_dates": True} + params = {"index_col": 0, "header": None} params.update(**kwargs) header = params.get("header") @@ -30,7 +30,7 @@ def test_from_csv(self, datetime_series, string_series): with tm.ensure_clean() as path: datetime_series.to_csv(path, header=False) - ts = self.read_csv(path) + ts = self.read_csv(path, parse_dates=True) tm.assert_series_equal(datetime_series, ts, check_names=False) assert ts.name is None @@ -55,7 +55,7 @@ def test_from_csv(self, datetime_series, string_series): with open(path, "w") as outfile: outfile.write("1998-01-01|1.0\n1999-01-01|2.0") - series = self.read_csv(path, sep="|") + series = self.read_csv(path, sep="|", parse_dates=True) check_series = Series( {datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0} ) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index df83a5e410e71..818211cf0fa2a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1218,7 +1218,8 @@ def test_value_counts_datetime_outofbounds(self): tm.assert_series_equal(res, exp) # GH 12424 - res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore") exp = Series(["2362-01-01", np.nan], dtype=object) tm.assert_series_equal(res, exp) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 9ea7530640035..48844beed30f4 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -220,7 +220,6 @@ def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected): ), (["201010", 
pd.NA], "%y%m%d", DatetimeIndex(["2020-10-10", "NaT"])), (["201010", pd.NA], "%d%m%y", DatetimeIndex(["2010-10-20", "NaT"])), - (["201010", pd.NA], None, DatetimeIndex(["2010-10-20", "NaT"])), ([None, np.nan, pd.NA], None, DatetimeIndex(["NaT", "NaT", "NaT"])), ([None, np.nan, pd.NA], "%Y%m%d", DatetimeIndex(["NaT", "NaT", "NaT"])), ], @@ -230,6 +229,13 @@ def test_to_datetime_with_NA(self, data, format, expected): result = to_datetime(data, format=format) tm.assert_index_equal(result, expected) + def test_to_datetime_with_NA_with_warning(self): + # GH#42957 + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result = to_datetime(["201010", pd.NA]) + expected = DatetimeIndex(["2010-10-20", "NaT"]) + tm.assert_index_equal(result, expected) + def test_to_datetime_format_integer(self, cache): # GH 10178 ser = Series([2000, 2001, 2002]) @@ -576,8 +582,7 @@ def test_to_datetime_mixed_datetime_and_string_with_format_raises( ): to_datetime([ts1, ts2], format=fmt, utc=False) - @pytest.mark.parametrize("infer_datetime_format", [True, False]) - def test_to_datetime_np_str(self, infer_datetime_format): + def test_to_datetime_np_str(self): # GH#32264 # GH#48969 value = np.str_("2019-02-04 10:18:46.297000+0000") @@ -589,11 +594,11 @@ def test_to_datetime_np_str(self, infer_datetime_format): assert to_datetime(value) == exp assert to_datetime(ser.iloc[0]) == exp - res = to_datetime([value], infer_datetime_format=infer_datetime_format) + res = to_datetime([value]) expected = Index([exp]) tm.assert_index_equal(res, expected) - res = to_datetime(ser, infer_datetime_format=infer_datetime_format) + res = to_datetime(ser) expected = Series(expected) tm.assert_series_equal(res, expected) @@ -727,7 +732,8 @@ def test_to_datetime_YYYYMMDD(self): def test_to_datetime_unparsable_ignore(self): # unparsable ser = "Month 1, 1999" - assert to_datetime(ser, errors="ignore") == ser + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + 
assert to_datetime(ser, errors="ignore") == ser @td.skip_if_windows # `tm.set_timezone` does not work in windows def test_to_datetime_now(self): @@ -1037,7 +1043,10 @@ def test_datetime_bool_arrays_mixed(self, cache): msg = f"{type(cache)} is not convertible to datetime" with pytest.raises(TypeError, match=msg): to_datetime([False, datetime.today()], cache=cache) - with pytest.raises(TypeError, match=msg): + with pytest.raises( + ValueError, + match=r"^time data 'True' does not match format '%Y%m%d' \(match\)$", + ): to_datetime(["20130101", True], cache=cache) tm.assert_index_equal( to_datetime([0, False, NaT, 0.0], errors="coerce", cache=cache), @@ -1054,18 +1063,17 @@ def test_datetime_invalid_datatype(self, arg): to_datetime(arg) @pytest.mark.parametrize("value", ["a", "00:01:99"]) - @pytest.mark.parametrize("infer", [True, False]) - @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) - def test_datetime_invalid_scalar(self, value, format, infer): + @pytest.mark.parametrize( + "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + ) + def test_datetime_invalid_scalar(self, value, format, warning): # GH24763 - res = to_datetime( - value, errors="ignore", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="ignore", format=format) assert res == value - res = to_datetime( - value, errors="coerce", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="coerce", format=format) assert res is NaT msg = ( @@ -1074,51 +1082,46 @@ def test_datetime_invalid_scalar(self, value, format, infer): f"Given date string {value} not likely a datetime" ) with pytest.raises(ValueError, match=msg): - to_datetime( - value, errors="raise", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + 
to_datetime(value, errors="raise", format=format) @pytest.mark.parametrize("value", ["3000/12/11 00:00:00"]) - @pytest.mark.parametrize("infer", [True, False]) - @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) - def test_datetime_outofbounds_scalar(self, value, format, infer): + @pytest.mark.parametrize( + "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + ) + def test_datetime_outofbounds_scalar(self, value, format, warning): # GH24763 - res = to_datetime( - value, errors="ignore", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="ignore", format=format) assert res == value - res = to_datetime( - value, errors="coerce", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(value, errors="coerce", format=format) assert res is NaT if format is not None: msg = "is a bad directive in format|Out of bounds .* present at position 0" with pytest.raises(ValueError, match=msg): - to_datetime( - value, errors="raise", format=format, infer_datetime_format=infer - ) + to_datetime(value, errors="raise", format=format) else: msg = "Out of bounds .* present at position 0" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime( - value, errors="raise", format=format, infer_datetime_format=infer - ) + with pytest.raises( + OutOfBoundsDatetime, match=msg + ), tm.assert_produces_warning(warning, match="Could not infer format"): + to_datetime(value, errors="raise", format=format) @pytest.mark.parametrize("values", [["a"], ["00:01:99"], ["a", "b", "99:00:00"]]) - @pytest.mark.parametrize("infer", [True, False]) - @pytest.mark.parametrize("format", [None, "H%:M%:S%"]) - def test_datetime_invalid_index(self, values, format, infer): + @pytest.mark.parametrize( + "format,warning", [(None, UserWarning), ("H%:M%:S%", None)] + ) + def test_datetime_invalid_index(self, 
values, format, warning): # GH24763 - res = to_datetime( - values, errors="ignore", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(values, errors="ignore", format=format) tm.assert_index_equal(res, Index(values)) - res = to_datetime( - values, errors="coerce", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + res = to_datetime(values, errors="coerce", format=format) tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values))) msg = ( @@ -1127,9 +1130,8 @@ def test_datetime_invalid_index(self, values, format, infer): "second must be in 0..59" ) with pytest.raises(ValueError, match=msg): - to_datetime( - values, errors="raise", format=format, infer_datetime_format=infer - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + to_datetime(values, errors="raise", format=format) @pytest.mark.parametrize("utc", [True, None]) @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None]) @@ -1263,7 +1265,6 @@ def test_to_datetime_coerce(self): ) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("infer_datetime_format", [True, False]) @pytest.mark.parametrize( "errors, expected", [ @@ -1271,28 +1272,28 @@ def test_to_datetime_coerce(self): ("ignore", Index(["200622-12-31", "111111-24-11"])), ], ) - def test_to_datetime_malformed_no_raise( - self, errors, expected, infer_datetime_format - ): + def test_to_datetime_malformed_no_raise(self, errors, expected): # GH 28299 # GH 48633 ts_strings = ["200622-12-31", "111111-24-11"] - result = to_datetime( - ts_strings, errors=errors, infer_datetime_format=infer_datetime_format - ) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result = to_datetime(ts_strings, errors=errors) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("infer_datetime_format", [True, False]) - def 
test_to_datetime_malformed_raise(self, infer_datetime_format): + def test_to_datetime_malformed_raise(self): # GH 48633 ts_strings = ["200622-12-31", "111111-24-11"] with pytest.raises( ValueError, match=r"^hour must be in 0\.\.23: 111111-24-11 present at position 1$", ): - to_datetime( - ts_strings, errors="raise", infer_datetime_format=infer_datetime_format - ) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime( + ts_strings, + errors="raise", + ) def test_iso_8601_strings_with_same_offset(self): # GH 17697, 11736 @@ -1332,36 +1333,6 @@ def test_iso_8601_strings_with_different_offsets_utc(self): ) tm.assert_index_equal(result, expected) - def test_iso8601_strings_mixed_offsets_with_naive(self): - # GH 24992 - result = to_datetime( - [ - "2018-11-28T00:00:00", - "2018-11-28T00:00:00+12:00", - "2018-11-28T00:00:00", - "2018-11-28T00:00:00+06:00", - "2018-11-28T00:00:00", - ], - utc=True, - ) - expected = to_datetime( - [ - "2018-11-28T00:00:00", - "2018-11-27T12:00:00", - "2018-11-28T00:00:00", - "2018-11-27T18:00:00", - "2018-11-28T00:00:00", - ], - utc=True, - ) - tm.assert_index_equal(result, expected) - - def test_iso8601_strings_mixed_offsets_with_naive_reversed(self): - items = ["2018-11-28T00:00:00+12:00", "2018-11-28T00:00:00"] - result = to_datetime(items, utc=True) - expected = to_datetime(list(reversed(items)), utc=True)[::-1] - tm.assert_index_equal(result, expected) - def test_mixed_offsets_with_native_datetime_raises(self): # GH 25978 @@ -1515,23 +1486,26 @@ def test_unit_with_numeric(self, cache, errors, dtype): tm.assert_index_equal(result, expected) @pytest.mark.parametrize( - "exp, arr", + "exp, arr, warning", [ [ ["NaT", "2015-06-19 05:33:20", "2015-05-27 22:33:20"], ["foo", 1.434692e18, 1.432766e18], + UserWarning, ], [ ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"], [1.434692e18, 1.432766e18, "foo", "NaT"], + None, ], ], ) - def test_unit_with_numeric_coerce(self, cache, exp, arr): 
+ def test_unit_with_numeric_coerce(self, cache, exp, arr, warning): # but we want to make sure that we are coercing # if we have ints/strings expected = DatetimeIndex(exp) - result = to_datetime(arr, errors="coerce", cache=cache) + with tm.assert_produces_warning(warning, match="Could not infer format"): + result = to_datetime(arr, errors="coerce", cache=cache) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1852,7 +1826,10 @@ def test_to_datetime_barely_out_of_bounds(self): msg = "Out of bounds .* present at position 0" with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(arr) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(arr) @pytest.mark.parametrize( "arg, exp_str", @@ -2016,7 +1993,7 @@ def test_to_datetime_on_datetime64_series(self, cache): def test_to_datetime_with_space_in_series(self, cache): # GH 6428 ser = Series(["10/18/2006", "10/18/2008", " "]) - msg = r"(\(')?String does not contain a date(:', ' '\))?" 
+ msg = r"^time data ' ' does not match format '%m/%d/%Y' \(match\)$" with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) @@ -2082,7 +2059,7 @@ def test_to_datetime_strings(self, cache): def test_to_datetime_strings_variation(self, cache): array = ["2012", "20120101", "20120101 12:01:01"] - expected = list(to_datetime(array, cache=cache)) + expected = [to_datetime(dt_str, cache=cache) for dt_str in array] result = [Timestamp(date_str) for date_str in array] tm.assert_almost_equal(result, expected) @@ -2138,9 +2115,7 @@ def test_to_datetime_overflow(self): def test_string_na_nat_conversion(self, cache): # GH #999, #858 - strings = np.array( - ["1/1/2000", "1/2/2000", np.nan, "1/4/2000, 12:34:56"], dtype=object - ) + strings = np.array(["1/1/2000", "1/2/2000", np.nan, "1/4/2000"], dtype=object) expected = np.empty(4, dtype="M8[ns]") for i, val in enumerate(strings): @@ -2162,15 +2137,22 @@ def test_string_na_nat_conversion_malformed(self, cache): # GH 10636, default is now 'raise' msg = r"Unknown string format:|day is out of range for month" with pytest.raises(ValueError, match=msg): - to_datetime(malformed, errors="raise", cache=cache) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(malformed, errors="raise", cache=cache) - result = to_datetime(malformed, errors="ignore", cache=cache) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result = to_datetime(malformed, errors="ignore", cache=cache) # GH 21864 expected = Index(malformed) tm.assert_index_equal(result, expected) with pytest.raises(ValueError, match=msg): - to_datetime(malformed, errors="raise", cache=cache) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(malformed, errors="raise", cache=cache) def test_string_na_nat_conversion_with_name(self, cache): idx = ["a", "b", "c", 
"d", "e"] @@ -2255,80 +2237,39 @@ def test_dayfirst(self, cache): def test_dayfirst_warnings_valid_input(self): # GH 12585 - warning_msg_day_first = ( - r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) " - "was specified. This may lead to inconsistently parsed dates! Specify a " - "format to ensure consistent parsing." + warning_msg = ( + "Parsing dates in .* format when dayfirst=.* was specified. " + "Pass `dayfirst=.*` or specify a format to silence this warning." ) # CASE 1: valid input arr = ["31/12/2014", "10/03/2011"] - expected_consistent = DatetimeIndex( + expected = DatetimeIndex( ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None ) - expected_inconsistent = DatetimeIndex( - ["2014-12-31", "2011-10-03"], dtype="datetime64[ns]", freq=None - ) # A. dayfirst arg correct, no warning res1 = to_datetime(arr, dayfirst=True) - tm.assert_index_equal(expected_consistent, res1) + tm.assert_index_equal(expected, res1) - # B. dayfirst arg incorrect, warning + incorrect output - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + # B. dayfirst arg incorrect, warning + with tm.assert_produces_warning(UserWarning, match=warning_msg): res2 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected_inconsistent, res2) - - # C. dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res3 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected_inconsistent, res3) - - # D. 
infer_datetime_format=True overrides dayfirst default - # no warning + correct result - res4 = to_datetime(arr, infer_datetime_format=True) - tm.assert_index_equal(expected_consistent, res4) + tm.assert_index_equal(expected, res2) def test_dayfirst_warnings_invalid_input(self): # CASE 2: invalid input # cannot consistently process with single format - # warnings *always* raised - warning_msg_day_first = ( - r"Parsing dates in DD/MM/YYYY format when dayfirst=False \(the default\) " - "was specified. This may lead to inconsistently parsed dates! Specify a " - "format to ensure consistent parsing." - ) - warning_msg_month_first = ( - r"Parsing dates in MM/DD/YYYY format when dayfirst=True " - "was specified. This may lead to inconsistently parsed dates! Specify a " - "format to ensure consistent parsing." - ) + # ValueError *always* raised - arr = ["31/12/2014", "03/30/2011"] # first in DD/MM/YYYY, second in MM/DD/YYYY - expected = DatetimeIndex( - ["2014-12-31", "2011-03-30"], dtype="datetime64[ns]", freq=None - ) - - # A. use dayfirst=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_month_first): - res5 = to_datetime(arr, dayfirst=True) - tm.assert_index_equal(expected, res5) - - # B. use dayfirst=False - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res6 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected, res6) - - # C. use dayfirst default arg, same as B - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res7 = to_datetime(arr, dayfirst=False) - tm.assert_index_equal(expected, res7) + arr = ["31/12/2014", "03/30/2011"] - # D. 
use infer_datetime_format=True - with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): - res8 = to_datetime(arr, infer_datetime_format=True) - tm.assert_index_equal(expected, res8) + with pytest.raises( + ValueError, + match=r"time data '03/30/2011' does not match format '%d/%m/%Y' \(match\)$", + ): + to_datetime(arr, dayfirst=True) @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray]) def test_to_datetime_dta_tz(self, klass): @@ -2385,48 +2326,41 @@ def test_to_datetime_infer_datetime_format_consistent_format( s_as_dt_strings = ser.apply(lambda x: x.strftime(test_format)) with_format = to_datetime(s_as_dt_strings, format=test_format, cache=cache) - no_infer = to_datetime( - s_as_dt_strings, infer_datetime_format=False, cache=cache - ) - yes_infer = to_datetime( - s_as_dt_strings, infer_datetime_format=True, cache=cache - ) + without_format = to_datetime(s_as_dt_strings, cache=cache) - # Whether the format is explicitly passed, it is inferred, or - # it is not inferred, the results should all be the same - tm.assert_series_equal(with_format, no_infer) - tm.assert_series_equal(no_infer, yes_infer) + # Whether the format is explicitly passed, or + # it is inferred, the results should all be the same + tm.assert_series_equal(with_format, without_format) - @pytest.mark.parametrize( - "data", - [ - ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"], - ["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"], - ], - ) - def test_to_datetime_infer_datetime_format_inconsistent_format(self, cache, data): + def test_to_datetime_inconsistent_format(self, cache): + data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"] ser = Series(np.array(data)) + with pytest.raises(ValueError, match="does not match format"): + to_datetime(ser, cache=cache) - # When the format is inconsistent, infer_datetime_format should just - # fallback to the default parsing - tm.assert_series_equal( - to_datetime(ser, 
infer_datetime_format=False, cache=cache), - to_datetime(ser, infer_datetime_format=True, cache=cache), + def test_to_datetime_consistent_format(self, cache): + data = ["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"] + ser = Series(np.array(data)) + result = to_datetime(ser, cache=cache) + expected = Series( + ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[ns]" ) + tm.assert_series_equal(result, expected) - def test_to_datetime_infer_datetime_format_series_with_nans(self, cache): + def test_to_datetime_series_with_nans(self, cache): ser = Series( np.array( ["01/01/2011 00:00:00", np.nan, "01/03/2011 00:00:00", np.nan], dtype=object, ) ) - tm.assert_series_equal( - to_datetime(ser, infer_datetime_format=False, cache=cache), - to_datetime(ser, infer_datetime_format=True, cache=cache), + result = to_datetime(ser, cache=cache) + expected = Series( + ["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[ns]" ) + tm.assert_series_equal(result, expected) - def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): + def test_to_datetime_series_start_with_nans(self, cache): ser = Series( np.array( [ @@ -2440,18 +2374,21 @@ def test_to_datetime_infer_datetime_format_series_start_with_nans(self, cache): ) ) - tm.assert_series_equal( - to_datetime(ser, infer_datetime_format=False, cache=cache), - to_datetime(ser, infer_datetime_format=True, cache=cache), + result = to_datetime(ser, cache=cache) + expected = Series( + [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[ns]" ) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "tz_name, offset", [("UTC", 0), ("UTC-3", 180), ("UTC+3", -180)] + "tz_name, offset, warning", + [("UTC", 0, None), ("UTC-3", 180, UserWarning), ("UTC+3", -180, UserWarning)], ) - def test_infer_datetime_format_tz_name(self, tz_name, offset): + def test_infer_datetime_format_tz_name(self, tz_name, offset, warning): # GH 33133 ser = Series([f"2019-02-02 08:07:13 {tz_name}"]) - 
result = to_datetime(ser, infer_datetime_format=True) + with tm.assert_produces_warning(warning, match="Could not infer format"): + result = to_datetime(ser) tz = timezone(timedelta(minutes=offset)) expected = Series([Timestamp("2019-02-02 08:07:13").tz_localize(tz)]) tm.assert_series_equal(result, expected) @@ -2468,7 +2405,7 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): def test_infer_datetime_format_zero_tz(self, ts, zero_tz): # GH 41047 ser = Series([ts + zero_tz]) - result = to_datetime(ser, infer_datetime_format=True) + result = to_datetime(ser) tz = pytz.utc if zero_tz == "Z" else None expected = Series([Timestamp(ts, tz=tz)]) tm.assert_series_equal(result, expected) @@ -2486,26 +2423,38 @@ def test_to_datetime_iso8601_noleading_0s(self, cache, format): ) tm.assert_series_equal(to_datetime(ser, format=format, cache=cache), expected) + def test_parse_dates_infer_datetime_format_warning(self): + # GH 49024 + with tm.assert_produces_warning( + UserWarning, + match="The argument 'infer_datetime_format' is deprecated", + ): + to_datetime(["10-10-2000"], infer_datetime_format=True) + class TestDaysInMonth: # tests for issue #10154 @pytest.mark.parametrize( - "arg, format", + "arg, format, warning", [ - ["2015-02-29", None], - ["2015-02-29", "%Y-%m-%d"], - ["2015-02-32", "%Y-%m-%d"], - ["2015-04-31", "%Y-%m-%d"], + ["2015-02-29", None, UserWarning], + ["2015-02-29", "%Y-%m-%d", None], + ["2015-02-32", "%Y-%m-%d", None], + ["2015-04-31", "%Y-%m-%d", None], ], ) - def test_day_not_in_month_coerce(self, cache, arg, format): - assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache)) + def test_day_not_in_month_coerce(self, cache, arg, format, warning): + with tm.assert_produces_warning(warning, match="Could not infer format"): + assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache)) def test_day_not_in_month_raise(self, cache): msg = "day is out of range for month" with pytest.raises(ValueError, match=msg): - 
to_datetime("2015-02-29", errors="raise", cache=cache) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime("2015-02-29", errors="raise", cache=cache) @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) def test_day_not_in_month_raise_value(self, cache, arg): @@ -2514,85 +2463,85 @@ def test_day_not_in_month_raise_value(self, cache, arg): to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache) @pytest.mark.parametrize( - "expected, format", + "expected, format, warning", [ - ["2015-02-29", None], - ["2015-02-29", "%Y-%m-%d"], - ["2015-02-29", "%Y-%m-%d"], - ["2015-04-31", "%Y-%m-%d"], + ["2015-02-29", None, UserWarning], + ["2015-02-29", "%Y-%m-%d", None], + ["2015-02-29", "%Y-%m-%d", None], + ["2015-04-31", "%Y-%m-%d", None], ], ) - def test_day_not_in_month_ignore(self, cache, expected, format): - result = to_datetime(expected, errors="ignore", format=format, cache=cache) + def test_day_not_in_month_ignore(self, cache, expected, format, warning): + with tm.assert_produces_warning(warning, match="Could not infer format"): + result = to_datetime(expected, errors="ignore", format=format, cache=cache) assert result == expected class TestDatetimeParsingWrappers: @pytest.mark.parametrize( - "date_str,expected", - list( - { - "2011-01-01": datetime(2011, 1, 1), - "2Q2005": datetime(2005, 4, 1), - "2Q05": datetime(2005, 4, 1), - "2005Q1": datetime(2005, 1, 1), - "05Q1": datetime(2005, 1, 1), - "2011Q3": datetime(2011, 7, 1), - "11Q3": datetime(2011, 7, 1), - "3Q2011": datetime(2011, 7, 1), - "3Q11": datetime(2011, 7, 1), - # quarterly without space - "2000Q4": datetime(2000, 10, 1), - "00Q4": datetime(2000, 10, 1), - "4Q2000": datetime(2000, 10, 1), - "4Q00": datetime(2000, 10, 1), - "2000q4": datetime(2000, 10, 1), - "2000-Q4": datetime(2000, 10, 1), - "00-Q4": datetime(2000, 10, 1), - "4Q-2000": datetime(2000, 10, 1), - "4Q-00": datetime(2000, 10, 1), - "00q4": datetime(2000, 10, 1), - 
"2005": datetime(2005, 1, 1), - "2005-11": datetime(2005, 11, 1), - "2005 11": datetime(2005, 11, 1), - "11-2005": datetime(2005, 11, 1), - "11 2005": datetime(2005, 11, 1), - "200511": datetime(2020, 5, 11), - "20051109": datetime(2005, 11, 9), - "20051109 10:15": datetime(2005, 11, 9, 10, 15), - "20051109 08H": datetime(2005, 11, 9, 8, 0), - "2005-11-09 10:15": datetime(2005, 11, 9, 10, 15), - "2005-11-09 08H": datetime(2005, 11, 9, 8, 0), - "2005/11/09 10:15": datetime(2005, 11, 9, 10, 15), - "2005/11/09 08H": datetime(2005, 11, 9, 8, 0), - "Thu Sep 25 10:36:28 2003": datetime(2003, 9, 25, 10, 36, 28), - "Thu Sep 25 2003": datetime(2003, 9, 25), - "Sep 25 2003": datetime(2003, 9, 25), - "January 1 2014": datetime(2014, 1, 1), - # GHE10537 - "2014-06": datetime(2014, 6, 1), - "06-2014": datetime(2014, 6, 1), - "2014-6": datetime(2014, 6, 1), - "6-2014": datetime(2014, 6, 1), - "20010101 12": datetime(2001, 1, 1, 12), - "20010101 1234": datetime(2001, 1, 1, 12, 34), - "20010101 123456": datetime(2001, 1, 1, 12, 34, 56), - }.items() - ), + "date_str, expected, warning", + [ + ("2011-01-01", datetime(2011, 1, 1), None), + ("2Q2005", datetime(2005, 4, 1), UserWarning), + ("2Q05", datetime(2005, 4, 1), UserWarning), + ("2005Q1", datetime(2005, 1, 1), UserWarning), + ("05Q1", datetime(2005, 1, 1), UserWarning), + ("2011Q3", datetime(2011, 7, 1), UserWarning), + ("11Q3", datetime(2011, 7, 1), UserWarning), + ("3Q2011", datetime(2011, 7, 1), UserWarning), + ("3Q11", datetime(2011, 7, 1), UserWarning), + # quarterly without space + ("2000Q4", datetime(2000, 10, 1), UserWarning), + ("00Q4", datetime(2000, 10, 1), UserWarning), + ("4Q2000", datetime(2000, 10, 1), UserWarning), + ("4Q00", datetime(2000, 10, 1), UserWarning), + ("2000q4", datetime(2000, 10, 1), UserWarning), + ("2000-Q4", datetime(2000, 10, 1), UserWarning), + ("00-Q4", datetime(2000, 10, 1), UserWarning), + ("4Q-2000", datetime(2000, 10, 1), UserWarning), + ("4Q-00", datetime(2000, 10, 1), UserWarning), + 
("00q4", datetime(2000, 10, 1), UserWarning), + ("2005", datetime(2005, 1, 1), None), + ("2005-11", datetime(2005, 11, 1), None), + ("2005 11", datetime(2005, 11, 1), UserWarning), + ("11-2005", datetime(2005, 11, 1), UserWarning), + ("11 2005", datetime(2005, 11, 1), UserWarning), + ("200511", datetime(2020, 5, 11), UserWarning), + ("20051109", datetime(2005, 11, 9), None), + ("20051109 10:15", datetime(2005, 11, 9, 10, 15), None), + ("20051109 08H", datetime(2005, 11, 9, 8, 0), None), + ("2005-11-09 10:15", datetime(2005, 11, 9, 10, 15), None), + ("2005-11-09 08H", datetime(2005, 11, 9, 8, 0), None), + ("2005/11/09 10:15", datetime(2005, 11, 9, 10, 15), None), + ("2005/11/09 08H", datetime(2005, 11, 9, 8, 0), None), + ("Thu Sep 25 10:36:28 2003", datetime(2003, 9, 25, 10, 36, 28), None), + ("Thu Sep 25 2003", datetime(2003, 9, 25), None), + ("Sep 25 2003", datetime(2003, 9, 25), None), + ("January 1 2014", datetime(2014, 1, 1), None), + # GHE10537 + ("2014-06", datetime(2014, 6, 1), None), + ("06-2014", datetime(2014, 6, 1), UserWarning), + ("2014-6", datetime(2014, 6, 1), None), + ("6-2014", datetime(2014, 6, 1), UserWarning), + ("20010101 12", datetime(2001, 1, 1, 12), None), + ("20010101 1234", datetime(2001, 1, 1, 12, 34), UserWarning), + ("20010101 123456", datetime(2001, 1, 1, 12, 34, 56), UserWarning), + ], ) - def test_parsers(self, date_str, expected, cache): + def test_parsers(self, date_str, expected, warning, cache): # dateutil >= 2.5.0 defaults to yearfirst=True # https://github.com/dateutil/dateutil/issues/217 yearfirst = True result1, _ = parsing.parse_time_string(date_str, yearfirst=yearfirst) - result2 = to_datetime(date_str, yearfirst=yearfirst) - result3 = to_datetime([date_str], yearfirst=yearfirst) - # result5 is used below - result4 = to_datetime( - np.array([date_str], dtype=object), yearfirst=yearfirst, cache=cache - ) + with tm.assert_produces_warning(warning, match="Could not infer format"): + result2 = to_datetime(date_str, 
yearfirst=yearfirst) + result3 = to_datetime([date_str], yearfirst=yearfirst) + # result5 is used below + result4 = to_datetime( + np.array([date_str], dtype=object), yearfirst=yearfirst, cache=cache + ) result6 = DatetimeIndex([date_str], yearfirst=yearfirst) # result7 is used below result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) @@ -2701,9 +2650,10 @@ def test_parsers_dayfirst_yearfirst( result2 = Timestamp(date_str) assert result2 == expected - result3 = to_datetime( - date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache - ) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result3 = to_datetime( + date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache + ) result4 = DatetimeIndex([date_str], dayfirst=dayfirst, yearfirst=yearfirst)[0] @@ -2720,8 +2670,9 @@ def test_parsers_timestring(self, date_str, exp_def): exp_now = parse(date_str) result1, _ = parsing.parse_time_string(date_str) - result2 = to_datetime(date_str) - result3 = to_datetime([date_str]) + with tm.assert_produces_warning(UserWarning, match="Could not infer format"): + result2 = to_datetime(date_str) + result3 = to_datetime([date_str]) result4 = Timestamp(date_str) result5 = DatetimeIndex([date_str])[0] # parse time string return time string based on default date @@ -2891,17 +2842,23 @@ def test_incorrect_value_exception(self): with pytest.raises( ValueError, match="Unknown string format: yesterday present at position 1" ): - to_datetime(["today", "yesterday"]) + with tm.assert_produces_warning( + UserWarning, match="Could not infer format" + ): + to_datetime(["today", "yesterday"]) - @pytest.mark.parametrize("format", [None, "%Y-%m-%d %H:%M:%S"]) - def test_to_datetime_out_of_bounds_with_format_arg(self, format): + @pytest.mark.parametrize( + "format, warning", [(None, UserWarning), ("%Y-%m-%d %H:%M:%S", None)] + ) + def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 msg = ( "Out of bounds 
nanosecond timestamp: 2417-10-27 00:00:00 " "present at position 0" ) with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime("2417-10-27 00:00:00", format=format) + with tm.assert_produces_warning(warning, match="Could not infer format"): + to_datetime("2417-10-27 00:00:00", format=format) @pytest.mark.parametrize( "arg, origin, expected_str", @@ -3033,9 +2990,9 @@ def test_empty_string_datetime_coerce_format(): with pytest.raises(ValueError, match="does not match format"): to_datetime(td, format=format, errors="raise") - # don't raise an exception in case no format is given - result = to_datetime(td, errors="raise") - tm.assert_series_equal(result, expected) + # still raise an exception in case no format is given + with pytest.raises(ValueError, match="does not match format"): + to_datetime(td, errors="raise") def test_empty_string_datetime_coerce__unit(): @@ -3112,30 +3069,6 @@ def test_to_datetime_cache_coerce_50_lines_outofbounds(series_length): to_datetime(s, errors="raise", utc=True) -@pytest.mark.parametrize( - "arg", - [ - ["1724-12-20 20:20:20+00:00", "2022-01-01 00:00:00"], - [ - Timestamp("1724-12-20 20:20:20+00:00"), - Timestamp("2022-01-01 00:00:00"), - ], - [datetime(1724, 12, 20, 20, 20, 20, tzinfo=timezone.utc), datetime(2022, 1, 1)], - ], - ids=["string", "pd.Timestamp", "datetime.datetime"], -) -@pytest.mark.parametrize("tz_aware_first", [True, False]) -def test_to_datetime_mixed_tzaware_timestamp_utc_true(arg, tz_aware_first): - # GH 48678 - exp_arg = ["1724-12-20 20:20:20", "2022-01-01 00:00:00"] - if not tz_aware_first: - arg.reverse() - exp_arg.reverse() - result = to_datetime(arg, utc=True) - expected = DatetimeIndex(exp_arg).tz_localize("UTC") - tm.assert_index_equal(result, expected) - - def test_to_datetime_format_f_parse_nanos(): # GH 48767 timestamp = "15/02/2020 02:03:04.123456789" diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 4d7501cdadcd9..a4c79e77d2eed 100644 --- 
a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -238,19 +238,30 @@ def test_guess_datetime_format_wrong_type_inputs(invalid_type_dt): @pytest.mark.parametrize( - "string,fmt", + "string,fmt,dayfirst,warning", [ - ("2011-1-1", "%Y-%m-%d"), - ("1/1/2011", "%m/%d/%Y"), - ("30-1-2011", "%d-%m-%Y"), - ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"), - ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S"), - ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S"), + ("2011-1-1", "%Y-%m-%d", False, None), + ("2011-1-1", "%Y-%d-%m", True, None), + ("1/1/2011", "%m/%d/%Y", False, None), + ("1/1/2011", "%d/%m/%Y", True, None), + ("30-1-2011", "%d-%m-%Y", False, UserWarning), + ("30-1-2011", "%d-%m-%Y", True, None), + ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S", False, None), + ("2011-1-1 0:0:0", "%Y-%d-%m %H:%M:%S", True, None), + ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S", False, None), + ("2011-1-3T00:00:0", "%Y-%d-%mT%H:%M:%S", True, None), + ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S", False, None), + ("2011-1-1 00:00:00", "%Y-%d-%m %H:%M:%S", True, None), ], ) -def test_guess_datetime_format_no_padding(string, fmt): +def test_guess_datetime_format_no_padding(string, fmt, dayfirst, warning): # see gh-11142 - result = parsing.guess_datetime_format(string) + msg = ( + f"Parsing dates in {fmt} format when dayfirst=False was specified. " + "Pass `dayfirst=True` or specify a format to silence this warning." + ) + with tm.assert_produces_warning(warning, match=msg): + result = parsing.guess_datetime_format(string, dayfirst=dayfirst) assert result == fmt