diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 3b06fa1b5517a..043394ded42e9 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -290,6 +290,16 @@ date_parser : function, default ``None`` values from the columns defined by parse_dates into a single array and pass that; and 3) call date_parser once for each row using one or more strings (corresponding to the columns defined by parse_dates) as arguments. + + .. deprecated:: 2.0.0 + Use ``date_format`` instead, or read in as ``object`` and then apply + :func:`to_datetime` as-needed. +date_format : str, default ``None`` + If used in conjunction with ``parse_dates``, will parse dates according to this + format. For anything more complex (e.g. different formats for different columns), + please read in as ``object`` and then apply :func:`to_datetime` as-needed. + + .. versionadded:: 2.0.0 dayfirst : boolean, default ``False`` DD/MM format dates, international and European format. cache_dates : boolean, default True @@ -800,7 +810,7 @@ Specifying date columns +++++++++++++++++++++++ To better facilitate working with datetime data, :func:`read_csv` -uses the keyword arguments ``parse_dates`` and ``date_parser`` +uses the keyword arguments ``parse_dates`` and ``date_format`` to allow users to specify a variety of columns and date/time formats to turn the input text data into ``datetime`` objects. @@ -898,33 +908,15 @@ data columns: Date parsing functions ++++++++++++++++++++++ -Finally, the parser allows you to specify a custom ``date_parser`` function to -take full advantage of the flexibility of the date parsing API: - -.. ipython:: python - - df = pd.read_csv( - "tmp.csv", header=None, parse_dates=date_spec, date_parser=pd.to_datetime - ) - df - -pandas will try to call the ``date_parser`` function in three different ways. If -an exception is raised, the next one is tried: - -1. 
``date_parser`` is first called with one or more arrays as arguments, - as defined using ``parse_dates`` (e.g., ``date_parser(['2013', '2013'], ['1', '2'])``). - -2. If #1 fails, ``date_parser`` is called with all the columns - concatenated row-wise into a single array (e.g., ``date_parser(['2013 1', '2013 2'])``). +Finally, the parser allows you to specify a custom ``date_format``. +Performance-wise, you should try these methods of parsing dates in order: -Note that performance-wise, you should try these methods of parsing dates in order: +1. If you know the format, use ``date_format``, e.g.: + ``date_format="%d/%m/%Y"``. -1. If you know the format, use ``pd.to_datetime()``: - ``date_parser=lambda x: pd.to_datetime(x, format=...)``. - -2. If you have a really non-standard format, use a custom ``date_parser`` function. - For optimal performance, this should be vectorized, i.e., it should accept arrays - as arguments. +2. If you have different formats for different columns, or want to pass any extra options (such + as ``utc``) to ``to_datetime``, then you should read in your data as ``object`` dtype, and + then use ``to_datetime``. .. ipython:: python @@ -952,16 +944,13 @@ an object-dtype column with strings, even with ``parse_dates``. df = pd.read_csv(StringIO(content), parse_dates=["a"]) df["a"] -To parse the mixed-timezone values as a datetime column, pass a partially-applied -:func:`to_datetime` with ``utc=True`` as the ``date_parser``. +To parse the mixed-timezone values as a datetime column, read in as ``object`` dtype and +then call :func:`to_datetime` with ``utc=True``. ..
ipython:: python - df = pd.read_csv( - StringIO(content), - parse_dates=["a"], - date_parser=lambda col: pd.to_datetime(col, utc=True), - ) + df = pd.read_csv(StringIO(content)) + df["a"] = pd.to_datetime(df["a"], utc=True) df["a"] diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 06356c8b02e84..8b850c7da37f3 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -686,11 +686,19 @@ Parsing mixed-timezones with :func:`read_csv` As can be seen, the ``dtype`` is object; each value in the column is a string. To convert the strings to an array of datetimes, the ``date_parser`` argument -.. ipython:: python +.. code-block:: ipython - df = pd.read_csv(io.StringIO(content), parse_dates=['a'], - date_parser=lambda col: pd.to_datetime(col, utc=True)) - df.a + In [3]: df = pd.read_csv( + ...: io.StringIO(content), + ...: parse_dates=['a'], + ...: date_parser=lambda col: pd.to_datetime(col, utc=True), + ...: ) + + In [4]: df.a + Out[4]: + 0 1999-12-31 19:00:00+00:00 + 1 1999-12-31 18:00:00+00:00 + Name: a, dtype: datetime64[ns, UTC] See :ref:`whatsnew_0240.api.timezone_offset_parsing` for more. 
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 7fc856be374e9..0bb2ca451a420 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -315,6 +315,7 @@ Other enhancements - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`) - Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`) - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`) +- :func:`read_csv`, :func:`read_table`, :func:`read_fwf` and :func:`read_excel` now accept ``date_format`` (:issue:`50601`) - :func:`to_datetime` now accepts ``"ISO8601"`` as an argument to ``format``, which will match any ISO8601 string (but possibly not identically-formatted) (:issue:`50411`) - :func:`to_datetime` now accepts ``"mixed"`` as an argument to ``format``, which will infer the format for each element individually (:issue:`50972`) - Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`) @@ -831,6 +832,7 @@ Deprecations - :meth:`Index.is_categorical` has been deprecated. Use :func:`pandas.api.types.is_categorical_dtype` instead (:issue:`50042`) - :meth:`Index.is_object` has been deprecated. Use :func:`pandas.api.types.is_object_dtype` instead (:issue:`50042`) - :meth:`Index.is_interval` has been deprecated. Use :func:`pandas.api.types.is_interval_dtype` instead (:issue:`50042`) +- Deprecated argument ``date_parser`` in :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_excel` in favour of ``date_format`` (:issue:`50601`) - Deprecated ``all`` and ``any`` reductions with ``datetime64`` and :class:`DatetimeTZDtype` dtypes, use e.g. 
``(obj != pd.Timestamp(0), tz=obj.tz).all()`` instead (:issue:`34479`) - Deprecated unused arguments ``*args`` and ``**kwargs`` in :class:`Resampler` (:issue:`50977`) - Deprecated calling ``float`` or ``int`` on a single element :class:`Series` to return a ``float`` or ``int`` respectively. Extract the element before calling ``float`` or ``int`` instead (:issue:`51101`) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 13057a6277673..e2ff1f61abcc2 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -250,6 +250,16 @@ and pass that; and 3) call `date_parser` once for each row using one or more strings (corresponding to the columns defined by `parse_dates`) as arguments. + + .. deprecated:: 2.0.0 + Use ``date_format`` instead, or read in as ``object`` and then apply + :func:`to_datetime` as-needed. +date_format : str, default ``None`` + If used in conjunction with ``parse_dates``, will parse dates according to this + format. For anything more complex (e.g. different formats for different columns), + please read in as ``object`` and then apply :func:`to_datetime` as-needed. + + .. versionadded:: 2.0.0 thousands : str, default None Thousands separator for parsing string columns to numeric. 
Note that this parameter is only necessary for columns stored as TEXT in Excel, @@ -386,7 +396,8 @@ def read_excel( na_filter: bool = ..., verbose: bool = ..., parse_dates: list | dict | bool = ..., - date_parser: Callable | None = ..., + date_parser: Callable | lib.NoDefault = ..., + date_format: str | None = ..., thousands: str | None = ..., decimal: str = ..., comment: str | None = ..., @@ -425,7 +436,8 @@ def read_excel( na_filter: bool = ..., verbose: bool = ..., parse_dates: list | dict | bool = ..., - date_parser: Callable | None = ..., + date_parser: Callable | lib.NoDefault = ..., + date_format: str | None = ..., thousands: str | None = ..., decimal: str = ..., comment: str | None = ..., @@ -464,7 +476,8 @@ def read_excel( na_filter: bool = True, verbose: bool = False, parse_dates: list | dict | bool = False, - date_parser: Callable | None = None, + date_parser: Callable | lib.NoDefault = lib.no_default, + date_format: str | None = None, thousands: str | None = None, decimal: str = ".", comment: str | None = None, @@ -508,6 +521,7 @@ def read_excel( verbose=verbose, parse_dates=parse_dates, date_parser=date_parser, + date_format=date_format, thousands=thousands, decimal=decimal, comment=comment, @@ -711,7 +725,8 @@ def parse( na_values=None, verbose: bool = False, parse_dates: list | dict | bool = False, - date_parser: Callable | None = None, + date_parser: Callable | lib.NoDefault = lib.no_default, + date_format: str | None = None, thousands: str | None = None, decimal: str = ".", comment: str | None = None, @@ -870,6 +885,7 @@ def parse( skip_blank_lines=False, # GH 39808 parse_dates=parse_dates, date_parser=date_parser, + date_format=date_format, thousands=thousands, decimal=decimal, comment=comment, @@ -1537,7 +1553,8 @@ def parse( nrows: int | None = None, na_values=None, parse_dates: list | dict | bool = False, - date_parser: Callable | None = None, + date_parser: Callable | lib.NoDefault = lib.no_default, + date_format: str | None = None, thousands: 
str | None = None, comment: str | None = None, skipfooter: int = 0, @@ -1570,6 +1587,7 @@ def parse( na_values=na_values, parse_dates=parse_dates, date_parser=date_parser, + date_format=date_format, thousands=thousands, comment=comment, skipfooter=skipfooter, diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 83064d87069c1..a7abaeba5766e 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -114,7 +114,8 @@ def __init__(self, kwds) -> None: self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) self._parse_date_cols: Iterable = [] - self.date_parser = kwds.pop("date_parser", None) + self.date_parser = kwds.pop("date_parser", lib.no_default) + self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) self.keep_date_col = kwds.pop("keep_date_col", False) @@ -133,6 +134,7 @@ def __init__(self, kwds) -> None: self._date_conv = _make_date_converter( date_parser=self.date_parser, + date_format=self.date_format, dayfirst=self.dayfirst, cache_dates=self.cache_dates, ) @@ -1089,16 +1091,30 @@ def _get_empty_meta( def _make_date_converter( - date_parser=None, + date_parser=lib.no_default, dayfirst: bool = False, cache_dates: bool = True, + date_format: str | None = None, ): + if date_parser is not lib.no_default: + warnings.warn( + "The argument 'date_parser' is deprecated and will " + "be removed in a future version. 
" + "Please use 'date_format' instead, or read your data in as 'object' dtype " + "and then call 'to_datetime'.", + FutureWarning, + stacklevel=find_stack_level(), + ) + if date_parser is not lib.no_default and date_format is not None: + raise TypeError("Cannot use both 'date_parser' and 'date_format'") + def converter(*date_cols): - if date_parser is None: + if date_parser is lib.no_default: strs = parsing.concat_date_cols(date_cols) return tools.to_datetime( ensure_object(strs), + format=date_format, utc=False, dayfirst=dayfirst, errors="ignore", @@ -1152,7 +1168,8 @@ def converter(*date_cols): "parse_dates": False, "keep_date_col": False, "dayfirst": False, - "date_parser": None, + "date_parser": lib.no_default, + "date_format": None, "usecols": None, # 'iterator': False, "chunksize": None, diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 7230c675ee775..28a005df19442 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -236,10 +236,7 @@ say because of an unparsable value or a mixture of timezones, the column or index will be returned unaltered as an object data type. For non-standard datetime parsing, use ``pd.to_datetime`` after - ``pd.read_csv``. To parse an index or column with a mixture of timezones, - specify ``date_parser`` to be a partially-applied - :func:`pandas.to_datetime` with ``utc=True``. See - :ref:`io.csv.mixed_timezones` for more. + ``pd.read_csv``. Note: A fast-path exists for iso8601-formatted dates. infer_datetime_format : bool, default False @@ -264,6 +261,16 @@ and pass that; and 3) call `date_parser` once for each row using one or more strings (corresponding to the columns defined by `parse_dates`) as arguments. + + .. deprecated:: 2.0.0 + Use ``date_format`` instead, or read in as ``object`` and then apply + :func:`to_datetime` as-needed. +date_format : str, default ``None`` + If used in conjunction with ``parse_dates``, will parse dates according to this + format. 
For anything more complex (e.g. different formats for different columns), + please read in as ``object`` and then apply :func:`to_datetime` as-needed. + + .. versionadded:: 2.0.0 dayfirst : bool, default False DD/MM format dates, international and European format. cache_dates : bool, default True @@ -543,7 +550,10 @@ def _read( # if we pass a date_parser and parse_dates=False, we should not parse the # dates GH#44366 if kwds.get("parse_dates", None) is None: - if kwds.get("date_parser", None) is None: + if ( + kwds.get("date_parser", lib.no_default) is lib.no_default + and kwds.get("date_format", None) is None + ): kwds["parse_dates"] = False else: kwds["parse_dates"] = True @@ -608,6 +618,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., + date_format: str | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[True], @@ -664,6 +675,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., + date_format: str | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -720,6 +732,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., + date_format: str | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[False] = ..., @@ -776,6 +789,7 @@ def read_csv( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., + date_format: str | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -843,7 +857,8 @@ def read_csv( parse_dates: bool | Sequence[Hashable] | None = None, infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = False, - date_parser=None, + date_parser=lib.no_default, + date_format: str | None = None, dayfirst: bool = False, cache_dates: bool = True, # Iteration @@ -932,6 +947,7 @@ def read_table( 
infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., + date_format: str | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[True], @@ -988,6 +1004,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., + date_format: str | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -1044,6 +1061,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., + date_format: str | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: Literal[False] = ..., @@ -1100,6 +1118,7 @@ def read_table( infer_datetime_format: bool | lib.NoDefault = ..., keep_date_col: bool = ..., date_parser=..., + date_format: str | None = ..., dayfirst: bool = ..., cache_dates: bool = ..., iterator: bool = ..., @@ -1167,7 +1186,8 @@ def read_table( parse_dates: bool | Sequence[Hashable] = False, infer_datetime_format: bool | lib.NoDefault = lib.no_default, keep_date_col: bool = False, - date_parser=None, + date_parser=lib.no_default, + date_format: str | None = None, dayfirst: bool = False, cache_dates: bool = True, # Iteration @@ -1772,6 +1792,11 @@ def TextParser(*args, **kwds) -> TextFileReader: parse_dates : bool, default False keep_date_col : bool, default False date_parser : function, optional + + .. deprecated:: 2.0.0 + date_format : str, default ``None`` + + .. 
versionadded:: 2.0.0 skiprows : list of integers Row numbers to skip skipfooter : int diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index f2df065571c6d..9a8e4eff5470a 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -269,8 +269,18 @@ def test_read_excel_parse_dates(self, ext): tm.assert_frame_equal(df, res) date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y") + with tm.assert_produces_warning( + FutureWarning, match="use 'date_format' instead" + ): + res = pd.read_excel( + pth, + parse_dates=["date_strings"], + date_parser=date_parser, + index_col=0, + ) + tm.assert_frame_equal(df, res) res = pd.read_excel( - pth, parse_dates=["date_strings"], date_parser=date_parser, index_col=0 + pth, parse_dates=["date_strings"], date_format="%m/%d/%Y", index_col=0 ) tm.assert_frame_equal(df, res) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index edae696b84bf4..f9e152a25081d 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -60,7 +60,9 @@ def __custom_date_parser(time): 41051.00 -98573.7302 871458.0640 389.0086 """ ) - result = all_parsers.read_csv( + result = all_parsers.read_csv_check_warnings( + FutureWarning, + "Please use 'date_format' instead", testdata, delim_whitespace=True, parse_dates=True, @@ -98,7 +100,9 @@ def __custom_date_parser(time): 41051.00 -97.72 """ ) - result = all_parsers.read_csv( + result = all_parsers.read_csv_check_warnings( + FutureWarning, + "Please use 'date_format' instead", testdata, delim_whitespace=True, parse_dates=False, @@ -173,7 +177,12 @@ def date_parser(*date_cols): "keep_date_col": keep_date_col, "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], } - result = parser.read_csv(StringIO(data), **kwds) + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + **kwds, 
+ ) expected = DataFrame( [ @@ -479,7 +488,9 @@ def test_multiple_date_cols_int_cast(all_parsers): "parse_dates": parse_dates, "date_parser": pd.to_datetime, } - result = parser.read_csv(StringIO(data), **kwds) + result = parser.read_csv_check_warnings( + FutureWarning, "use 'date_format' instead", StringIO(data), **kwds + ) expected = DataFrame( [ @@ -526,8 +537,13 @@ def test_multiple_date_col_timestamp_parse(all_parsers): data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" - result = parser.read_csv( - StringIO(data), parse_dates=[[0, 1]], header=None, date_parser=Timestamp + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + parse_dates=[[0, 1]], + header=None, + date_parser=Timestamp, ) expected = DataFrame( [ @@ -683,7 +699,9 @@ def test_date_parser_int_bug(all_parsers): "12345,1,-1,3,invoice_InvoiceResource,search\n" ) - result = parser.read_csv( + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", StringIO(data), index_col=0, parse_dates=[0], @@ -749,11 +767,16 @@ def test_csv_custom_parser(all_parsers): 20090103,c,4,5 """ parser = all_parsers - result = parser.read_csv( - StringIO(data), date_parser=lambda x: datetime.strptime(x, "%Y%m%d") + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + date_parser=lambda x: datetime.strptime(x, "%Y%m%d"), ) expected = parser.read_csv(StringIO(data), parse_dates=True) tm.assert_frame_equal(result, expected) + result = parser.read_csv(StringIO(data), date_format="%Y%m%d") + tm.assert_frame_equal(result, expected) @xfail_pyarrow @@ -900,7 +923,9 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): 02/02/2010,1,2 """ if "dayfirst" in kwargs: - df = parser.read_csv( + df = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", StringIO(data), names=["time", "Q", "NTU"], 
date_parser=lambda d: du_parse(d, **kwargs), @@ -922,7 +947,9 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): else: msg = "got an unexpected keyword argument 'day_first'" with pytest.raises(TypeError, match=msg): - parser.read_csv( + parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", StringIO(data), names=["time", "Q", "NTU"], date_parser=lambda d: du_parse(d, **kwargs), @@ -1305,6 +1332,26 @@ def test_parse_dates_infer_datetime_format_warning(all_parsers, reader): ) +@pytest.mark.parametrize( + "reader", ["read_csv_check_warnings", "read_table_check_warnings"] +) +def test_parse_dates_date_parser_and_date_format(all_parsers, reader): + # GH 50601 + parser = all_parsers + data = "Date,test\n2012-01-01,1\n,2" + msg = "Cannot use both 'date_parser' and 'date_format'" + with pytest.raises(TypeError, match=msg): + getattr(parser, reader)( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + parse_dates=["Date"], + date_parser=pd.to_datetime, + date_format="ISO8601", + sep=",", + ) + + @xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", @@ -1355,7 +1402,9 @@ def test_parse_date_time_multi_level_column_name(all_parsers): 2001-01-06, 00:00:00, 1.0, 11. 
""" parser = all_parsers - result = parser.read_csv( + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", StringIO(data), header=[0, 1], parse_dates={"date_time": [0, 1]}, @@ -1445,7 +1494,13 @@ def test_parse_date_time_multi_level_column_name(all_parsers): ) def test_parse_date_time(all_parsers, data, kwargs, expected): parser = all_parsers - result = parser.read_csv(StringIO(data), date_parser=pd.to_datetime, **kwargs) + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", + StringIO(data), + date_parser=pd.to_datetime, + **kwargs, + ) # Python can sometimes be flaky about how # the aggregated columns are entered, so @@ -1460,7 +1515,9 @@ def test_parse_date_time(all_parsers, data, kwargs, expected): def test_parse_date_fields(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - result = parser.read_csv( + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", StringIO(data), header=0, parse_dates={"ymd": [0, 1, 2]}, @@ -1475,18 +1532,31 @@ def test_parse_date_fields(all_parsers): @xfail_pyarrow -def test_parse_date_all_fields(all_parsers): +@pytest.mark.parametrize( + ("key", "value", "warn"), + [ + ( + "date_parser", + lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), + FutureWarning, + ), + ("date_format", "%Y %m %d %H %M %S", None), + ], +) +def test_parse_date_all_fields(all_parsers, key, value, warn): parser = all_parsers data = """\ year,month,day,hour,minute,second,a,b 2001,01,05,10,00,0,0.0,10. 2001,01,5,10,0,00,1.,11. 
""" - result = parser.read_csv( + result = parser.read_csv_check_warnings( + warn, + "use 'date_format' instead", StringIO(data), header=0, - date_parser=lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + **{key: value}, ) expected = DataFrame( [ @@ -1499,18 +1569,31 @@ def test_parse_date_all_fields(all_parsers): @xfail_pyarrow -def test_datetime_fractional_seconds(all_parsers): +@pytest.mark.parametrize( + ("key", "value", "warn"), + [ + ( + "date_parser", + lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), + FutureWarning, + ), + ("date_format", "%Y %m %d %H %M %S.%f", None), + ], +) +def test_datetime_fractional_seconds(all_parsers, key, value, warn): parser = all_parsers data = """\ year,month,day,hour,minute,second,a,b 2001,01,05,10,00,0.123456,0.0,10. 2001,01,5,10,0,0.500000,1.,11. """ - result = parser.read_csv( + result = parser.read_csv_check_warnings( + warn, + "use 'date_format' instead", StringIO(data), header=0, - date_parser=lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + **{key: value}, ) expected = DataFrame( [ @@ -1530,7 +1613,9 @@ def test_generic(all_parsers): def parse_function(yy, mm): return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)] - result = parser.read_csv( + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", StringIO(data), header=0, parse_dates={"ym": [0, 1]}, @@ -1563,7 +1648,9 @@ def date_parser(dt, time): arr = [datetime.combine(d, t) for d, t in zip(dt, time)] return np.array(arr, dtype="datetime64[s]") - result = parser.read_csv( + result = parser.read_csv_check_warnings( + FutureWarning, + "use 'date_format' instead", StringIO(data), date_parser=date_parser, parse_dates={"datetime": ["date", "time"]}, @@ -1990,7 +2077,14 @@ def test_infer_first_column_as_index(all_parsers): @skip_pyarrow -def test_replace_nans_before_parsing_dates(all_parsers): 
+@pytest.mark.parametrize( + ("key", "value", "warn"), + [ + ("date_parser", lambda x: pd.to_datetime(x, format="%Y-%m-%d"), FutureWarning), + ("date_format", "%Y-%m-%d", None), + ], +) +def test_replace_nans_before_parsing_dates(all_parsers, key, value, warn): # GH#26203 parser = all_parsers data = """Test @@ -2000,11 +2094,13 @@ def test_replace_nans_before_parsing_dates(all_parsers): # 2017-09-09 """ - result = parser.read_csv( + result = parser.read_csv_check_warnings( + warn, + "use 'date_format' instead", StringIO(data), na_values={"Test": ["#", "0"]}, parse_dates=["Test"], - date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d"), + **{key: value}, ) expected = DataFrame( { diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index c2939f7c12f10..47379aaab6feb 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -284,15 +284,16 @@ def test_fwf_regression(): 2009164210000 9.6034 9.0897 8.3822 7.4905 6.0908 5.7904 5.4039 """ - result = read_fwf( - StringIO(data), - index_col=0, - header=None, - names=names, - widths=widths, - parse_dates=True, - date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"), - ) + with tm.assert_produces_warning(FutureWarning, match="use 'date_format' instead"): + result = read_fwf( + StringIO(data), + index_col=0, + header=None, + names=names, + widths=widths, + parse_dates=True, + date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"), + ) expected = DataFrame( [ [9.5403, 9.4105, 8.6571, 7.8372, 6.0612, 5.8843, 5.5192], @@ -313,6 +314,16 @@ def test_fwf_regression(): columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], ) tm.assert_frame_equal(result, expected) + result = read_fwf( + StringIO(data), + index_col=0, + header=None, + names=names, + widths=widths, + parse_dates=True, + date_format="%Y%j%H%M%S", + ) + tm.assert_frame_equal(result, expected) def test_fwf_for_uint8():