diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index dae6107db4d92..ff0ccffced0f3 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -445,16 +445,6 @@ def setup(self, engine): data = data.format(*two_cols) self.StringIO_input = StringIO(data) - def time_multiple_date(self, engine): - read_csv( - self.data(self.StringIO_input), - engine=engine, - sep=",", - header=None, - names=list(string.digits[:9]), - parse_dates=[[1, 2], [1, 3]], - ) - def time_baseline(self, engine): read_csv( self.data(self.StringIO_input), diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 49609a80d7e15..732c28019a30a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -267,13 +267,10 @@ skip_blank_lines : boolean, default ``True`` Datetime handling +++++++++++++++++ -parse_dates : boolean or list of ints or names or list of lists or dict, default ``False``. +parse_dates : boolean or list of ints or names, default ``False``. * If ``True`` -> try parsing the index. * If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. - * If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date - column. - * If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'. .. note:: A fast-path exists for iso8601-formatted dates. @@ -828,74 +825,6 @@ The simplest case is to just pass in ``parse_dates=True``: # These are Python datetime objects df.index -It is often the case that we may want to store date and time data separately, -or store various date fields separately. the ``parse_dates`` keyword can be -used to specify a combination of columns to parse the dates and/or times from. - -You can specify a list of column lists to ``parse_dates``, the resulting date -columns will be prepended to the output (so as to not affect the existing column -order) and the new column names will be the concatenation of the component -column names: - -.. ipython:: python - :okwarning: - - data = ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ) - - with open("tmp.csv", "w") as fh: - fh.write(data) - - df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]]) - df - -By default the parser removes the component date columns, but you can choose -to retain them via the ``keep_date_col`` keyword: - -.. ipython:: python - :okwarning: - - df = pd.read_csv( - "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True - ) - df - -Note that if you wish to combine multiple columns into a single date column, a -nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that -the second and third columns should each be parsed as separate date columns -while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a -single column. - -You can also use a dict to specify custom name columns: - -.. ipython:: python - :okwarning: - - date_spec = {"nominal": [1, 2], "actual": [1, 3]} - df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec) - df - -It is important to remember that if multiple text columns are to be parsed into -a single date column, then a new column is prepended to the data. The ``index_col`` -specification is based off of this new set of columns rather than the original -data columns: - - -.. ipython:: python - :okwarning: - - date_spec = {"nominal": [1, 2], "actual": [1, 3]} - df = pd.read_csv( - "tmp.csv", header=None, parse_dates=date_spec, index_col=0 - ) # index is the nominal column - df - .. note:: If a column or index contains an unparsable date, the entire column or index will be returned unaltered as an object data type. For non-standard @@ -908,10 +837,6 @@ data columns: for your data to store datetimes in this format, load times will be significantly faster, ~20x has been observed. -.. deprecated:: 2.2.0 - Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime`` - on the relevant result columns instead. - Date parsing functions ++++++++++++++++++++++ diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c77348b365370..ff89b700d2053 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -225,6 +225,7 @@ Removal of prior version deprecations/changes - Disallow calling :meth:`Series.replace` or :meth:`DataFrame.replace` without a ``value`` and with non-dict-like ``to_replace`` (:issue:`33302`) - Disallow constructing a :class:`arrays.SparseArray` with scalar data (:issue:`53039`) - Disallow indexing an :class:`Index` with a boolean indexer of length zero, it now raises ``ValueError`` (:issue:`55820`) +- Disallow nested sequences for 'parse_dates' in :func:`read_csv`, combine the desired columns using :func:`to_datetime` after parsing instead (:issue:`55569`) - Disallow non-standard (``np.ndarray``, :class:`Index`, :class:`ExtensionArray`, or :class:`Series`) to :func:`isin`, :func:`unique`, :func:`factorize` (:issue:`52986`) - Disallow passing a pandas type to :meth:`Index.view` (:issue:`55709`) - Disallow units other than "s", "ms", "us", "ns" for datetime64 and timedelta64 dtypes in :func:`array` (:issue:`53817`) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 70f9a68244164..647793b6c1c95 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -814,12 +814,11 @@ def read_csv( ): depr = True if depr: - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_csv " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), + raise ValueError( + # GH#55569 + "Nested sequences for 'parse_dates' is no longer supported. " + "Combine the desired columns with pd.to_datetime after parsing " + "instead." ) if infer_datetime_format is not lib.no_default: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 8968948df5fa9..5435bf7664aa5 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -116,7 +116,6 @@ def __custom_date_parser(time): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_separator_date_conflict(all_parsers): # Regression test for gh-4678 # @@ -124,24 +123,16 @@ def test_separator_date_conflict(all_parsers): # date parsing do not conflict. parser = all_parsers data = "06-02-2013;13:00;1-000.215" - expected = DataFrame( - [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] - ) - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - df = parser.read_csv( + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( StringIO(data), sep=";", thousands="-", parse_dates={"Date": [0, 1]}, header=None, ) - tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("keep_date_col", [True, False]) @@ -156,14 +147,6 @@ def test_multiple_date_col_custom(all_parsers, keep_date_col, request): """ parser = all_parsers - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." - ) - request.applymarker(mark) - def date_parser(*date_cols): """ Test date parser. @@ -181,125 +164,20 @@ def date_parser(*date_cols): parsing.concat_date_cols(date_cols), parser=du_parse ) - kwds = { - "header": None, - "date_parser": date_parser, - "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}, - "keep_date_col": keep_date_col, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], - } - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - **kwds, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "actual", - "nominal", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) + depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv( + StringIO(data), + header=None, + date_parser=date_parser, + parse_dates={"actual": [1, 2], "nominal": [1, 3]}, + keep_date_col=keep_date_col, + names=["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], + ) @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @@ -326,13 +204,7 @@ def test_multiple_date_col(all_parsers, keep_date_col, request): """ parser = all_parsers - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." - ) - request.applymarker(mark) + msg = "Nested sequences for 'parse_dates' is no longer supported" depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" @@ -342,111 +214,11 @@ def test_multiple_date_col(all_parsers, keep_date_col, request): "keep_date_col": keep_date_col, "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], } - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), **kwds) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "X1_X2", - "X1_X3", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), **kwds) def test_date_col_as_index_col(all_parsers): @@ -512,51 +284,12 @@ def test_multiple_date_cols_int_cast(all_parsers): "parse_dates": parse_dates, "date_parser": pd.to_datetime, } - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - **kwds, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - -0.99, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - -0.59, - ], - ], - columns=["actual", "nominal", 0, 4], - ) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + **kwds, + ) def test_multiple_date_col_timestamp_parse(all_parsers): @@ -564,42 +297,16 @@ def test_multiple_date_col_timestamp_parse(all_parsers): data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - parse_dates=[[0, 1]], - header=None, - date_parser=Timestamp, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 1, - "E", - 0, - np.nan, - 1306.25, - ], - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 8, - "E", - 0, - np.nan, - 1306.25, - ], - ], - columns=["0_1", 2, 3, 4, 5, 6, 7], - ) - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + parse_dates=[[0, 1]], + header=None, + date_parser=Timestamp, + ) -@xfail_pyarrow def test_multiple_date_cols_with_header(all_parsers): parser = all_parsers data = """\ @@ -611,88 +318,9 @@ def test_multiple_date_cols_with_header(all_parsers): KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) @pytest.mark.parametrize( @@ -719,14 +347,9 @@ def test_multiple_date_cols_with_header(all_parsers): def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): parser = all_parsers - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) + msg = "Nested sequences for 'parse_dates' is no longer supported" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), parse_dates=parse_dates) + parser.read_csv(StringIO(data), parse_dates=parse_dates) def test_date_parser_int_bug(all_parsers): @@ -1026,7 +649,6 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is expected_tz -@xfail_pyarrow @pytest.mark.parametrize( "parse_dates,index_col", [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], @@ -1042,98 +664,11 @@ def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD1", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD2", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD3", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD4", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD5", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD6", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - expected = expected.set_index("nominal") - - if not isinstance(parse_dates, dict): - expected.index.name = "date_NominalTime" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), parse_dates=parse_dates, index_col=index_col - ) - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), parse_dates=parse_dates, index_col=index_col) -@xfail_pyarrow def test_multiple_date_cols_chunked(all_parsers): parser = all_parsers data = """\ @@ -1146,90 +681,15 @@ def test_multiple_date_cols_chunked(all_parsers): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"], - ) - expected = expected.set_index("nominal") - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): with parser.read_csv( StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal", chunksize=2, ) as reader: - chunks = list(reader) - - tm.assert_frame_equal(chunks[0], expected[:2]) - tm.assert_frame_equal(chunks[1], expected[2:4]) - tm.assert_frame_equal(chunks[2], expected[4:]) + list(reader) def test_multiple_date_col_named_index_compat(all_parsers): @@ -1244,25 +704,17 @@ def test_multiple_date_col_named_index_compat(all_parsers): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with_indices = parser.read_csv( + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" ) - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with_names = parser.read_csv( + with pytest.raises(ValueError, match=msg): + parser.read_csv( StringIO(data), index_col="nominal", parse_dates={"nominal": ["date", "nominalTime"]}, ) - tm.assert_frame_equal(with_indices, with_names) def test_multiple_date_col_multiple_index_compat(all_parsers): @@ -1276,22 +728,12 @@ def test_multiple_date_col_multiple_index_compat(all_parsers): KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( + + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - - expected = expected.set_index(["nominal", "ID"]) - tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}]) @@ -1463,7 +905,6 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_parse_date_time_multi_level_column_name(all_parsers): data = """\ D,T,A,B @@ -1472,21 +913,14 @@ def test_parse_date_time_multi_level_column_name(all_parsers): 2001-01-06, 00:00:00, 1.0, 11. """ parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=[0, 1], - parse_dates={"date_time": [0, 1]}, - date_parser=pd.to_datetime, - ) - - expected_data = [ - [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0], - ] - expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")]) - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=[0, 1], + parse_dates={"date_time": [0, 1]}, + date_parser=pd.to_datetime, + ) @pytest.mark.parametrize( @@ -1563,114 +997,82 @@ def test_parse_date_time_multi_level_column_name(all_parsers): ) def test_parse_date_time(all_parsers, data, kwargs, expected): parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=pd.to_datetime, - **kwargs, - raise_on_extra_warnings=False, - ) - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + date_parser=pd.to_datetime, + **kwargs, + ) def test_parse_date_fields(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymd": [0, 1, 2]}, - date_parser=lambda x: x, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], - columns=["ymd", "a"], - ) - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ymd": [0, 1, 2]}, + date_parser=lambda x: x, + ) @pytest.mark.parametrize( - ("key", "value", "warn"), + ("key", "value"), [ ( "date_parser", lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), - FutureWarning, ), - ("date_format", "%Y %m %d %H %M %S", None), + ("date_format", "%Y %m %d %H %M %S"), ], ) -def test_parse_date_all_fields(all_parsers, key, value, warn): +def test_parse_date_all_fields(all_parsers, key, value): parser = all_parsers data = """\ year,month,day,hour,minute,second,a,b 2001,01,05,10,00,0,0.0,10. 2001,01,5,10,0,00,1.,11. """ - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + **{key: value}, + ) @pytest.mark.parametrize( - ("key", "value", "warn"), + ("key", "value"), [ ( "date_parser", lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), - FutureWarning, ), - ("date_format", "%Y %m %d %H %M %S.%f", None), + ("date_format", "%Y %m %d %H %M %S.%f"), ], ) -def test_datetime_fractional_seconds(all_parsers, key, value, warn): +def test_datetime_fractional_seconds(all_parsers, key, value): parser = all_parsers data = """\ year,month,day,hour,minute,second,a,b 2001,01,05,10,00,0.123456,0.0,10. 2001,01,5,10,0,0.500000,1.,11. """ - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + **{key: value}, + ) def test_generic(all_parsers): @@ -1680,24 +1082,16 @@ def test_generic(all_parsers): def parse_function(yy, mm): return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)] - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ym": [0, 1]}, - date_parser=parse_function, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], - columns=["ym", "day", "a"], - ) - expected["ym"] = expected["ym"].astype("datetime64[ns]") - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ym": [0, 1]}, + date_parser=parse_function, + ) -@xfail_pyarrow def test_date_parser_resolution_if_not_ns(all_parsers): # see gh-10245 parser = all_parsers @@ -1716,24 +1110,14 @@ def date_parser(dt, time): arr = [datetime.combine(d, t) for d, t in zip(dt, time)] return np.array(arr, dtype="datetime64[s]") - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=date_parser, - parse_dates={"datetime": ["date", "time"]}, - index_col=["datetime", "prn"], - ) - - datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]") - expected = DataFrame( - data={"rxstatus": ["00E80000"] * 3}, - index=MultiIndex.from_arrays( - [datetimes, [126, 23, 13]], - names=["datetime", "prn"], - ), - ) - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), + date_parser=date_parser, + parse_dates={"datetime": ["date", "time"]}, + index_col=["datetime", "prn"], + ) def test_parse_date_column_with_empty_string(all_parsers): @@ -1917,20 +1301,17 @@ def test_missing_parse_dates_column_raises( content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - warn = FutureWarning if isinstance(parse_dates, list) and all( isinstance(x, (int, str)) for x in parse_dates ): - warn = None + pass + else: + msg = "Nested sequences for 'parse_dates' is no longer supported" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - parser.read_csv( - content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates - ) + parser.read_csv( + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates + ) @xfail_pyarrow # mismatched shape @@ -1966,7 +1347,6 @@ def test_date_parser_multiindex_columns(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: an integer is required @pytest.mark.parametrize( "parse_spec, col_name", [ @@ -1980,21 +1360,13 @@ def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, co 1,2,3 2019-12,-31,6""" - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( StringIO(data), parse_dates=parse_spec, header=[0, 1], ) - expected = DataFrame( - {col_name: Timestamp("2019-12-31").as_unit("ns"), ("c", "3"): [6]} - ) - tm.assert_frame_equal(result, expected) def test_date_parser_usecols_thousands(all_parsers): @@ -2030,7 +1402,6 @@ def test_date_parser_usecols_thousands(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # mismatched shape def test_parse_dates_and_keep_original_column(all_parsers): # GH#13378 parser = all_parsers @@ -2038,16 +1409,16 @@ def test_parse_dates_and_keep_original_column(all_parsers): 20150908 20150909 """ + + msg = "Nested sequences for 'parse_dates' is no longer supported" depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" with tm.assert_produces_warning( FutureWarning, match=depr_msg, check_stacklevel=False ): - result = parser.read_csv( - StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True - ) - expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")] - expected = DataFrame({"date": expected_data, "A": expected_data}) - tm.assert_frame_equal(result, expected) + with pytest.raises(ValueError, match=msg): + parser.read_csv( + StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True + ) def test_dayfirst_warnings(): @@ -2263,21 +1634,11 @@ def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates): 31-,12-2019 31-,12-2020""" - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates ) - expected = DataFrame( - { - key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - } - ) - tm.assert_frame_equal(result, expected) @xfail_pyarrow # object dtype index diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index ab98857e0c178..a3a929f3a0122 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -35,31 +35,9 @@ def test_usecols_with_parse_dates(all_parsers, usecols): parser = all_parsers parse_dates = [[1, 2]] - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - if parser.engine == "pyarrow": - with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - return - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - tm.assert_frame_equal(result, expected) + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) @skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns @@ -129,31 +107,13 @@ def test_usecols_with_parse_dates4(all_parsers): parse_dates = [[0, 1]] parser = all_parsers - cols = { - "a_b": "2016/09/21 1", - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( StringIO(data), usecols=usecols, parse_dates=parse_dates, ) - tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) @@ -171,25 +131,8 @@ def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request parse_dates = [[1, 2]] parser = all_parsers - if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0): - mark = pytest.mark.xfail( - reason="Length mismatch in some cases, UserWarning in other" - ) - request.applymarker(mark) - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( + msg = "Nested sequences for 'parse_dates' is no longer supported" + with pytest.raises(ValueError, match=msg): + parser.read_csv( StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols ) - tm.assert_frame_equal(result, expected)