From c759fc2fba249e790b1928a39e5797d7a3461696 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 May 2024 13:57:59 -0700 Subject: [PATCH 01/12] CLN: Enforce read_csv(keep_date_col, parse_dates) deprecations --- asv_bench/benchmarks/io/csv.py | 10 - doc/source/user_guide/io.rst | 75 +- pandas/io/parsers/base_parser.py | 235 +--- pandas/io/parsers/c_parser_wrapper.py | 1 - pandas/io/parsers/python_parser.py | 11 +- pandas/io/parsers/readers.py | 70 +- pandas/tests/io/parser/test_parse_dates.py | 1173 +------------------- 7 files changed, 53 insertions(+), 1522 deletions(-) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index dae6107db4d92..ff0ccffced0f3 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -445,16 +445,6 @@ def setup(self, engine): data = data.format(*two_cols) self.StringIO_input = StringIO(data) - def time_multiple_date(self, engine): - read_csv( - self.data(self.StringIO_input), - engine=engine, - sep=",", - header=None, - names=list(string.digits[:9]), - parse_dates=[[1, 2], [1, 3]], - ) - def time_baseline(self, engine): read_csv( self.data(self.StringIO_input), diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index b5cc8c43ae143..c62e803b797b0 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -270,9 +270,6 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default * If ``True`` -> try parsing the index. * If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. - * If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date - column. - * If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'. .. note:: A fast-path exists for iso8601-formatted dates. @@ -282,9 +279,6 @@ infer_datetime_format : boolean, default ``False`` .. 
deprecated:: 2.0.0 A strict version of this argument is now the default, passing it has no effect. -keep_date_col : boolean, default ``False`` - If ``True`` and parse_dates specifies combining multiple columns then keep the - original columns. date_parser : function, default ``None`` Function to use for converting a sequence of string columns to an array of datetime instances. The default uses ``dateutil.parser.parser`` to do the @@ -829,71 +823,8 @@ The simplest case is to just pass in ``parse_dates=True``: It is often the case that we may want to store date and time data separately, or store various date fields separately. the ``parse_dates`` keyword can be -used to specify a combination of columns to parse the dates and/or times from. - -You can specify a list of column lists to ``parse_dates``, the resulting date -columns will be prepended to the output (so as to not affect the existing column -order) and the new column names will be the concatenation of the component -column names: - -.. ipython:: python - :okwarning: - - data = ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ) - - with open("tmp.csv", "w") as fh: - fh.write(data) - - df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]]) - df - -By default the parser removes the component date columns, but you can choose -to retain them via the ``keep_date_col`` keyword: - -.. ipython:: python - :okwarning: - - df = pd.read_csv( - "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True - ) - df - -Note that if you wish to combine multiple columns into a single date column, a -nested list must be used. 
In other words, ``parse_dates=[1, 2]`` indicates that -the second and third columns should each be parsed as separate date columns -while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a -single column. - -You can also use a dict to specify custom name columns: - -.. ipython:: python - :okwarning: - - date_spec = {"nominal": [1, 2], "actual": [1, 3]} - df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec) - df - -It is important to remember that if multiple text columns are to be parsed into -a single date column, then a new column is prepended to the data. The ``index_col`` -specification is based off of this new set of columns rather than the original -data columns: - +used to specify columns to parse the dates and/or times. -.. ipython:: python - :okwarning: - - date_spec = {"nominal": [1, 2], "actual": [1, 3]} - df = pd.read_csv( - "tmp.csv", header=None, parse_dates=date_spec, index_col=0 - ) # index is the nominal column - df .. note:: If a column or index contains an unparsable date, the entire column or @@ -907,10 +838,6 @@ data columns: for your data to store datetimes in this format, load times will be significantly faster, ~20x has been observed. -.. deprecated:: 2.2.0 - Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime`` - on the relevant result columns instead. 
- Date parsing functions ++++++++++++++++++++++ diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 510097aed2a25..962360a29aecb 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -5,7 +5,6 @@ import csv import datetime from enum import Enum -import itertools from typing import ( TYPE_CHECKING, Any, @@ -43,7 +42,6 @@ is_integer_dtype, is_list_like, is_object_dtype, - is_scalar, is_string_dtype, pandas_dtype, ) @@ -58,7 +56,6 @@ DataFrame, DatetimeIndex, StringDtype, - concat, ) from pandas.core import algorithms from pandas.core.arrays import ( @@ -111,7 +108,6 @@ class BadLineHandleMethod(Enum): keep_default_na: bool dayfirst: bool cache_dates: bool - keep_date_col: bool usecols_dtype: str | None def __init__(self, kwds) -> None: @@ -125,12 +121,19 @@ def __init__(self, kwds) -> None: self.index_names: Sequence[Hashable] | None = None self.col_names: Sequence[Hashable] | None = None - self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) - self._parse_date_cols: Iterable = [] + parse_dates = kwds.pop("parse_dates", False) + if parse_dates is None or lib.is_bool(parse_dates): + parse_dates = bool(parse_dates) + elif not isinstance(parse_dates, list): + raise TypeError( + "Only booleans and lists are accepted " + "for the 'parse_dates' parameter" + ) + self.parse_dates: bool | list = parse_dates + self._parse_date_cols: set = set() self.date_parser = kwds.pop("date_parser", lib.no_default) self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) - self.keep_date_col = kwds.pop("keep_date_col", False) self.na_values = kwds.get("na_values") self.na_fvalues = kwds.get("na_fvalues") @@ -180,8 +183,6 @@ def __init__(self, kwds) -> None: else: self.index_col = list(self.index_col) - self._name_processed = False - self._first_chunk = True self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) @@ -190,7 +191,7 
@@ def __init__(self, kwds) -> None: # Normally, this arg would get pre-processed earlier on self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) - def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable: + def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set: """ Check if parse_dates are in columns. @@ -204,7 +205,7 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl Returns ------- - The names of the columns which will get parsed later if a dict or list + The names of the columns which will get parsed later if a list is given as specification. Raises @@ -213,30 +214,15 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl If column to parse_date is not in dataframe. """ - cols_needed: Iterable - if is_dict_like(self.parse_dates): - cols_needed = itertools.chain(*self.parse_dates.values()) - elif is_list_like(self.parse_dates): - # a column in parse_dates could be represented - # ColReference = Union[int, str] - # DateGroups = List[ColReference] - # ParseDates = Union[DateGroups, List[DateGroups], - # Dict[ColReference, DateGroups]] - cols_needed = itertools.chain.from_iterable( - col if is_list_like(col) and not isinstance(col, tuple) else [col] - for col in self.parse_dates - ) - else: - cols_needed = [] - - cols_needed = list(cols_needed) + if not isinstance(self.parse_dates, list): + return set() # get only columns that are references using names (str), not by index missing_cols = ", ".join( sorted( { col - for col in cols_needed + for col in self.parse_dates if isinstance(col, str) and col not in columns } ) @@ -246,27 +232,18 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl f"Missing column provided to 'parse_dates': '{missing_cols}'" ) # Convert positions to actual column names - return [ + return { col if (isinstance(col, str) or col in columns) else columns[col] - for col in cols_needed 
- ] + for col in self.parse_dates + } def close(self) -> None: pass - @final - @property - def _has_complex_date_col(self) -> bool: - return isinstance(self.parse_dates, dict) or ( - isinstance(self.parse_dates, list) - and len(self.parse_dates) > 0 - and isinstance(self.parse_dates[0], list) - ) - @final def _should_parse_dates(self, i: int) -> bool: - if lib.is_bool(self.parse_dates): - return bool(self.parse_dates) + if isinstance(self.parse_dates, bool): + return self.parse_dates else: if self.index_names is not None: name = self.index_names[i] @@ -368,18 +345,9 @@ def _make_index( index: Index | None if not is_index_col(self.index_col) or not self.index_col: index = None - - elif not self._has_complex_date_col: + else: simple_index = self._get_simple_index(alldata, columns) index = self._agg_index(simple_index) - elif self._has_complex_date_col: - if not self._name_processed: - (self.index_names, _, self.index_col) = self._clean_index_names( - list(columns), self.index_col - ) - self._name_processed = True - date_index = self._get_complex_date_index(data, columns) - index = self._agg_index(date_index, try_parse_dates=False) # add names for the index if indexnamerow: @@ -645,19 +613,7 @@ def _set(x) -> int: if isinstance(self.parse_dates, list): for val in self.parse_dates: - if isinstance(val, list): - for k in val: - noconvert_columns.add(_set(k)) - else: - noconvert_columns.add(_set(val)) - - elif isinstance(self.parse_dates, dict): - for val in self.parse_dates.values(): - if isinstance(val, list): - for k in val: - noconvert_columns.add(_set(k)) - else: - noconvert_columns.add(_set(val)) + noconvert_columns.add(_set(val)) elif self.parse_dates: if isinstance(self.index_col, list): @@ -875,7 +831,7 @@ def _do_date_conversions( ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]: # returns data, columns - if self.parse_dates is not None: + if isinstance(self.parse_dates, list): data, names = _process_date_conversion( data, 
self._date_conv, @@ -883,7 +839,6 @@ def _do_date_conversions( self.index_col, self.index_names, names, - keep_date_col=self.keep_date_col, dtype_backend=self.dtype_backend, ) @@ -1228,7 +1183,6 @@ def converter(*date_cols, col: Hashable): "decimal": ".", # 'engine': 'c', "parse_dates": False, - "keep_date_col": False, "dayfirst": False, "date_parser": lib.no_default, "date_format": None, @@ -1247,11 +1201,10 @@ def converter(*date_cols, col: Hashable): def _process_date_conversion( data_dict, converter: Callable, - parse_spec, + parse_spec: list, index_col, index_names, columns, - keep_date_col: bool = False, dtype_backend=lib.no_default, ) -> tuple[dict, list]: def _isindex(colspec): @@ -1259,111 +1212,28 @@ def _isindex(colspec): isinstance(index_names, list) and colspec in index_names ) - new_cols = [] - new_data = {} - orig_names = columns - columns = list(columns) - - date_cols = set() - - if parse_spec is None or isinstance(parse_spec, bool): - return data_dict, columns - - if isinstance(parse_spec, list): - # list of column lists - for colspec in parse_spec: - if is_scalar(colspec) or isinstance(colspec, tuple): - if isinstance(colspec, int) and colspec not in data_dict: - colspec = orig_names[colspec] - if _isindex(colspec): - continue - elif dtype_backend == "pyarrow": - import pyarrow as pa - - dtype = data_dict[colspec].dtype - if isinstance(dtype, ArrowDtype) and ( - pa.types.is_timestamp(dtype.pyarrow_dtype) - or pa.types.is_date(dtype.pyarrow_dtype) - ): - continue - - # Pyarrow engine returns Series which we need to convert to - # numpy array before converter, its a no-op for other parsers - data_dict[colspec] = converter( - np.asarray(data_dict[colspec]), col=colspec - ) - else: - new_name, col, old_names = _try_convert_dates( - converter, colspec, data_dict, orig_names - ) - if new_name in data_dict: - raise ValueError(f"New date column already in dict {new_name}") - new_data[new_name] = col - new_cols.append(new_name) - 
date_cols.update(old_names) - - elif isinstance(parse_spec, dict): - # dict of new name to column list - for new_name, colspec in parse_spec.items(): - if new_name in data_dict: - raise ValueError(f"Date column {new_name} already in dict") - - _, col, old_names = _try_convert_dates( - converter, - colspec, - data_dict, - orig_names, - target_name=new_name, - ) - - new_data[new_name] = col - - # If original column can be converted to date we keep the converted values - # This can only happen if values are from single column - if len(colspec) == 1: - new_data[colspec[0]] = col - - new_cols.append(new_name) - date_cols.update(old_names) - - if isinstance(data_dict, DataFrame): - data_dict = concat([DataFrame(new_data), data_dict], axis=1) - else: - data_dict.update(new_data) - new_cols.extend(columns) - - if not keep_date_col: - for c in list(date_cols): - data_dict.pop(c) - new_cols.remove(c) - - return data_dict, new_cols + for colspec in parse_spec: + if isinstance(colspec, int) and colspec not in data_dict: + colspec = columns[colspec] + if _isindex(colspec): + continue + elif dtype_backend == "pyarrow": + import pyarrow as pa + + dtype = data_dict[colspec].dtype + if isinstance(dtype, ArrowDtype) and ( + pa.types.is_timestamp(dtype.pyarrow_dtype) + or pa.types.is_date(dtype.pyarrow_dtype) + ): + continue -def _try_convert_dates( - parser: Callable, colspec, data_dict, columns, target_name: str | None = None -): - colset = set(columns) - colnames = [] - - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int) and c not in columns: - colnames.append(columns[c]) - else: - colnames.append(c) - - new_name: tuple | str - if all(isinstance(x, tuple) for x in colnames): - new_name = tuple(map("_".join, zip(*colnames))) - else: - new_name = "_".join([str(x) for x in colnames]) - to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict] + # Pyarrow engine returns Series which we need to convert to + # numpy array before 
converter, its a no-op for other parsers + data_dict[colspec] = converter(np.asarray(data_dict[colspec]), col=colspec) - new_col = parser(*to_parse, col=new_name if target_name is None else target_name) - return new_name, new_col, colnames + return data_dict, columns def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): @@ -1401,26 +1271,5 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): return na_values, na_fvalues -def _validate_parse_dates_arg(parse_dates): - """ - Check whether or not the 'parse_dates' parameter - is a non-boolean scalar. Raises a ValueError if - that is the case. - """ - msg = ( - "Only booleans, lists, and dictionaries are accepted " - "for the 'parse_dates' parameter" - ) - - if not ( - parse_dates is None - or lib.is_bool(parse_dates) - or isinstance(parse_dates, (list, dict)) - ): - raise TypeError(msg) - - return parse_dates - - def is_index_col(col) -> bool: return col is not None and col is not False diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 6e5d36ad39c8a..1baca9d48d795 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -171,7 +171,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: if self._reader.leading_cols == 0 and is_index_col( self.index_col # type: ignore[has-type] ): - self._name_processed = True ( index_names, # error: Cannot determine type of 'names' diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index e2456b165fe60..5b4139edc19a9 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -150,14 +150,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: # get popped off for index self.orig_names: list[Hashable] = list(self.columns) - # needs to be cleaned/refactored - # multiple date column thing turning into a real spaghetti factory - - if not self._has_complex_date_col: - 
(index_names, self.orig_names, self.columns) = self._get_index_name() - self._name_processed = True - if self.index_names is None: - self.index_names = index_names + index_names, self.orig_names, self.columns = self._get_index_name() + if self.index_names is None: + self.index_names = index_names if self._col_indices is None: self._col_indices = list(range(len(self.columns))) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 70f9a68244164..d13d0e22b2b07 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -40,7 +40,6 @@ from pandas.core.dtypes.common import ( is_file_like, is_float, - is_hashable, is_integer, is_list_like, pandas_dtype, @@ -119,7 +118,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): skip_blank_lines: bool parse_dates: bool | Sequence[Hashable] | None infer_datetime_format: bool | lib.NoDefault - keep_date_col: bool | lib.NoDefault date_parser: Callable | lib.NoDefault date_format: str | dict[Hashable, str] | None dayfirst: bool @@ -302,8 +300,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): performance of reading a large file. skip_blank_lines : bool, default True If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. -parse_dates : bool, None, list of Hashable, list of lists or dict of {{Hashable : \ -list}}, default None +parse_dates : bool, None, list of Hashable, default None The behavior is as follows: * ``bool``. If ``True`` -> try parsing the index. @@ -311,10 +308,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): specified. * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. - * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse - as a single date column. Values are joined with a space before parsing. - * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call - result 'foo'. 
Values are joined with a space before parsing. If a column or index cannot be represented as an array of ``datetime``, say because of an unparsable value or a mixture of timezones, the column @@ -332,9 +325,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): .. deprecated:: 2.0.0 A strict version of this argument is now the default, passing it has no effect. -keep_date_col : bool, default False - If ``True`` and ``parse_dates`` specifies combining multiple columns then - keep the original columns. date_parser : Callable, optional Function to use for converting a sequence of string columns to an array of ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the @@ -759,7 +749,6 @@ def read_csv( # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool | lib.NoDefault = lib.no_default, date_parser: Callable | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, @@ -790,38 +779,6 @@ def read_csv( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. 
Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - keep_date_col = False - - if lib.is_list_like(parse_dates): - # GH#55569 - depr = False - # error: Item "bool" of "bool | Sequence[Hashable] | None" has no - # attribute "__iter__" (not iterable) - if not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - depr = True - elif isinstance(parse_dates, dict) and any( - lib.is_list_like(x) for x in parse_dates.values() - ): - depr = True - if depr: - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_csv " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if infer_datetime_format is not lib.no_default: warnings.warn( "The argument 'infer_datetime_format' is deprecated and will " @@ -950,7 +907,6 @@ def read_table( # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, infer_datetime_format: bool | lib.NoDefault = lib.no_default, - keep_date_col: bool | lib.NoDefault = lib.no_default, date_parser: Callable | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, @@ -981,29 +937,6 @@ def read_table( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_table is deprecated and " - "will be removed in a future version. 
Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - keep_date_col = False - - # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__" - if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - # GH#55569 - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_table " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if infer_datetime_format is not lib.no_default: warnings.warn( "The argument 'infer_datetime_format' is deprecated and will " @@ -1671,7 +1604,6 @@ def TextParser(*args, **kwds) -> TextFileReader: comment : str, optional Comment out remainder of line parse_dates : bool, default False - keep_date_col : bool, default False date_parser : function, optional .. deprecated:: 2.0.0 diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 8968948df5fa9..96ff06ceafa3b 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -4,7 +4,6 @@ """ from datetime import ( - date, datetime, timedelta, timezone, @@ -116,192 +115,6 @@ def __custom_date_parser(time): tm.assert_frame_equal(result, expected) -@xfail_pyarrow -def test_separator_date_conflict(all_parsers): - # Regression test for gh-4678 - # - # Make sure thousands separator and - # date parsing do not conflict. 
- parser = all_parsers - data = "06-02-2013;13:00;1-000.215" - expected = DataFrame( - [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] - ) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - df = parser.read_csv( - StringIO(data), - sep=";", - thousands="-", - parse_dates={"Date": [0, 1]}, - header=None, - ) - tm.assert_frame_equal(df, expected) - - -@pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col_custom(all_parsers, keep_date_col, request): - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - parser = all_parsers - - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." - ) - request.applymarker(mark) - - def date_parser(*date_cols): - """ - Test date parser. - - Parameters - ---------- - date_cols : args - The list of data columns to parse. 
- - Returns - ------- - parsed : Series - """ - return parsing.try_parse_dates( - parsing.concat_date_cols(date_cols), parser=du_parse - ) - - kwds = { - "header": None, - "date_parser": date_parser, - "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}, - "keep_date_col": keep_date_col, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], - } - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - **kwds, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "actual", - "nominal", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. 
- result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("container", [list, tuple, Index, Series]) @pytest.mark.parametrize("dim", [1, 2]) def test_concat_date_col_fail(container, dim): @@ -314,141 +127,6 @@ def test_concat_date_col_fail(container, dim): parsing.concat_date_cols(date_cols) -@pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col(all_parsers, keep_date_col, request): - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - parser = all_parsers - - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." 
- ) - request.applymarker(mark) - - depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" - - kwds = { - "header": None, - "parse_dates": [[1, 2], [1, 3]], - "keep_date_col": keep_date_col, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], - } - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), **kwds) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "X1_X2", - "X1_X3", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - tm.assert_frame_equal(result, expected) - - def test_date_col_as_index_col(all_parsers): data = """\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -495,240 +173,6 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -def test_multiple_date_cols_int_cast(all_parsers): 
- data = ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ) - parse_dates = {"actual": [1, 2], "nominal": [1, 3]} - parser = all_parsers - - kwds = { - "header": None, - "parse_dates": parse_dates, - "date_parser": pd.to_datetime, - } - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - **kwds, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [ - [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81], - [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - -0.99, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - -0.59, - ], - ], - columns=["actual", "nominal", 0, 4], - ) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. 
- result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -def test_multiple_date_col_timestamp_parse(all_parsers): - parser = all_parsers - data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 -05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - parse_dates=[[0, 1]], - header=None, - date_parser=Timestamp, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 1, - "E", - 0, - np.nan, - 1306.25, - ], - [ - Timestamp("05/31/2012, 15:30:00.029"), - 1306.25, - 8, - "E", - 0, - np.nan, - 1306.25, - ], - ], - columns=["0_1", 2, 3, 4, 5, 6, 7], - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_multiple_date_cols_with_header(all_parsers): - parser = all_parsers - data = """\ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - 
"KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,parse_dates,msg", - [ - ( - """\ -date_NominalTime,date,NominalTime -KORD1,19990127, 19:00:00 -KORD2,19990127, 20:00:00""", - [[1, 2]], - ("New date column already in dict date_NominalTime"), - ), - ( - """\ -ID,date,nominalTime -KORD,19990127, 19:00:00 -KORD,19990127, 20:00:00""", - {"ID": [1, 2]}, - "Date column ID already in dict", - ), - ], -) -def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): - parser = all_parsers - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), parse_dates=parse_dates) - - def test_date_parser_int_bug(all_parsers): # see gh-3071 parser = all_parsers @@ -859,37 +303,6 @@ def test_parse_dates_string(all_parsers): tm.assert_frame_equal(result, expected) -# Bug in https://github.com/dateutil/dateutil/issues/217 -# has been addressed, but we just don't pass in the `yearfirst` -@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") -@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) -def test_yy_format_with_year_first(all_parsers, parse_dates): - data = """date,time,B,C -090131,0010,1,2 -090228,1020,3,4 -090331,0830,5,6 -""" - parser = all_parsers - result = 
parser.read_csv_check_warnings( - UserWarning, - "Could not infer format", - StringIO(data), - index_col=0, - parse_dates=parse_dates, - ) - index = DatetimeIndex( - [ - datetime(2009, 1, 31, 0, 10, 0), - datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0), - ], - dtype=object, - name="date_time", - ) - expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): @@ -1026,282 +439,11 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is expected_tz -@xfail_pyarrow -@pytest.mark.parametrize( - "parse_dates,index_col", - [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], -) -def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): - parser = all_parsers - data = """ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD1", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD2", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD3", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD4", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD5", - " 21:56:00", - 
-0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD6", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - expected = expected.set_index("nominal") - - if not isinstance(parse_dates, dict): - expected.index.name = "date_NominalTime" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), parse_dates=parse_dates, index_col=index_col - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_multiple_date_cols_chunked(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, 
- 0.0, - 280.0, - ], - ], - columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"], - ) - expected = expected.set_index("nominal") - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with parser.read_csv( - StringIO(data), - parse_dates={"nominal": [1, 2]}, - index_col="nominal", - chunksize=2, - ) as reader: - chunks = list(reader) - - tm.assert_frame_equal(chunks[0], expected[:2]) - tm.assert_frame_equal(chunks[1], expected[2:4]) - tm.assert_frame_equal(chunks[2], expected[4:]) - - -def test_multiple_date_col_named_index_compat(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with_indices = parser.read_csv( - StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal" - ) - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with_names = parser.read_csv( - StringIO(data), - index_col="nominal", - parse_dates={"nominal": ["date", "nominalTime"]}, - ) - tm.assert_frame_equal(with_indices, with_names) - - -def test_multiple_date_col_multiple_index_compat(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E 
-KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - - expected = expected.set_index(["nominal", "ID"]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}]) def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): # see gh-5636 parser = all_parsers - msg = ( - "Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter" - ) + msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" @@ -1312,15 +454,12 @@ def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): @pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}]) def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser = all_parsers - msg = ( - "Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter" - ) + msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), 
parse_dates=(1,)) + parser.read_csv(StringIO(data), parse_dates=parse_dates) @pytest.mark.parametrize("value", ["nan", ""]) @@ -1463,240 +602,6 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow -def test_parse_date_time_multi_level_column_name(all_parsers): - data = """\ -D,T,A,B -date, time,a,b -2001-01-05, 09:00:00, 0.0, 10. -2001-01-06, 00:00:00, 1.0, 11. -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=[0, 1], - parse_dates={"date_time": [0, 1]}, - date_parser=pd.to_datetime, - ) - - expected_data = [ - [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0], - ] - expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")]) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - ( - """\ -date,time,a,b -2001-01-05, 10:00:00, 0.0, 10. -2001-01-05, 00:00:00, 1., 11. 
-""", - {"header": 0, "parse_dates": {"date_time": [0, 1]}}, - DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10], - [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0], - ], - columns=["date_time", "a", "b"], - ), - ), - ( - ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ), - {"header": None, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}}, - DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - 0.81, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - 0.01, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - -0.99, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - -0.59, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - -0.59, - ], - ], - columns=["actual", "nominal", 0, 4], - ), - ), - ], -) -def test_parse_date_time(all_parsers, data, kwargs, expected): - parser = all_parsers - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=pd.to_datetime, - **kwargs, - raise_on_extra_warnings=False, - ) - - # Python can sometimes be flaky about how - # the aggregated columns are entered, so - # this standardizes the order. - result = result[expected.columns] - tm.assert_frame_equal(result, expected) - - -def test_parse_date_fields(all_parsers): - parser = all_parsers - data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
- result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymd": [0, 1, 2]}, - date_parser=lambda x: x, - raise_on_extra_warnings=False, - ) - - expected = DataFrame( - [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], - columns=["ymd", "a"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ( - "date_parser", - lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), - FutureWarning, - ), - ("date_format", "%Y %m %d %H %M %S", None), - ], -) -def test_parse_date_all_fields(all_parsers, key, value, warn): - parser = all_parsers - data = """\ -year,month,day,hour,minute,second,a,b -2001,01,05,10,00,0,0.0,10. -2001,01,5,10,0,00,1.,11. -""" - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ("key", "value", "warn"), - [ - ( - "date_parser", - lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), - FutureWarning, - ), - ("date_format", "%Y %m %d %H %M %S.%f", None), - ], -) -def test_datetime_fractional_seconds(all_parsers, key, value, warn): - parser = all_parsers - data = """\ -year,month,day,hour,minute,second,a,b -2001,01,05,10,00,0.123456,0.0,10. -2001,01,5,10,0,0.500000,1.,11. 
-""" - result = parser.read_csv_check_warnings( - warn, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - **{key: value}, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [ - [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], - [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0], - ], - columns=["ymdHMS", "a", "b"], - ) - tm.assert_frame_equal(result, expected) - - -def test_generic(all_parsers): - parser = all_parsers - data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - - def parse_function(yy, mm): - return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)] - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - header=0, - parse_dates={"ym": [0, 1]}, - date_parser=parse_function, - raise_on_extra_warnings=False, - ) - expected = DataFrame( - [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], - columns=["ym", "day", "a"], - ) - expected["ym"] = expected["ym"].astype("datetime64[ns]") - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow def test_date_parser_resolution_if_not_ns(all_parsers): # see gh-10245 @@ -1895,11 +800,6 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): [ (None, ["val"], ["date", "time"], "date, time"), (None, ["val"], [0, "time"], "time"), - (None, ["val"], [["date", "time"]], "date, time"), - (None, ["val"], [[0, "time"]], "time"), - (None, ["val"], {"date": [0, "time"]}, "time"), - (None, ["val"], {"date": ["date", "time"]}, "date, time"), - (None, ["val"], [["date", "time"], "date"], "date, time"), (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"), ( ["date1", "time1", "temperature"], @@ -1917,20 +817,10 @@ def test_missing_parse_dates_column_raises( content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" - depr_msg = ( - "Support for 
nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - warn = FutureWarning - if isinstance(parse_dates, list) and all( - isinstance(x, (int, str)) for x in parse_dates - ): - warn = None - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - parser.read_csv( - content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates - ) + parser.read_csv( + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates + ) @xfail_pyarrow # mismatched shape @@ -1966,37 +856,6 @@ def test_date_parser_multiindex_columns(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: an integer is required -@pytest.mark.parametrize( - "parse_spec, col_name", - [ - ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")), - ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")), - ], -) -def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name): - parser = all_parsers - data = """a,b,c -1,2,3 -2019-12,-31,6""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), - parse_dates=parse_spec, - header=[0, 1], - ) - expected = DataFrame( - {col_name: Timestamp("2019-12-31").as_unit("ns"), ("c", "3"): [6]} - ) - tm.assert_frame_equal(result, expected) - - def test_date_parser_usecols_thousands(all_parsers): # GH#39365 data = """A,B,C @@ -2030,26 +889,6 @@ def test_date_parser_usecols_thousands(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # mismatched shape -def test_parse_dates_and_keep_original_column(all_parsers): - # GH#13378 - parser = all_parsers - data = """A -20150908 -20150909 -""" - depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, 
check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True - ) - expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")] - expected = DataFrame({"date": expected_data, "A": expected_data}) - tm.assert_frame_equal(result, expected) - - def test_dayfirst_warnings(): # GH 12585 From 9b53badafab3a36ef2fe24924c56607cd3729d5a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 May 2024 14:25:36 -0700 Subject: [PATCH 02/12] Add whatsnew, address other tests --- doc/source/whatsnew/v3.0.0.rst | 2 + pandas/io/parsers/base_parser.py | 2 +- pandas/io/parsers/c_parser_wrapper.py | 44 ++++--- .../io/parser/common/test_common_basic.py | 51 --------- pandas/tests/io/parser/test_parse_dates.py | 66 ----------- .../io/parser/usecols/test_parse_dates.py | 108 ------------------ 6 files changed, 23 insertions(+), 250 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 90923bfac8e62..01d8e34964c33 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -254,7 +254,9 @@ Removal of prior version deprecations/changes - Enforced deprecation of :meth:`offsets.Tick.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`) - Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. ``numpy.sum(df)`` (:issue:`21597`) - Enforced deprecation of ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock`` (:issue:`58467`) +- Enforced deprecation of ``keep_date_col`` keyword in :func:`read_csv` (:issue:`55569`) - Enforced deprecation of ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead. 
(:issue:`52550`) +- Enforced deprecation of combining parsed datetime columns in :func:`read_csv` in ``parse_dates`` (:issue:`55569`) - Enforced deprecation of non-standard (``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series`) argument to :func:`api.extensions.take` (:issue:`52981`) - Enforced deprecation of parsing system timezone strings to ``tzlocal``, which depended on system timezone, pass the 'tz' keyword instead (:issue:`50791`) - Enforced deprecation of passing a dictionary to :meth:`SeriesGroupBy.agg` (:issue:`52268`) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 962360a29aecb..c442fceec84da 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -122,7 +122,7 @@ def __init__(self, kwds) -> None: self.col_names: Sequence[Hashable] | None = None parse_dates = kwds.pop("parse_dates", False) - if isinstance(parse_dates, None) or lib.is_bool(parse_dates): + if parse_dates is None or lib.is_bool(parse_dates): parse_dates = bool(parse_dates) elif not isinstance(parse_dates, list): raise TypeError( diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 1baca9d48d795..0740978bbbc3c 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -166,29 +166,28 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: # error: Cannot determine type of 'names' self.orig_names = self.names # type: ignore[has-type] - if not self._has_complex_date_col: - # error: Cannot determine type of 'index_col' - if self._reader.leading_cols == 0 and is_index_col( - self.index_col # type: ignore[has-type] - ): - ( - index_names, - # error: Cannot determine type of 'names' - self.names, # type: ignore[has-type] - self.index_col, - ) = self._clean_index_names( - # error: Cannot determine type of 'names' - self.names, # type: ignore[has-type] - # error: Cannot determine type of 'index_col' - self.index_col, 
# type: ignore[has-type] - ) + # error: Cannot determine type of 'index_col' + if self._reader.leading_cols == 0 and is_index_col( + self.index_col # type: ignore[has-type] + ): + ( + index_names, + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + self.index_col, + ) = self._clean_index_names( + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + # error: Cannot determine type of 'index_col' + self.index_col, # type: ignore[has-type] + ) - if self.index_names is None: - self.index_names = index_names + if self.index_names is None: + self.index_names = index_names - if self._reader.header is None and not passed_names: - assert self.index_names is not None - self.index_names = [None] * len(self.index_names) + if self._reader.header is None and not passed_names: + assert self.index_names is not None + self.index_names = [None] * len(self.index_names) self._implicit_index = self._reader.leading_cols > 0 @@ -273,9 +272,6 @@ def read( names = self.names # type: ignore[has-type] if self._reader.leading_cols: - if self._has_complex_date_col: - raise NotImplementedError("file structure not yet supported") - # implicit index, no index names arrays = [] diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 485680d9de48c..d79e0c34edaab 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -22,14 +22,10 @@ from pandas import ( DataFrame, Index, - Timestamp, compat, ) import pandas._testing as tm -from pandas.io.parsers import TextFileReader -from pandas.io.parsers.c_parser_wrapper import CParserWrapper - pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) @@ -38,53 +34,6 @@ skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -def test_override_set_noconvert_columns(): - # see gh-17351 - # - # Usecols needs to be sorted 
in _set_noconvert_columns based - # on the test_usecols_with_parse_dates test from test_usecols.py - class MyTextFileReader(TextFileReader): - def __init__(self) -> None: - self._currow = 0 - self.squeeze = False - - class MyCParserWrapper(CParserWrapper): - def _set_noconvert_columns(self): - if self.usecols_dtype == "integer": - # self.usecols is a set, which is documented as unordered - # but in practice, a CPython set of integers is sorted. - # In other implementations this assumption does not hold. - # The following code simulates a different order, which - # before GH 17351 would cause the wrong columns to be - # converted via the parse_dates parameter - self.usecols = list(self.usecols) - self.usecols.reverse() - return CParserWrapper._set_noconvert_columns(self) - - data = """a,b,c,d,e -0,1,2014-01-01,09:00,4 -0,1,2014-01-02,10:00,4""" - - parse_dates = [[1, 2]] - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - parser = MyTextFileReader() - parser.options = { - "usecols": [0, 2, 3], - "parse_dates": parse_dates, - "delimiter": ",", - } - parser.engine = "c" - parser._engine = MyCParserWrapper(StringIO(data), **parser.options) - - result = parser.read() - tm.assert_frame_equal(result, expected) - - def test_read_csv_local(all_parsers, csv1): prefix = "file:///" if compat.is_platform_windows() else "file://" parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 96ff06ceafa3b..37d20b8b52d68 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -602,45 +602,6 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow -def test_date_parser_resolution_if_not_ns(all_parsers): - # see gh-10245 - parser = all_parsers - data = """\ 
-date,time,prn,rxstatus -2013-11-03,19:00:00,126,00E80000 -2013-11-03,19:00:00,23,00E80000 -2013-11-03,19:00:00,13,00E80000 -""" - - def date_parser(dt, time): - try: - arr = dt + "T" + time - except TypeError: - # dt & time are date/time objects - arr = [datetime.combine(d, t) for d, t in zip(dt, time)] - return np.array(arr, dtype="datetime64[s]") - - result = parser.read_csv_check_warnings( - FutureWarning, - "use 'date_format' instead", - StringIO(data), - date_parser=date_parser, - parse_dates={"datetime": ["date", "time"]}, - index_col=["datetime", "prn"], - ) - - datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]") - expected = DataFrame( - data={"rxstatus": ["00E80000"] * 3}, - index=MultiIndex.from_arrays( - [datetimes, [126, 23, 13]], - names=["datetime", "prn"], - ), - ) - tm.assert_frame_equal(result, expected) - - def test_parse_date_column_with_empty_string(all_parsers): # see gh-6428 parser = all_parsers @@ -1092,33 +1053,6 @@ def test_parse_dates_dict_format(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})] -) -def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates): - # GH#51240 - parser = all_parsers - data = """a,b -31-,12-2019 -31-,12-2020""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates - ) - expected = DataFrame( - { - key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - } - ) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow # object dtype index def test_parse_dates_dict_format_index(all_parsers): # GH#51240 diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 
ab98857e0c178..0cf3fe894c916 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -26,42 +26,6 @@ ) -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -def test_usecols_with_parse_dates(all_parsers, usecols): - # see gh-9755 - data = """a,b,c,d,e -0,1,2014-01-01,09:00,4 -0,1,2014-01-02,10:00,4""" - parser = all_parsers - parse_dates = [[1, 2]] - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - if parser.engine == "pyarrow": - with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - return - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - tm.assert_frame_equal(result, expected) - - @skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns def test_usecols_with_parse_dates2(all_parsers): # see gh-13604 @@ -121,75 +85,3 @@ def test_usecols_with_parse_dates3(all_parsers): result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) tm.assert_frame_equal(result, expected) - - -def test_usecols_with_parse_dates4(all_parsers): - data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" - usecols = list("abcdefghij") - parse_dates = [[0, 1]] - parser = all_parsers - - cols = { - "a_b": "2016/09/21 1", - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - - depr_msg = ( - "Support for nested sequences for 
'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), - usecols=usecols, - parse_dates=parse_dates, - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -@pytest.mark.parametrize( - "names", - [ - list("abcde"), # Names span all columns in original data. - list("acd"), # Names span only the selected columns. - ], -) -def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request): - # see gh-9755 - s = """0,1,2014-01-01,09:00,4 -0,1,2014-01-02,10:00,4""" - parse_dates = [[1, 2]] - parser = all_parsers - - if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0): - mark = pytest.mark.xfail( - reason="Length mismatch in some cases, UserWarning in other" - ) - request.applymarker(mark) - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols - ) - tm.assert_frame_equal(result, expected) From 1d7d7131c4f1f799f7b1f3dec15cba692a0c4efa Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 May 2024 14:34:26 -0700 Subject: [PATCH 03/12] Remove unnecessary reference --- pandas/io/parsers/base_parser.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index c442fceec84da..bc886cad05b94 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1212,11 +1212,9 @@ def _isindex(colspec): 
isinstance(index_names, list) and colspec in index_names ) - orig_names = columns - for colspec in parse_spec: if isinstance(colspec, int) and colspec not in data_dict: - colspec = orig_names[colspec] + colspec = columns[colspec] if _isindex(colspec): continue elif dtype_backend == "pyarrow": From def098b8713935110e79eb938424f59459dc98ed Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 May 2024 14:35:20 -0700 Subject: [PATCH 04/12] inline function --- pandas/io/parsers/base_parser.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index bc886cad05b94..284762227bc05 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1207,15 +1207,12 @@ def _process_date_conversion( columns, dtype_backend=lib.no_default, ) -> tuple[dict, list]: - def _isindex(colspec): - return (isinstance(index_col, list) and colspec in index_col) or ( - isinstance(index_names, list) and colspec in index_names - ) - for colspec in parse_spec: if isinstance(colspec, int) and colspec not in data_dict: colspec = columns[colspec] - if _isindex(colspec): + if (isinstance(index_col, list) and colspec in index_col) or ( + isinstance(index_names, list) and colspec in index_names + ): continue elif dtype_backend == "pyarrow": import pyarrow as pa From ec963a2ee4ea698811ae4304a314ba48ab8623cb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 May 2024 14:47:15 -0700 Subject: [PATCH 05/12] Remove os.remove --- doc/source/user_guide/io.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 5b91b1c048f39..04307c92a4d45 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -847,12 +847,6 @@ Performance-wise, you should try these methods of parsing dates in order: then use 
``to_datetime``. -.. ipython:: python - :suppress: - - os.remove("tmp.csv") - - .. _io.csv.mixed_timezones: Parsing a CSV with mixed timezones From 26d14502369c5b2bbf243d9b9c4fd1b0caf6552d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 May 2024 15:20:08 -0700 Subject: [PATCH 06/12] Address html and xml tests --- pandas/tests/io/test_html.py | 14 ------- pandas/tests/io/xml/test_xml_dtypes.py | 54 +------------------------- 2 files changed, 1 insertion(+), 67 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index f16f3a2a5c775..594c1d02b94cc 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1050,20 +1050,6 @@ def test_parse_dates_list(self, flavor_read_html): res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0) tm.assert_frame_equal(df, res[0]) - def test_parse_dates_combine(self, flavor_read_html): - raw_dates = Series(date_range("1/1/2001", periods=10)) - df = DataFrame( - { - "date": raw_dates.map(lambda x: str(x.date())), - "time": raw_dates.map(lambda x: str(x.time())), - } - ) - res = flavor_read_html( - StringIO(df.to_html()), parse_dates={"datetime": [1, 2]}, index_col=1 - ) - newdf = DataFrame({"datetime": raw_dates}) - tm.assert_frame_equal(newdf, res[0]) - def test_wikipedia_states_table(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") assert os.path.isfile(data), f"{data!r} is not a file" diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index a85576ff13f5c..1f290a673c554 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -378,58 +378,6 @@ def test_parse_dates_true(parser): tm.assert_frame_equal(df_iter, df_expected) -def test_parse_dates_dictionary(parser): - xml = """ - - - square - 360 - 4.0 - 2020 - 12 - 31 - - - circle - 360 - - 2021 - 12 - 31 - - - triangle - 
180 - 3.0 - 2022 - 12 - 31 - -""" - - df_result = read_xml( - StringIO(xml), parse_dates={"date_end": ["year", "month", "day"]}, parser=parser - ) - df_iter = read_xml_iterparse( - xml, - parser=parser, - parse_dates={"date_end": ["year", "month", "day"]}, - iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]}, - ) - - df_expected = DataFrame( - { - "date_end": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]), - "shape": ["square", "circle", "triangle"], - "degrees": [360, 360, 180], - "sides": [4.0, float("nan"), 3.0], - } - ) - - tm.assert_frame_equal(df_result, df_expected) - tm.assert_frame_equal(df_iter, df_expected) - - def test_day_first_parse_dates(parser): xml = """\ @@ -480,6 +428,6 @@ def test_day_first_parse_dates(parser): def test_wrong_parse_dates_type(xml_books, parser, iterparse): with pytest.raises( - TypeError, match=("Only booleans, lists, and dictionaries are accepted") + TypeError, match="Only booleans and lists are accepted are accepted" ): read_xml(xml_books, parse_dates={"date"}, parser=parser, iterparse=iterparse) From 0fba554afa9ee2d312b33852a76c56f550954327 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 May 2024 15:39:01 -0700 Subject: [PATCH 07/12] Typo --- pandas/tests/io/xml/test_xml_dtypes.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py index 1f290a673c554..96ef50f9d7149 100644 --- a/pandas/tests/io/xml/test_xml_dtypes.py +++ b/pandas/tests/io/xml/test_xml_dtypes.py @@ -427,7 +427,5 @@ def test_day_first_parse_dates(parser): def test_wrong_parse_dates_type(xml_books, parser, iterparse): - with pytest.raises( - TypeError, match="Only booleans and lists are accepted are accepted" - ): + with pytest.raises(TypeError, match="Only booleans and lists are accepted"): read_xml(xml_books, parse_dates={"date"}, parser=parser, iterparse=iterparse) From 
a736a348d2f60a3e8d7e073c4154b3fc50948619 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 May 2024 19:16:57 -0700 Subject: [PATCH 08/12] Simplify _process_date_conversion --- pandas/io/parsers/arrow_parser_wrapper.py | 4 ++-- pandas/io/parsers/base_parser.py | 25 ++++++++++++----------- pandas/io/parsers/c_parser_wrapper.py | 8 +++----- pandas/io/parsers/python_parser.py | 2 +- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index f8263a65ef5c7..8b6f7d5750ffe 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -174,8 +174,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: self.names = list(range(num_cols - len(self.names))) + self.names multi_index_named = False frame.columns = self.names - # we only need the frame not the names - _, frame = self._do_date_conversions(frame.columns, frame) + + frame = self._do_date_conversions(frame.columns, frame) if self.index_col is not None: index_to_set = self.index_col.copy() for i, item in enumerate(self.index_col): diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 284762227bc05..c217c08adaea7 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -814,25 +814,23 @@ def _do_date_conversions( self, names: Index, data: DataFrame, - ) -> tuple[Sequence[Hashable] | Index, DataFrame]: ... + ) -> DataFrame: ... @overload def _do_date_conversions( self, names: Sequence[Hashable], data: Mapping[Hashable, ArrayLike], - ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]: ... + ) -> Mapping[Hashable, ArrayLike]: ... 
@final def _do_date_conversions( self, names: Sequence[Hashable] | Index, data: Mapping[Hashable, ArrayLike] | DataFrame, - ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]: - # returns data, columns - + ) -> Mapping[Hashable, ArrayLike] | DataFrame: if isinstance(self.parse_dates, list): - data, names = _process_date_conversion( + return _process_date_conversion( data, self._date_conv, self.parse_dates, @@ -842,7 +840,7 @@ def _do_date_conversions( dtype_backend=self.dtype_backend, ) - return names, data + return data @final def _check_data_length( @@ -1199,14 +1197,14 @@ def converter(*date_cols, col: Hashable): def _process_date_conversion( - data_dict, + data_dict: Mapping[Hashable, ArrayLike] | DataFrame, converter: Callable, parse_spec: list, index_col, index_names, - columns, + columns: Sequence[Hashable] | Index, dtype_backend=lib.no_default, -) -> tuple[dict, list]: +) -> Mapping[Hashable, ArrayLike] | DataFrame: for colspec in parse_spec: if isinstance(colspec, int) and colspec not in data_dict: colspec = columns[colspec] @@ -1226,9 +1224,12 @@ def _process_date_conversion( # Pyarrow engine returns Series which we need to convert to # numpy array before converter, its a no-op for other parsers - data_dict[colspec] = converter(np.asarray(data_dict[colspec]), col=colspec) + result = converter(np.asarray(data_dict[colspec]), col=colspec) + # error: Unsupported target for indexed assignment + # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") + data_dict[colspec] = result # type: ignore[index] - return data_dict, columns + return data_dict def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 0740978bbbc3c..4de626288aa41 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -302,12 +302,10 @@ def read( data_tups = sorted(data.items()) data = {k: v for 
k, (i, v) in zip(names, data_tups)} - column_names, date_data = self._do_date_conversions(names, data) + date_data = self._do_date_conversions(names, data) # maybe create a mi on the columns - column_names = self._maybe_make_multi_index_columns( - column_names, self.col_names - ) + column_names = self._maybe_make_multi_index_columns(names, self.col_names) else: # rename dict keys @@ -330,7 +328,7 @@ def read( data = {k: v for k, (i, v) in zip(names, data_tups)} - names, date_data = self._do_date_conversions(names, data) + date_data = self._do_date_conversions(names, data) index, column_names = self._make_index(date_data, alldata, names) return index, column_names, date_data diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 5b4139edc19a9..f7d2aa2419429 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -289,7 +289,7 @@ def read( data, columns = self._exclude_implicit_index(alldata) conv_data = self._convert_data(data) - columns, conv_data = self._do_date_conversions(columns, conv_data) + conv_data = self._do_date_conversions(columns, conv_data) index, result_columns = self._make_index( conv_data, alldata, columns, indexnamerow From 546957e200bef00f876f3497d6e20a289deec828 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 8 May 2024 10:23:33 -0700 Subject: [PATCH 09/12] Remove _get_complex_date_index --- pandas/io/parsers/base_parser.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index a0afde1700ce0..2b2d5a3d2d4d6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -381,34 +381,6 @@ def ix(col): return index - @final - def _get_complex_date_index(self, data, col_names): - def _get_name(icol): - if isinstance(icol, str): - return icol - - if col_names is None: - raise ValueError(f"Must supply 
column order to use {icol!s} as index") - - for i, c in enumerate(col_names): - if i == icol: - return c - - to_remove = [] - index = [] - for idx in self.index_col: - name = _get_name(idx) - to_remove.append(name) - index.append(data[name]) - - # remove index items from content and columns, don't pop in - # loop - for c in sorted(to_remove, reverse=True): - data.pop(c) - col_names.remove(c) - - return index - @final def _clean_mapping(self, mapping): """converts col numbers to names""" From faf4c6f782ce9e7359eaf999a940000e426ee9ff Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 8 May 2024 12:18:56 -0700 Subject: [PATCH 10/12] Remove concat arrays for csv --- asv_bench/benchmarks/io/parsers.py | 25 +------ pandas/_libs/tslibs/parsing.pyi | 3 - pandas/_libs/tslibs/parsing.pyx | 81 ---------------------- pandas/io/parsers/base_parser.py | 14 ++-- pandas/tests/io/parser/test_parse_dates.py | 14 ---- 5 files changed, 6 insertions(+), 131 deletions(-) diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 1078837a8e395..d3fd5075a4707 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -1,10 +1,5 @@ -import numpy as np - try: - from pandas._libs.tslibs.parsing import ( - _does_string_look_like_datetime, - concat_date_cols, - ) + from pandas._libs.tslibs.parsing import _does_string_look_like_datetime except ImportError: # Avoid whole benchmark suite import failure on asv (currently 0.4) pass @@ -20,21 +15,3 @@ def setup(self, value): def time_check_datetimes(self, value): for obj in self.objects: _does_string_look_like_datetime(obj) - - -class ConcatDateCols: - params = ([1234567890, "AAAA"], [1, 2]) - param_names = ["value", "dim"] - - def setup(self, value, dim): - count_elem = 10000 - if dim == 1: - self.object = (np.array([value] * count_elem),) - if dim == 2: - self.object = ( - np.array([value] * count_elem), - np.array([value] * 
count_elem), - ) - - def time_check_concat(self, value, dim): - concat_date_cols(self.object) diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi index 40394f915d4b0..845bd9a5a5635 100644 --- a/pandas/_libs/tslibs/parsing.pyi +++ b/pandas/_libs/tslibs/parsing.pyi @@ -27,7 +27,4 @@ def guess_datetime_format( dt_str: str, dayfirst: bool | None = ..., ) -> str | None: ... -def concat_date_cols( - date_cols: tuple, -) -> npt.NDArray[np.object_]: ... def get_rule_month(source: str) -> str: ... diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 85ef3fd93ff09..3d930dab9a949 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -29,11 +29,6 @@ import numpy as np cimport numpy as cnp from numpy cimport ( - PyArray_GETITEM, - PyArray_ITER_DATA, - PyArray_ITER_NEXT, - PyArray_IterNew, - flatiter, float64_t, int64_t, ) @@ -75,8 +70,6 @@ import_pandas_datetime() from pandas._libs.tslibs.strptime import array_strptime -from pandas._libs.tslibs.util cimport is_array - cdef extern from "pandas/portable.h": int getdigit_ascii(char c, int default) nogil @@ -1132,80 +1125,6 @@ cdef object convert_to_unicode(object item, bint keep_trivial_numbers): return item -@cython.wraparound(False) -@cython.boundscheck(False) -def concat_date_cols(tuple date_cols) -> np.ndarray: - """ - Concatenates elements from numpy arrays in `date_cols` into strings. 
- - Parameters - ---------- - date_cols : tuple[ndarray] - - Returns - ------- - arr_of_rows : ndarray[object] - - Examples - -------- - >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object) - >>> times=np.array(['11:20', '10:45'], dtype=object) - >>> result = concat_date_cols((dates, times)) - >>> result - array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object) - """ - cdef: - Py_ssize_t rows_count = 0, col_count = len(date_cols) - Py_ssize_t col_idx, row_idx - list list_to_join - cnp.ndarray[object] iters - object[::1] iters_view - flatiter it - cnp.ndarray[object] result - object[::1] result_view - - if col_count == 0: - return np.zeros(0, dtype=object) - - if not all(is_array(array) for array in date_cols): - raise ValueError("not all elements from date_cols are numpy arrays") - - rows_count = min(len(array) for array in date_cols) - result = np.zeros(rows_count, dtype=object) - result_view = result - - if col_count == 1: - array = date_cols[0] - it = PyArray_IterNew(array) - for row_idx in range(rows_count): - item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - result_view[row_idx] = convert_to_unicode(item, True) - PyArray_ITER_NEXT(it) - else: - # create fixed size list - more efficient memory allocation - list_to_join = [None] * col_count - iters = np.zeros(col_count, dtype=object) - - # create memoryview of iters ndarray, that will contain some - # flatiter's for each array in `date_cols` - more efficient indexing - iters_view = iters - for col_idx, array in enumerate(date_cols): - iters_view[col_idx] = PyArray_IterNew(array) - - # array elements that are on the same line are converted to one string - for row_idx in range(rows_count): - for col_idx, array in enumerate(date_cols): - # this cast is needed, because we did not find a way - # to efficiently store `flatiter` type objects in ndarray - it = iters_view[col_idx] - item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - list_to_join[col_idx] = convert_to_unicode(item, False) - 
PyArray_ITER_NEXT(it) - result_view[row_idx] = " ".join(list_to_join) - - return result - - cpdef str get_rule_month(str source): """ Return starting month of given freq, default is December. diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 2b2d5a3d2d4d6..c6cc85b9f722b 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -22,7 +22,6 @@ ) import pandas._libs.ops as libops from pandas._libs.parsers import STR_NA_VALUES -from pandas._libs.tslibs import parsing from pandas.compat._optional import import_optional_dependency from pandas.errors import ( ParserError, @@ -32,7 +31,6 @@ from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( - ensure_object, is_bool_dtype, is_dict_like, is_extension_array_dtype, @@ -1047,17 +1045,15 @@ def _make_date_converter( cache_dates: bool = True, date_format: dict[Hashable, str] | str | None = None, ): - def converter(*date_cols, col: Hashable): - if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm": - return date_cols[0] - # TODO: Can we remove concat_date_cols after deprecation of parsing - # multiple cols? 
- strs = parsing.concat_date_cols(date_cols) + def converter(date_col, col: Hashable): + if date_col.dtype.kind in "Mm": + return date_col + date_fmt = ( date_format.get(col) if isinstance(date_format, dict) else date_format ) - str_objs = ensure_object(strs) + str_objs = lib.ensure_string_array(date_col) try: result = tools.to_datetime( str_objs, diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index e21503c04d3a2..a0c7fd8df7f52 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -14,8 +14,6 @@ import pytest import pytz -from pandas._libs.tslibs import parsing - import pandas as pd from pandas import ( DataFrame, @@ -39,18 +37,6 @@ skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@pytest.mark.parametrize("container", [list, tuple, Index, Series]) -@pytest.mark.parametrize("dim", [1, 2]) -def test_concat_date_col_fail(container, dim): - msg = "not all elements from date_cols are numpy arrays" - value = "19990127" - - date_cols = tuple(container([value]) for _ in range(dim)) - - with pytest.raises(ValueError, match=msg): - parsing.concat_date_cols(date_cols) - - def test_date_col_as_index_col(all_parsers): data = """\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 From 42691e4a74ce60374602c7e140d0b1f2d6d59f9f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 8 May 2024 13:47:32 -0700 Subject: [PATCH 11/12] Unexfail test --- pandas/tests/io/parser/test_parse_dates.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index a0c7fd8df7f52..3bb3d793606e1 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -293,7 +293,6 @@ def test_bad_date_parse_with_warning(all_parsers, cache, value): ) -@xfail_pyarrow def 
test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers From 64fc335fbe6a5048f3452e16b0abf3bcc9f7e021 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 8 May 2024 16:04:28 -0700 Subject: [PATCH 12/12] Remove convert to unicode --- pandas/_libs/tslibs/parsing.pyx | 42 +-------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 3d930dab9a949..c448a7e7c01b5 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -7,7 +7,6 @@ import warnings from pandas.util._exceptions import find_stack_level -cimport cython from cpython.datetime cimport ( datetime, datetime_new, @@ -18,7 +17,6 @@ from cpython.datetime cimport ( from datetime import timezone -from cpython.object cimport PyObject_Str from cpython.unicode cimport PyUnicode_AsUTF8AndSize from cython cimport Py_ssize_t from libc.string cimport strchr @@ -28,10 +26,7 @@ import_datetime() import numpy as np cimport numpy as cnp -from numpy cimport ( - float64_t, - int64_t, -) +from numpy cimport int64_t cnp.import_array() @@ -1090,41 +1085,6 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst) noexcept: ) -@cython.wraparound(False) -@cython.boundscheck(False) -cdef object convert_to_unicode(object item, bint keep_trivial_numbers): - """ - Convert `item` to str. 
- - Parameters - ---------- - item : object - keep_trivial_numbers : bool - if True, then conversion (to string from integer/float zero) - is not performed - - Returns - ------- - str or int or float - """ - cdef: - float64_t float_item - - if keep_trivial_numbers: - if isinstance(item, int): - if item == 0: - return item - elif isinstance(item, float): - float_item = item - if float_item == 0.0 or float_item != float_item: - return item - - if not isinstance(item, str): - item = PyObject_Str(item) - - return item - - cpdef str get_rule_month(str source): """ Return starting month of given freq, default is December.