From c759fc2fba249e790b1928a39e5797d7a3461696 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 7 May 2024 13:57:59 -0700
Subject: [PATCH 01/12] CLN: Enforce read_csv(keep_date_col, parse_dates)
deprecations
---
asv_bench/benchmarks/io/csv.py | 10 -
doc/source/user_guide/io.rst | 75 +-
pandas/io/parsers/base_parser.py | 235 +---
pandas/io/parsers/c_parser_wrapper.py | 1 -
pandas/io/parsers/python_parser.py | 11 +-
pandas/io/parsers/readers.py | 70 +-
pandas/tests/io/parser/test_parse_dates.py | 1173 +-------------------
7 files changed, 53 insertions(+), 1522 deletions(-)
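Both enforced deprecations share one migration path: read the component
columns as plain text, combine them with ``pd.to_datetime`` after parsing,
and drop the originals explicitly if they are not wanted. A minimal sketch,
assuming illustrative column positions and a ``%Y%m%d %H:%M:%S`` layout
(neither is part of this patch):

    import io

    import pandas as pd

    data = "KORD,19990127, 19:00:00, 0.81\n"
    df = pd.read_csv(io.StringIO(data), header=None)

    # Previously parse_dates=[[1, 2]] built this combined column in read_csv.
    df["1_2"] = pd.to_datetime(df[1].astype(str) + df[2], format="%Y%m%d %H:%M:%S")

    # Previously keep_date_col=False (the default) dropped the originals.
    df = df.drop(columns=[1, 2])
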
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index dae6107db4d92..ff0ccffced0f3 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -445,16 +445,6 @@ def setup(self, engine):
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)
- def time_multiple_date(self, engine):
- read_csv(
- self.data(self.StringIO_input),
- engine=engine,
- sep=",",
- header=None,
- names=list(string.digits[:9]),
- parse_dates=[[1, 2], [1, 3]],
- )
-
def time_baseline(self, engine):
read_csv(
self.data(self.StringIO_input),
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index b5cc8c43ae143..c62e803b797b0 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -270,9 +270,6 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default
* If ``True`` -> try parsing the index.
* If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date
column.
- * If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date
- column.
- * If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'.
.. note::
A fast-path exists for iso8601-formatted dates.
@@ -282,9 +279,6 @@ infer_datetime_format : boolean, default ``False``
.. deprecated:: 2.0.0
A strict version of this argument is now the default, passing it has no effect.
-keep_date_col : boolean, default ``False``
- If ``True`` and parse_dates specifies combining multiple columns then keep the
- original columns.
date_parser : function, default ``None``
Function to use for converting a sequence of string columns to an array of
datetime instances. The default uses ``dateutil.parser.parser`` to do the
@@ -829,71 +823,8 @@ The simplest case is to just pass in ``parse_dates=True``:
It is often the case that we may want to store date and time data separately,
or store various date fields separately. The ``parse_dates`` keyword can be
-used to specify a combination of columns to parse the dates and/or times from.
-
-You can specify a list of column lists to ``parse_dates``, the resulting date
-columns will be prepended to the output (so as to not affect the existing column
-order) and the new column names will be the concatenation of the component
-column names:
-
-.. ipython:: python
- :okwarning:
-
- data = (
- "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
- "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
- "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
- "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
- "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
- "KORD,19990127, 23:00:00, 22:56:00, -0.5900"
- )
-
- with open("tmp.csv", "w") as fh:
- fh.write(data)
-
- df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]])
- df
-
-By default the parser removes the component date columns, but you can choose
-to retain them via the ``keep_date_col`` keyword:
-
-.. ipython:: python
- :okwarning:
-
- df = pd.read_csv(
- "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True
- )
- df
-
-Note that if you wish to combine multiple columns into a single date column, a
-nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that
-the second and third columns should each be parsed as separate date columns
-while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a
-single column.
-
-You can also use a dict to specify custom name columns:
-
-.. ipython:: python
- :okwarning:
-
- date_spec = {"nominal": [1, 2], "actual": [1, 3]}
- df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)
- df
-
-It is important to remember that if multiple text columns are to be parsed into
-a single date column, then a new column is prepended to the data. The ``index_col``
-specification is based off of this new set of columns rather than the original
-data columns:
-
+used to specify the columns from which to parse dates and/or times.
-.. ipython:: python
- :okwarning:
-
- date_spec = {"nominal": [1, 2], "actual": [1, 3]}
- df = pd.read_csv(
- "tmp.csv", header=None, parse_dates=date_spec, index_col=0
- ) # index is the nominal column
- df
.. note::
If a column or index contains an unparsable date, the entire column or
@@ -907,10 +838,6 @@ data columns:
for your data to store datetimes in this format, load times will be
significantly faster, ~20x has been observed.
-.. deprecated:: 2.2.0
- Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime``
- on the relevant result columns instead.
-
Date parsing functions
++++++++++++++++++++++
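
For the removed dict form (the ``date_spec = {"nominal": [1, 2], "actual": [1, 3]}``
example above), the same post-parse combination applies, with the target names
assigned directly. A sketch against the six-row ``tmp.csv`` sample from the
removed example (the format string is an assumption):

    df = pd.read_csv("tmp.csv", header=None)
    date = df[1].astype(str)
    df["nominal"] = pd.to_datetime(date + df[2], format="%Y%m%d %H:%M:%S")
    df["actual"] = pd.to_datetime(date + df[3], format="%Y%m%d %H:%M:%S")
    # Replaces parse_dates=date_spec, index_col=0 from the removed example.
    df = df.set_index("nominal")
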
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 510097aed2a25..962360a29aecb 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -5,7 +5,6 @@
import csv
import datetime
from enum import Enum
-import itertools
from typing import (
TYPE_CHECKING,
Any,
@@ -43,7 +42,6 @@
is_integer_dtype,
is_list_like,
is_object_dtype,
- is_scalar,
is_string_dtype,
pandas_dtype,
)
@@ -58,7 +56,6 @@
DataFrame,
DatetimeIndex,
StringDtype,
- concat,
)
from pandas.core import algorithms
from pandas.core.arrays import (
@@ -111,7 +108,6 @@ class BadLineHandleMethod(Enum):
keep_default_na: bool
dayfirst: bool
cache_dates: bool
- keep_date_col: bool
usecols_dtype: str | None
def __init__(self, kwds) -> None:
@@ -125,12 +121,19 @@ def __init__(self, kwds) -> None:
self.index_names: Sequence[Hashable] | None = None
self.col_names: Sequence[Hashable] | None = None
- self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
- self._parse_date_cols: Iterable = []
+ parse_dates = kwds.pop("parse_dates", False)
+ if isinstance(parse_dates, None) or lib.is_bool(parse_dates):
+ parse_dates = bool(parse_dates)
+ elif not isinstance(parse_dates, list):
+ raise TypeError(
+ "Only booleans and lists are accepted "
+ "for the 'parse_dates' parameter"
+ )
+ self.parse_dates: bool | list = parse_dates
+ self._parse_date_cols: set = set()
self.date_parser = kwds.pop("date_parser", lib.no_default)
self.date_format = kwds.pop("date_format", None)
self.dayfirst = kwds.pop("dayfirst", False)
- self.keep_date_col = kwds.pop("keep_date_col", False)
self.na_values = kwds.get("na_values")
self.na_fvalues = kwds.get("na_fvalues")
@@ -180,8 +183,6 @@ def __init__(self, kwds) -> None:
else:
self.index_col = list(self.index_col)
- self._name_processed = False
-
self._first_chunk = True
self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
@@ -190,7 +191,7 @@ def __init__(self, kwds) -> None:
# Normally, this arg would get pre-processed earlier on
self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)
- def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
+ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set:
"""
Check if parse_dates are in columns.
@@ -204,7 +205,7 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl
Returns
-------
- The names of the columns which will get parsed later if a dict or list
+ The names of the columns which will get parsed later if a list
is given as specification.
Raises
@@ -213,30 +214,15 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl
If column to parse_date is not in dataframe.
"""
- cols_needed: Iterable
- if is_dict_like(self.parse_dates):
- cols_needed = itertools.chain(*self.parse_dates.values())
- elif is_list_like(self.parse_dates):
- # a column in parse_dates could be represented
- # ColReference = Union[int, str]
- # DateGroups = List[ColReference]
- # ParseDates = Union[DateGroups, List[DateGroups],
- # Dict[ColReference, DateGroups]]
- cols_needed = itertools.chain.from_iterable(
- col if is_list_like(col) and not isinstance(col, tuple) else [col]
- for col in self.parse_dates
- )
- else:
- cols_needed = []
-
- cols_needed = list(cols_needed)
+ if not isinstance(self.parse_dates, list):
+ return set()
# get only columns that are references using names (str), not by index
missing_cols = ", ".join(
sorted(
{
col
- for col in cols_needed
+ for col in self.parse_dates
if isinstance(col, str) and col not in columns
}
)
@@ -246,27 +232,18 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl
f"Missing column provided to 'parse_dates': '{missing_cols}'"
)
# Convert positions to actual column names
- return [
+ return {
col if (isinstance(col, str) or col in columns) else columns[col]
- for col in cols_needed
- ]
+ for col in self.parse_dates
+ }
def close(self) -> None:
pass
- @final
- @property
- def _has_complex_date_col(self) -> bool:
- return isinstance(self.parse_dates, dict) or (
- isinstance(self.parse_dates, list)
- and len(self.parse_dates) > 0
- and isinstance(self.parse_dates[0], list)
- )
-
@final
def _should_parse_dates(self, i: int) -> bool:
- if lib.is_bool(self.parse_dates):
- return bool(self.parse_dates)
+ if isinstance(self.parse_dates, bool):
+ return self.parse_dates
else:
if self.index_names is not None:
name = self.index_names[i]
@@ -368,18 +345,9 @@ def _make_index(
index: Index | None
if not is_index_col(self.index_col) or not self.index_col:
index = None
-
- elif not self._has_complex_date_col:
+ else:
simple_index = self._get_simple_index(alldata, columns)
index = self._agg_index(simple_index)
- elif self._has_complex_date_col:
- if not self._name_processed:
- (self.index_names, _, self.index_col) = self._clean_index_names(
- list(columns), self.index_col
- )
- self._name_processed = True
- date_index = self._get_complex_date_index(data, columns)
- index = self._agg_index(date_index, try_parse_dates=False)
# add names for the index
if indexnamerow:
@@ -645,19 +613,7 @@ def _set(x) -> int:
if isinstance(self.parse_dates, list):
for val in self.parse_dates:
- if isinstance(val, list):
- for k in val:
- noconvert_columns.add(_set(k))
- else:
- noconvert_columns.add(_set(val))
-
- elif isinstance(self.parse_dates, dict):
- for val in self.parse_dates.values():
- if isinstance(val, list):
- for k in val:
- noconvert_columns.add(_set(k))
- else:
- noconvert_columns.add(_set(val))
+ noconvert_columns.add(_set(val))
elif self.parse_dates:
if isinstance(self.index_col, list):
@@ -875,7 +831,7 @@ def _do_date_conversions(
) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
# returns data, columns
- if self.parse_dates is not None:
+ if isinstance(self.parse_dates, list):
data, names = _process_date_conversion(
data,
self._date_conv,
@@ -883,7 +839,6 @@ def _do_date_conversions(
self.index_col,
self.index_names,
names,
- keep_date_col=self.keep_date_col,
dtype_backend=self.dtype_backend,
)
@@ -1228,7 +1183,6 @@ def converter(*date_cols, col: Hashable):
"decimal": ".",
# 'engine': 'c',
"parse_dates": False,
- "keep_date_col": False,
"dayfirst": False,
"date_parser": lib.no_default,
"date_format": None,
@@ -1247,11 +1201,10 @@ def converter(*date_cols, col: Hashable):
def _process_date_conversion(
data_dict,
converter: Callable,
- parse_spec,
+ parse_spec: list,
index_col,
index_names,
columns,
- keep_date_col: bool = False,
dtype_backend=lib.no_default,
) -> tuple[dict, list]:
def _isindex(colspec):
@@ -1259,111 +1212,28 @@ def _isindex(colspec):
isinstance(index_names, list) and colspec in index_names
)
- new_cols = []
- new_data = {}
-
orig_names = columns
- columns = list(columns)
-
- date_cols = set()
-
- if parse_spec is None or isinstance(parse_spec, bool):
- return data_dict, columns
-
- if isinstance(parse_spec, list):
- # list of column lists
- for colspec in parse_spec:
- if is_scalar(colspec) or isinstance(colspec, tuple):
- if isinstance(colspec, int) and colspec not in data_dict:
- colspec = orig_names[colspec]
- if _isindex(colspec):
- continue
- elif dtype_backend == "pyarrow":
- import pyarrow as pa
-
- dtype = data_dict[colspec].dtype
- if isinstance(dtype, ArrowDtype) and (
- pa.types.is_timestamp(dtype.pyarrow_dtype)
- or pa.types.is_date(dtype.pyarrow_dtype)
- ):
- continue
-
- # Pyarrow engine returns Series which we need to convert to
- # numpy array before converter, its a no-op for other parsers
- data_dict[colspec] = converter(
- np.asarray(data_dict[colspec]), col=colspec
- )
- else:
- new_name, col, old_names = _try_convert_dates(
- converter, colspec, data_dict, orig_names
- )
- if new_name in data_dict:
- raise ValueError(f"New date column already in dict {new_name}")
- new_data[new_name] = col
- new_cols.append(new_name)
- date_cols.update(old_names)
-
- elif isinstance(parse_spec, dict):
- # dict of new name to column list
- for new_name, colspec in parse_spec.items():
- if new_name in data_dict:
- raise ValueError(f"Date column {new_name} already in dict")
-
- _, col, old_names = _try_convert_dates(
- converter,
- colspec,
- data_dict,
- orig_names,
- target_name=new_name,
- )
-
- new_data[new_name] = col
-
- # If original column can be converted to date we keep the converted values
- # This can only happen if values are from single column
- if len(colspec) == 1:
- new_data[colspec[0]] = col
-
- new_cols.append(new_name)
- date_cols.update(old_names)
-
- if isinstance(data_dict, DataFrame):
- data_dict = concat([DataFrame(new_data), data_dict], axis=1)
- else:
- data_dict.update(new_data)
- new_cols.extend(columns)
-
- if not keep_date_col:
- for c in list(date_cols):
- data_dict.pop(c)
- new_cols.remove(c)
-
- return data_dict, new_cols
+ for colspec in parse_spec:
+ if isinstance(colspec, int) and colspec not in data_dict:
+ colspec = orig_names[colspec]
+ if _isindex(colspec):
+ continue
+ elif dtype_backend == "pyarrow":
+ import pyarrow as pa
+
+ dtype = data_dict[colspec].dtype
+ if isinstance(dtype, ArrowDtype) and (
+ pa.types.is_timestamp(dtype.pyarrow_dtype)
+ or pa.types.is_date(dtype.pyarrow_dtype)
+ ):
+ continue
-def _try_convert_dates(
- parser: Callable, colspec, data_dict, columns, target_name: str | None = None
-):
- colset = set(columns)
- colnames = []
-
- for c in colspec:
- if c in colset:
- colnames.append(c)
- elif isinstance(c, int) and c not in columns:
- colnames.append(columns[c])
- else:
- colnames.append(c)
-
- new_name: tuple | str
- if all(isinstance(x, tuple) for x in colnames):
- new_name = tuple(map("_".join, zip(*colnames)))
- else:
- new_name = "_".join([str(x) for x in colnames])
- to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]
+ # The pyarrow engine returns a Series, which we convert to a numpy
+ # array before the converter; it's a no-op for other parsers
+ data_dict[colspec] = converter(np.asarray(data_dict[colspec]), col=colspec)
- new_col = parser(*to_parse, col=new_name if target_name is None else target_name)
- return new_name, new_col, colnames
+ return data_dict, columns
def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
@@ -1401,26 +1271,5 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
return na_values, na_fvalues
-def _validate_parse_dates_arg(parse_dates):
- """
- Check whether or not the 'parse_dates' parameter
- is a non-boolean scalar. Raises a ValueError if
- that is the case.
- """
- msg = (
- "Only booleans, lists, and dictionaries are accepted "
- "for the 'parse_dates' parameter"
- )
-
- if not (
- parse_dates is None
- or lib.is_bool(parse_dates)
- or isinstance(parse_dates, (list, dict))
- ):
- raise TypeError(msg)
-
- return parse_dates
-
-
def is_index_col(col) -> bool:
return col is not None and col is not False
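
With the inlined validation above, ``parse_dates`` accepts only booleans and
flat lists; dicts, tuples, sets, and arrays now raise ``TypeError`` up front.
A sketch of the resulting behavior, against a build that includes this series
(including the ``is None`` fix from the follow-up commit):

    import io

    import pandas as pd

    data = "a,b\n2003-11-01,1\n"

    pd.read_csv(io.StringIO(data), parse_dates=["a"])  # list: accepted
    pd.read_csv(io.StringIO(data), parse_dates=False)  # bool: accepted

    try:
        pd.read_csv(io.StringIO(data), parse_dates={"d": ["a"]})  # dict: rejected
    except TypeError as err:
        # Only booleans and lists are accepted for the 'parse_dates' parameter
        print(err)
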
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index 6e5d36ad39c8a..1baca9d48d795 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -171,7 +171,6 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
if self._reader.leading_cols == 0 and is_index_col(
self.index_col # type: ignore[has-type]
):
- self._name_processed = True
(
index_names,
# error: Cannot determine type of 'names'
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index e2456b165fe60..5b4139edc19a9 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -150,14 +150,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
# get popped off for index
self.orig_names: list[Hashable] = list(self.columns)
- # needs to be cleaned/refactored
- # multiple date column thing turning into a real spaghetti factory
-
- if not self._has_complex_date_col:
- (index_names, self.orig_names, self.columns) = self._get_index_name()
- self._name_processed = True
- if self.index_names is None:
- self.index_names = index_names
+ index_names, self.orig_names, self.columns = self._get_index_name()
+ if self.index_names is None:
+ self.index_names = index_names
if self._col_indices is None:
self._col_indices = list(range(len(self.columns)))
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 70f9a68244164..d13d0e22b2b07 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -40,7 +40,6 @@
from pandas.core.dtypes.common import (
is_file_like,
is_float,
- is_hashable,
is_integer,
is_list_like,
pandas_dtype,
@@ -119,7 +118,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
skip_blank_lines: bool
parse_dates: bool | Sequence[Hashable] | None
infer_datetime_format: bool | lib.NoDefault
- keep_date_col: bool | lib.NoDefault
date_parser: Callable | lib.NoDefault
date_format: str | dict[Hashable, str] | None
dayfirst: bool
@@ -302,8 +300,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
performance of reading a large file.
skip_blank_lines : bool, default True
If ``True``, skip over blank lines rather than interpreting as ``NaN`` values.
-parse_dates : bool, None, list of Hashable, list of lists or dict of {{Hashable : \
-list}}, default None
+parse_dates : bool, None, list of Hashable, default None
The behavior is as follows:
* ``bool``. If ``True`` -> try parsing the index.
@@ -311,10 +308,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
specified.
* ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3
each as a separate date column.
- * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse
- as a single date column. Values are joined with a space before parsing.
- * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call
- result 'foo'. Values are joined with a space before parsing.
If a column or index cannot be represented as an array of ``datetime``,
say because of an unparsable value or a mixture of timezones, the column
@@ -332,9 +325,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
.. deprecated:: 2.0.0
A strict version of this argument is now the default, passing it has no effect.
-keep_date_col : bool, default False
- If ``True`` and ``parse_dates`` specifies combining multiple columns then
- keep the original columns.
date_parser : Callable, optional
Function to use for converting a sequence of string columns to an array of
``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the
@@ -759,7 +749,6 @@ def read_csv(
# Datetime Handling
parse_dates: bool | Sequence[Hashable] | None = None,
infer_datetime_format: bool | lib.NoDefault = lib.no_default,
- keep_date_col: bool | lib.NoDefault = lib.no_default,
date_parser: Callable | lib.NoDefault = lib.no_default,
date_format: str | dict[Hashable, str] | None = None,
dayfirst: bool = False,
@@ -790,38 +779,6 @@ def read_csv(
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
- if keep_date_col is not lib.no_default:
- # GH#55569
- warnings.warn(
- "The 'keep_date_col' keyword in pd.read_csv is deprecated and "
- "will be removed in a future version. Explicitly remove unwanted "
- "columns after parsing instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- else:
- keep_date_col = False
-
- if lib.is_list_like(parse_dates):
- # GH#55569
- depr = False
- # error: Item "bool" of "bool | Sequence[Hashable] | None" has no
- # attribute "__iter__" (not iterable)
- if not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr]
- depr = True
- elif isinstance(parse_dates, dict) and any(
- lib.is_list_like(x) for x in parse_dates.values()
- ):
- depr = True
- if depr:
- warnings.warn(
- "Support for nested sequences for 'parse_dates' in pd.read_csv "
- "is deprecated. Combine the desired columns with pd.to_datetime "
- "after parsing instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
-
if infer_datetime_format is not lib.no_default:
warnings.warn(
"The argument 'infer_datetime_format' is deprecated and will "
@@ -950,7 +907,6 @@ def read_table(
# Datetime Handling
parse_dates: bool | Sequence[Hashable] | None = None,
infer_datetime_format: bool | lib.NoDefault = lib.no_default,
- keep_date_col: bool | lib.NoDefault = lib.no_default,
date_parser: Callable | lib.NoDefault = lib.no_default,
date_format: str | dict[Hashable, str] | None = None,
dayfirst: bool = False,
@@ -981,29 +937,6 @@ def read_table(
storage_options: StorageOptions | None = None,
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
- if keep_date_col is not lib.no_default:
- # GH#55569
- warnings.warn(
- "The 'keep_date_col' keyword in pd.read_table is deprecated and "
- "will be removed in a future version. Explicitly remove unwanted "
- "columns after parsing instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- else:
- keep_date_col = False
-
- # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__"
- if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr]
- # GH#55569
- warnings.warn(
- "Support for nested sequences for 'parse_dates' in pd.read_table "
- "is deprecated. Combine the desired columns with pd.to_datetime "
- "after parsing instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
-
if infer_datetime_format is not lib.no_default:
warnings.warn(
"The argument 'infer_datetime_format' is deprecated and will "
@@ -1671,7 +1604,6 @@ def TextParser(*args, **kwds) -> TextFileReader:
comment : str, optional
Comment out remainder of line
parse_dates : bool, default False
- keep_date_col : bool, default False
date_parser : function, optional
.. deprecated:: 2.0.0
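
The trimmed docstring leaves ``parse_dates`` as a boolean or a flat list of
labels/positions, each parsed as a separate column; per-column formats go
through ``date_format``, the replacement the tests point at with "use
'date_format' instead". A sketch with assumed column names:

    import io

    import pandas as pd

    data = "a,b,c\n2003-11-01,04:20:32,1\n"
    df = pd.read_csv(
        io.StringIO(data),
        parse_dates=["a", "b"],  # each column parsed separately
        date_format={"a": "%Y-%m-%d", "b": "%H:%M:%S"},
    )
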
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 8968948df5fa9..96ff06ceafa3b 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -4,7 +4,6 @@
"""
from datetime import (
- date,
datetime,
timedelta,
timezone,
@@ -116,192 +115,6 @@ def __custom_date_parser(time):
tm.assert_frame_equal(result, expected)
-@xfail_pyarrow
-def test_separator_date_conflict(all_parsers):
- # Regression test for gh-4678
- #
- # Make sure thousands separator and
- # date parsing do not conflict.
- parser = all_parsers
- data = "06-02-2013;13:00;1-000.215"
- expected = DataFrame(
- [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2]
- )
-
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- df = parser.read_csv(
- StringIO(data),
- sep=";",
- thousands="-",
- parse_dates={"Date": [0, 1]},
- header=None,
- )
- tm.assert_frame_equal(df, expected)
-
-
-@pytest.mark.parametrize("keep_date_col", [True, False])
-def test_multiple_date_col_custom(all_parsers, keep_date_col, request):
- data = """\
-KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
-KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
-KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
-KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
-KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
-KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
-"""
- parser = all_parsers
-
- if keep_date_col and parser.engine == "pyarrow":
- # For this to pass, we need to disable auto-inference on the date columns
- # in parse_dates. We have no way of doing this though
- mark = pytest.mark.xfail(
- reason="pyarrow doesn't support disabling auto-inference on column numbers."
- )
- request.applymarker(mark)
-
- def date_parser(*date_cols):
- """
- Test date parser.
-
- Parameters
- ----------
- date_cols : args
- The list of data columns to parse.
-
- Returns
- -------
- parsed : Series
- """
- return parsing.try_parse_dates(
- parsing.concat_date_cols(date_cols), parser=du_parse
- )
-
- kwds = {
- "header": None,
- "date_parser": date_parser,
- "parse_dates": {"actual": [1, 2], "nominal": [1, 3]},
- "keep_date_col": keep_date_col,
- "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"],
- }
- result = parser.read_csv_check_warnings(
- FutureWarning,
- "use 'date_format' instead",
- StringIO(data),
- **kwds,
- raise_on_extra_warnings=False,
- )
-
- expected = DataFrame(
- [
- [
- datetime(1999, 1, 27, 19, 0),
- datetime(1999, 1, 27, 18, 56),
- "KORD",
- "19990127",
- " 19:00:00",
- " 18:56:00",
- 0.81,
- 2.81,
- 7.2,
- 0.0,
- 280.0,
- ],
- [
- datetime(1999, 1, 27, 20, 0),
- datetime(1999, 1, 27, 19, 56),
- "KORD",
- "19990127",
- " 20:00:00",
- " 19:56:00",
- 0.01,
- 2.21,
- 7.2,
- 0.0,
- 260.0,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- datetime(1999, 1, 27, 20, 56),
- "KORD",
- "19990127",
- " 21:00:00",
- " 20:56:00",
- -0.59,
- 2.21,
- 5.7,
- 0.0,
- 280.0,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- datetime(1999, 1, 27, 21, 18),
- "KORD",
- "19990127",
- " 21:00:00",
- " 21:18:00",
- -0.99,
- 2.01,
- 3.6,
- 0.0,
- 270.0,
- ],
- [
- datetime(1999, 1, 27, 22, 0),
- datetime(1999, 1, 27, 21, 56),
- "KORD",
- "19990127",
- " 22:00:00",
- " 21:56:00",
- -0.59,
- 1.71,
- 5.1,
- 0.0,
- 290.0,
- ],
- [
- datetime(1999, 1, 27, 23, 0),
- datetime(1999, 1, 27, 22, 56),
- "KORD",
- "19990127",
- " 23:00:00",
- " 22:56:00",
- -0.59,
- 1.71,
- 4.6,
- 0.0,
- 280.0,
- ],
- ],
- columns=[
- "actual",
- "nominal",
- "X0",
- "X1",
- "X2",
- "X3",
- "X4",
- "X5",
- "X6",
- "X7",
- "X8",
- ],
- )
-
- if not keep_date_col:
- expected = expected.drop(["X1", "X2", "X3"], axis=1)
-
- # Python can sometimes be flaky about how
- # the aggregated columns are entered, so
- # this standardizes the order.
- result = result[expected.columns]
- tm.assert_frame_equal(result, expected)
-
-
@pytest.mark.parametrize("container", [list, tuple, Index, Series])
@pytest.mark.parametrize("dim", [1, 2])
def test_concat_date_col_fail(container, dim):
@@ -314,141 +127,6 @@ def test_concat_date_col_fail(container, dim):
parsing.concat_date_cols(date_cols)
-@pytest.mark.parametrize("keep_date_col", [True, False])
-def test_multiple_date_col(all_parsers, keep_date_col, request):
- data = """\
-KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
-KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
-KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
-KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
-KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
-KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
-"""
- parser = all_parsers
-
- if keep_date_col and parser.engine == "pyarrow":
- # For this to pass, we need to disable auto-inference on the date columns
- # in parse_dates. We have no way of doing this though
- mark = pytest.mark.xfail(
- reason="pyarrow doesn't support disabling auto-inference on column numbers."
- )
- request.applymarker(mark)
-
- depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated"
-
- kwds = {
- "header": None,
- "parse_dates": [[1, 2], [1, 3]],
- "keep_date_col": keep_date_col,
- "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"],
- }
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- result = parser.read_csv(StringIO(data), **kwds)
-
- expected = DataFrame(
- [
- [
- datetime(1999, 1, 27, 19, 0),
- datetime(1999, 1, 27, 18, 56),
- "KORD",
- "19990127",
- " 19:00:00",
- " 18:56:00",
- 0.81,
- 2.81,
- 7.2,
- 0.0,
- 280.0,
- ],
- [
- datetime(1999, 1, 27, 20, 0),
- datetime(1999, 1, 27, 19, 56),
- "KORD",
- "19990127",
- " 20:00:00",
- " 19:56:00",
- 0.01,
- 2.21,
- 7.2,
- 0.0,
- 260.0,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- datetime(1999, 1, 27, 20, 56),
- "KORD",
- "19990127",
- " 21:00:00",
- " 20:56:00",
- -0.59,
- 2.21,
- 5.7,
- 0.0,
- 280.0,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- datetime(1999, 1, 27, 21, 18),
- "KORD",
- "19990127",
- " 21:00:00",
- " 21:18:00",
- -0.99,
- 2.01,
- 3.6,
- 0.0,
- 270.0,
- ],
- [
- datetime(1999, 1, 27, 22, 0),
- datetime(1999, 1, 27, 21, 56),
- "KORD",
- "19990127",
- " 22:00:00",
- " 21:56:00",
- -0.59,
- 1.71,
- 5.1,
- 0.0,
- 290.0,
- ],
- [
- datetime(1999, 1, 27, 23, 0),
- datetime(1999, 1, 27, 22, 56),
- "KORD",
- "19990127",
- " 23:00:00",
- " 22:56:00",
- -0.59,
- 1.71,
- 4.6,
- 0.0,
- 280.0,
- ],
- ],
- columns=[
- "X1_X2",
- "X1_X3",
- "X0",
- "X1",
- "X2",
- "X3",
- "X4",
- "X5",
- "X6",
- "X7",
- "X8",
- ],
- )
-
- if not keep_date_col:
- expected = expected.drop(["X1", "X2", "X3"], axis=1)
-
- tm.assert_frame_equal(result, expected)
-
-
def test_date_col_as_index_col(all_parsers):
data = """\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
@@ -495,240 +173,6 @@ def test_date_col_as_index_col(all_parsers):
tm.assert_frame_equal(result, expected)
-def test_multiple_date_cols_int_cast(all_parsers):
- data = (
- "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
- "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
- "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
- "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
- "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
- "KORD,19990127, 23:00:00, 22:56:00, -0.5900"
- )
- parse_dates = {"actual": [1, 2], "nominal": [1, 3]}
- parser = all_parsers
-
- kwds = {
- "header": None,
- "parse_dates": parse_dates,
- "date_parser": pd.to_datetime,
- }
- result = parser.read_csv_check_warnings(
- FutureWarning,
- "use 'date_format' instead",
- StringIO(data),
- **kwds,
- raise_on_extra_warnings=False,
- )
-
- expected = DataFrame(
- [
- [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81],
- [datetime(1999, 1, 27, 20, 0), datetime(1999, 1, 27, 19, 56), "KORD", 0.01],
- [
- datetime(1999, 1, 27, 21, 0),
- datetime(1999, 1, 27, 20, 56),
- "KORD",
- -0.59,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- datetime(1999, 1, 27, 21, 18),
- "KORD",
- -0.99,
- ],
- [
- datetime(1999, 1, 27, 22, 0),
- datetime(1999, 1, 27, 21, 56),
- "KORD",
- -0.59,
- ],
- [
- datetime(1999, 1, 27, 23, 0),
- datetime(1999, 1, 27, 22, 56),
- "KORD",
- -0.59,
- ],
- ],
- columns=["actual", "nominal", 0, 4],
- )
-
- # Python can sometimes be flaky about how
- # the aggregated columns are entered, so
- # this standardizes the order.
- result = result[expected.columns]
- tm.assert_frame_equal(result, expected)
-
-
-def test_multiple_date_col_timestamp_parse(all_parsers):
- parser = all_parsers
- data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
-05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""
-
- result = parser.read_csv_check_warnings(
- FutureWarning,
- "use 'date_format' instead",
- StringIO(data),
- parse_dates=[[0, 1]],
- header=None,
- date_parser=Timestamp,
- raise_on_extra_warnings=False,
- )
- expected = DataFrame(
- [
- [
- Timestamp("05/31/2012, 15:30:00.029"),
- 1306.25,
- 1,
- "E",
- 0,
- np.nan,
- 1306.25,
- ],
- [
- Timestamp("05/31/2012, 15:30:00.029"),
- 1306.25,
- 8,
- "E",
- 0,
- np.nan,
- 1306.25,
- ],
- ],
- columns=["0_1", 2, 3, 4, 5, 6, 7],
- )
- tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_multiple_date_cols_with_header(all_parsers):
- parser = all_parsers
- data = """\
-ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
-KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
-KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
-KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
-KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
-KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
-KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""
-
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]})
- expected = DataFrame(
- [
- [
- datetime(1999, 1, 27, 19, 0),
- "KORD",
- " 18:56:00",
- 0.81,
- 2.81,
- 7.2,
- 0.0,
- 280.0,
- ],
- [
- datetime(1999, 1, 27, 20, 0),
- "KORD",
- " 19:56:00",
- 0.01,
- 2.21,
- 7.2,
- 0.0,
- 260.0,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- "KORD",
- " 20:56:00",
- -0.59,
- 2.21,
- 5.7,
- 0.0,
- 280.0,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- "KORD",
- " 21:18:00",
- -0.99,
- 2.01,
- 3.6,
- 0.0,
- 270.0,
- ],
- [
- datetime(1999, 1, 27, 22, 0),
- "KORD",
- " 21:56:00",
- -0.59,
- 1.71,
- 5.1,
- 0.0,
- 290.0,
- ],
- [
- datetime(1999, 1, 27, 23, 0),
- "KORD",
- " 22:56:00",
- -0.59,
- 1.71,
- 4.6,
- 0.0,
- 280.0,
- ],
- ],
- columns=[
- "nominal",
- "ID",
- "ActualTime",
- "TDew",
- "TAir",
- "Windspeed",
- "Precip",
- "WindDir",
- ],
- )
- tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize(
- "data,parse_dates,msg",
- [
- (
- """\
-date_NominalTime,date,NominalTime
-KORD1,19990127, 19:00:00
-KORD2,19990127, 20:00:00""",
- [[1, 2]],
- ("New date column already in dict date_NominalTime"),
- ),
- (
- """\
-ID,date,nominalTime
-KORD,19990127, 19:00:00
-KORD,19990127, 20:00:00""",
- {"ID": [1, 2]},
- "Date column ID already in dict",
- ),
- ],
-)
-def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg):
- parser = all_parsers
-
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- with pytest.raises(ValueError, match=msg):
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- parser.read_csv(StringIO(data), parse_dates=parse_dates)
-
-
def test_date_parser_int_bug(all_parsers):
# see gh-3071
parser = all_parsers
@@ -859,37 +303,6 @@ def test_parse_dates_string(all_parsers):
tm.assert_frame_equal(result, expected)
-# Bug in https://github.com/dateutil/dateutil/issues/217
-# has been addressed, but we just don't pass in the `yearfirst`
-@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*")
-@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]])
-def test_yy_format_with_year_first(all_parsers, parse_dates):
- data = """date,time,B,C
-090131,0010,1,2
-090228,1020,3,4
-090331,0830,5,6
-"""
- parser = all_parsers
- result = parser.read_csv_check_warnings(
- UserWarning,
- "Could not infer format",
- StringIO(data),
- index_col=0,
- parse_dates=parse_dates,
- )
- index = DatetimeIndex(
- [
- datetime(2009, 1, 31, 0, 10, 0),
- datetime(2009, 2, 28, 10, 20, 0),
- datetime(2009, 3, 31, 8, 30, 0),
- ],
- dtype=object,
- name="date_time",
- )
- expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index)
- tm.assert_frame_equal(result, expected)
-
-
@xfail_pyarrow
@pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]])
def test_parse_dates_column_list(all_parsers, parse_dates):
@@ -1026,282 +439,11 @@ def test_parse_tz_aware(all_parsers):
assert result.index.tz is expected_tz
-@xfail_pyarrow
-@pytest.mark.parametrize(
- "parse_dates,index_col",
- [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)],
-)
-def test_multiple_date_cols_index(all_parsers, parse_dates, index_col):
- parser = all_parsers
- data = """
-ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
-KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
-KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
-KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
-KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
-KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
-KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
-"""
- expected = DataFrame(
- [
- [
- datetime(1999, 1, 27, 19, 0),
- "KORD1",
- " 18:56:00",
- 0.81,
- 2.81,
- 7.2,
- 0.0,
- 280.0,
- ],
- [
- datetime(1999, 1, 27, 20, 0),
- "KORD2",
- " 19:56:00",
- 0.01,
- 2.21,
- 7.2,
- 0.0,
- 260.0,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- "KORD3",
- " 20:56:00",
- -0.59,
- 2.21,
- 5.7,
- 0.0,
- 280.0,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- "KORD4",
- " 21:18:00",
- -0.99,
- 2.01,
- 3.6,
- 0.0,
- 270.0,
- ],
- [
- datetime(1999, 1, 27, 22, 0),
- "KORD5",
- " 21:56:00",
- -0.59,
- 1.71,
- 5.1,
- 0.0,
- 290.0,
- ],
- [
- datetime(1999, 1, 27, 23, 0),
- "KORD6",
- " 22:56:00",
- -0.59,
- 1.71,
- 4.6,
- 0.0,
- 280.0,
- ],
- ],
- columns=[
- "nominal",
- "ID",
- "ActualTime",
- "TDew",
- "TAir",
- "Windspeed",
- "Precip",
- "WindDir",
- ],
- )
- expected = expected.set_index("nominal")
-
- if not isinstance(parse_dates, dict):
- expected.index.name = "date_NominalTime"
-
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- result = parser.read_csv(
- StringIO(data), parse_dates=parse_dates, index_col=index_col
- )
- tm.assert_frame_equal(result, expected)
-
-
-@xfail_pyarrow
-def test_multiple_date_cols_chunked(all_parsers):
- parser = all_parsers
- data = """\
-ID,date,nominalTime,actualTime,A,B,C,D,E
-KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
-KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
-KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
-KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
-KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
-KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
-"""
-
- expected = DataFrame(
- [
- [
- datetime(1999, 1, 27, 19, 0),
- "KORD",
- " 18:56:00",
- 0.81,
- 2.81,
- 7.2,
- 0.0,
- 280.0,
- ],
- [
- datetime(1999, 1, 27, 20, 0),
- "KORD",
- " 19:56:00",
- 0.01,
- 2.21,
- 7.2,
- 0.0,
- 260.0,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- "KORD",
- " 20:56:00",
- -0.59,
- 2.21,
- 5.7,
- 0.0,
- 280.0,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- "KORD",
- " 21:18:00",
- -0.99,
- 2.01,
- 3.6,
- 0.0,
- 270.0,
- ],
- [
- datetime(1999, 1, 27, 22, 0),
- "KORD",
- " 21:56:00",
- -0.59,
- 1.71,
- 5.1,
- 0.0,
- 290.0,
- ],
- [
- datetime(1999, 1, 27, 23, 0),
- "KORD",
- " 22:56:00",
- -0.59,
- 1.71,
- 4.6,
- 0.0,
- 280.0,
- ],
- ],
- columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"],
- )
- expected = expected.set_index("nominal")
-
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- with parser.read_csv(
- StringIO(data),
- parse_dates={"nominal": [1, 2]},
- index_col="nominal",
- chunksize=2,
- ) as reader:
- chunks = list(reader)
-
- tm.assert_frame_equal(chunks[0], expected[:2])
- tm.assert_frame_equal(chunks[1], expected[2:4])
- tm.assert_frame_equal(chunks[2], expected[4:])
-
-
-def test_multiple_date_col_named_index_compat(all_parsers):
- parser = all_parsers
- data = """\
-ID,date,nominalTime,actualTime,A,B,C,D,E
-KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
-KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
-KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
-KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
-KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
-KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
-"""
-
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- with_indices = parser.read_csv(
- StringIO(data), parse_dates={"nominal": [1, 2]}, index_col="nominal"
- )
-
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- with_names = parser.read_csv(
- StringIO(data),
- index_col="nominal",
- parse_dates={"nominal": ["date", "nominalTime"]},
- )
- tm.assert_frame_equal(with_indices, with_names)
-
-
-def test_multiple_date_col_multiple_index_compat(all_parsers):
- parser = all_parsers
- data = """\
-ID,date,nominalTime,actualTime,A,B,C,D,E
-KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
-KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
-KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
-KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
-KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
-KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
-"""
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- result = parser.read_csv(
- StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]}
- )
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]})
-
- expected = expected.set_index(["nominal", "ID"])
- tm.assert_frame_equal(result, expected)
-
-
@pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}])
def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs):
# see gh-5636
parser = all_parsers
- msg = (
- "Only booleans, lists, and dictionaries "
- "are accepted for the 'parse_dates' parameter"
- )
+ msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter"
data = """A,B,C
1,2,2003-11-1"""
@@ -1312,15 +454,12 @@ def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs):
@pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}])
def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates):
parser = all_parsers
- msg = (
- "Only booleans, lists, and dictionaries "
- "are accepted for the 'parse_dates' parameter"
- )
+ msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter"
data = """A,B,C
1,2,2003-11-1"""
with pytest.raises(TypeError, match=msg):
- parser.read_csv(StringIO(data), parse_dates=(1,))
+ parser.read_csv(StringIO(data), parse_dates=parse_dates)
@pytest.mark.parametrize("value", ["nan", ""])
@@ -1463,240 +602,6 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)
-@xfail_pyarrow
-def test_parse_date_time_multi_level_column_name(all_parsers):
- data = """\
-D,T,A,B
-date, time,a,b
-2001-01-05, 09:00:00, 0.0, 10.
-2001-01-06, 00:00:00, 1.0, 11.
-"""
- parser = all_parsers
- result = parser.read_csv_check_warnings(
- FutureWarning,
- "use 'date_format' instead",
- StringIO(data),
- header=[0, 1],
- parse_dates={"date_time": [0, 1]},
- date_parser=pd.to_datetime,
- )
-
- expected_data = [
- [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0],
- [datetime(2001, 1, 6, 0, 0, 0), 1.0, 11.0],
- ]
- expected = DataFrame(expected_data, columns=["date_time", ("A", "a"), ("B", "b")])
- tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize(
- "data,kwargs,expected",
- [
- (
- """\
-date,time,a,b
-2001-01-05, 10:00:00, 0.0, 10.
-2001-01-05, 00:00:00, 1., 11.
-""",
- {"header": 0, "parse_dates": {"date_time": [0, 1]}},
- DataFrame(
- [
- [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10],
- [datetime(2001, 1, 5, 0, 0, 0), 1.0, 11.0],
- ],
- columns=["date_time", "a", "b"],
- ),
- ),
- (
- (
- "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
- "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
- "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
- "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
- "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
- "KORD,19990127, 23:00:00, 22:56:00, -0.5900"
- ),
- {"header": None, "parse_dates": {"actual": [1, 2], "nominal": [1, 3]}},
- DataFrame(
- [
- [
- datetime(1999, 1, 27, 19, 0),
- datetime(1999, 1, 27, 18, 56),
- "KORD",
- 0.81,
- ],
- [
- datetime(1999, 1, 27, 20, 0),
- datetime(1999, 1, 27, 19, 56),
- "KORD",
- 0.01,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- datetime(1999, 1, 27, 20, 56),
- "KORD",
- -0.59,
- ],
- [
- datetime(1999, 1, 27, 21, 0),
- datetime(1999, 1, 27, 21, 18),
- "KORD",
- -0.99,
- ],
- [
- datetime(1999, 1, 27, 22, 0),
- datetime(1999, 1, 27, 21, 56),
- "KORD",
- -0.59,
- ],
- [
- datetime(1999, 1, 27, 23, 0),
- datetime(1999, 1, 27, 22, 56),
- "KORD",
- -0.59,
- ],
- ],
- columns=["actual", "nominal", 0, 4],
- ),
- ),
- ],
-)
-def test_parse_date_time(all_parsers, data, kwargs, expected):
- parser = all_parsers
- result = parser.read_csv_check_warnings(
- FutureWarning,
- "use 'date_format' instead",
- StringIO(data),
- date_parser=pd.to_datetime,
- **kwargs,
- raise_on_extra_warnings=False,
- )
-
- # Python can sometimes be flaky about how
- # the aggregated columns are entered, so
- # this standardizes the order.
- result = result[expected.columns]
- tm.assert_frame_equal(result, expected)
-
-
-def test_parse_date_fields(all_parsers):
- parser = all_parsers
- data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
- result = parser.read_csv_check_warnings(
- FutureWarning,
- "use 'date_format' instead",
- StringIO(data),
- header=0,
- parse_dates={"ymd": [0, 1, 2]},
- date_parser=lambda x: x,
- raise_on_extra_warnings=False,
- )
-
- expected = DataFrame(
- [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]],
- columns=["ymd", "a"],
- )
- tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize(
- ("key", "value", "warn"),
- [
- (
- "date_parser",
- lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"),
- FutureWarning,
- ),
- ("date_format", "%Y %m %d %H %M %S", None),
- ],
-)
-def test_parse_date_all_fields(all_parsers, key, value, warn):
- parser = all_parsers
- data = """\
-year,month,day,hour,minute,second,a,b
-2001,01,05,10,00,0,0.0,10.
-2001,01,5,10,0,00,1.,11.
-"""
- result = parser.read_csv_check_warnings(
- warn,
- "use 'date_format' instead",
- StringIO(data),
- header=0,
- parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
- **{key: value},
- raise_on_extra_warnings=False,
- )
- expected = DataFrame(
- [
- [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0],
- [datetime(2001, 1, 5, 10, 0, 0), 1.0, 11.0],
- ],
- columns=["ymdHMS", "a", "b"],
- )
- tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize(
- ("key", "value", "warn"),
- [
- (
- "date_parser",
- lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"),
- FutureWarning,
- ),
- ("date_format", "%Y %m %d %H %M %S.%f", None),
- ],
-)
-def test_datetime_fractional_seconds(all_parsers, key, value, warn):
- parser = all_parsers
- data = """\
-year,month,day,hour,minute,second,a,b
-2001,01,05,10,00,0.123456,0.0,10.
-2001,01,5,10,0,0.500000,1.,11.
-"""
- result = parser.read_csv_check_warnings(
- warn,
- "use 'date_format' instead",
- StringIO(data),
- header=0,
- parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
- **{key: value},
- raise_on_extra_warnings=False,
- )
- expected = DataFrame(
- [
- [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0],
- [datetime(2001, 1, 5, 10, 0, 0, microsecond=500000), 1.0, 11.0],
- ],
- columns=["ymdHMS", "a", "b"],
- )
- tm.assert_frame_equal(result, expected)
-
-
-def test_generic(all_parsers):
- parser = all_parsers
- data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
-
- def parse_function(yy, mm):
- return [date(year=int(y), month=int(m), day=1) for y, m in zip(yy, mm)]
-
- result = parser.read_csv_check_warnings(
- FutureWarning,
- "use 'date_format' instead",
- StringIO(data),
- header=0,
- parse_dates={"ym": [0, 1]},
- date_parser=parse_function,
- raise_on_extra_warnings=False,
- )
- expected = DataFrame(
- [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]],
- columns=["ym", "day", "a"],
- )
- expected["ym"] = expected["ym"].astype("datetime64[ns]")
- tm.assert_frame_equal(result, expected)
-
-
@xfail_pyarrow
def test_date_parser_resolution_if_not_ns(all_parsers):
# see gh-10245
@@ -1895,11 +800,6 @@ def test_parse_multiple_delimited_dates_with_swap_warnings():
[
(None, ["val"], ["date", "time"], "date, time"),
(None, ["val"], [0, "time"], "time"),
- (None, ["val"], [["date", "time"]], "date, time"),
- (None, ["val"], [[0, "time"]], "time"),
- (None, ["val"], {"date": [0, "time"]}, "time"),
- (None, ["val"], {"date": ["date", "time"]}, "date, time"),
- (None, ["val"], [["date", "time"], "date"], "date, time"),
(["date1", "time1", "temperature"], None, ["date", "time"], "date, time"),
(
["date1", "time1", "temperature"],
@@ -1917,20 +817,10 @@ def test_missing_parse_dates_column_raises(
content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n")
msg = f"Missing column provided to 'parse_dates': '{missing_cols}'"
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- warn = FutureWarning
- if isinstance(parse_dates, list) and all(
- isinstance(x, (int, str)) for x in parse_dates
- ):
- warn = None
-
with pytest.raises(ValueError, match=msg):
- with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
- parser.read_csv(
- content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates
- )
+ parser.read_csv(
+ content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates
+ )
@xfail_pyarrow # mismatched shape
@@ -1966,37 +856,6 @@ def test_date_parser_multiindex_columns(all_parsers):
tm.assert_frame_equal(result, expected)
-@xfail_pyarrow # TypeError: an integer is required
-@pytest.mark.parametrize(
- "parse_spec, col_name",
- [
- ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")),
- ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")),
- ],
-)
-def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name):
- parser = all_parsers
- data = """a,b,c
-1,2,3
-2019-12,-31,6"""
-
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- result = parser.read_csv(
- StringIO(data),
- parse_dates=parse_spec,
- header=[0, 1],
- )
- expected = DataFrame(
- {col_name: Timestamp("2019-12-31").as_unit("ns"), ("c", "3"): [6]}
- )
- tm.assert_frame_equal(result, expected)
-
-
def test_date_parser_usecols_thousands(all_parsers):
# GH#39365
data = """A,B,C
@@ -2030,26 +889,6 @@ def test_date_parser_usecols_thousands(all_parsers):
tm.assert_frame_equal(result, expected)
-@xfail_pyarrow # mismatched shape
-def test_parse_dates_and_keep_original_column(all_parsers):
- # GH#13378
- parser = all_parsers
- data = """A
-20150908
-20150909
-"""
- depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated"
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- result = parser.read_csv(
- StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True
- )
- expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")]
- expected = DataFrame({"date": expected_data, "A": expected_data})
- tm.assert_frame_equal(result, expected)
-
-
def test_dayfirst_warnings():
# GH 12585
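
The removed index tests (``test_multiple_date_cols_index`` and friends)
covered ``parse_dates={"nominal": [1, 2]}`` together with ``index_col``; the
post-patch equivalent builds the index outside the reader. A sketch using the
tests' column layout (the format string is an assumption):

    import io

    import pandas as pd

    data = (
        "ID,date,NominalTime,val\n"
        "KORD1,19990127, 19:00:00,0.81\n"
        "KORD2,19990127, 20:00:00,0.01\n"
    )

    df = pd.read_csv(io.StringIO(data))
    nominal = pd.to_datetime(
        df["date"].astype(str) + df["NominalTime"], format="%Y%m%d %H:%M:%S"
    )
    result = df.drop(columns=["date", "NominalTime"]).set_index(
        nominal.rename("nominal")
    )
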
From 9b53badafab3a36ef2fe24924c56607cd3729d5a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 7 May 2024 14:25:36 -0700
Subject: [PATCH 02/12] Add whatsnew, address other tests
---
doc/source/whatsnew/v3.0.0.rst | 2 +
pandas/io/parsers/base_parser.py | 2 +-
pandas/io/parsers/c_parser_wrapper.py | 44 ++++---
.../io/parser/common/test_common_basic.py | 51 ---------
pandas/tests/io/parser/test_parse_dates.py | 66 -----------
.../io/parser/usecols/test_parse_dates.py | 108 ------------------
6 files changed, 23 insertions(+), 250 deletions(-)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 90923bfac8e62..01d8e34964c33 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -254,7 +254,9 @@ Removal of prior version deprecations/changes
- Enforced deprecation of :meth:`offsets.Tick.delta`, use ``pd.Timedelta(obj)`` instead (:issue:`55498`)
- Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. ``numpy.sum(df)`` (:issue:`21597`)
- Enforced deprecation of ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock`` (:issue:`58467`)
+- Enforced deprecation of ``keep_date_col`` keyword in :func:`read_csv` (:issue:`55569`)
- Enforced deprecation of ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead. (:issue:`52550`)
+- Enforced deprecation of combining parsed datetime columns in :func:`read_csv` via the ``parse_dates`` keyword (:issue:`55569`)
- Enforced deprecation of non-standard (``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series`) argument to :func:`api.extensions.take` (:issue:`52981`)
- Enforced deprecation of parsing system timezone strings to ``tzlocal``, which depended on system timezone, pass the 'tz' keyword instead (:issue:`50791`)
- Enforced deprecation of passing a dictionary to :meth:`SeriesGroupBy.agg` (:issue:`52268`)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 962360a29aecb..c442fceec84da 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -122,7 +122,7 @@ def __init__(self, kwds) -> None:
self.col_names: Sequence[Hashable] | None = None
parse_dates = kwds.pop("parse_dates", False)
- if isinstance(parse_dates, None) or lib.is_bool(parse_dates):
+ if parse_dates is None or lib.is_bool(parse_dates):
parse_dates = bool(parse_dates)
elif not isinstance(parse_dates, list):
raise TypeError(
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index 1baca9d48d795..0740978bbbc3c 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -166,29 +166,28 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
# error: Cannot determine type of 'names'
self.orig_names = self.names # type: ignore[has-type]
- if not self._has_complex_date_col:
- # error: Cannot determine type of 'index_col'
- if self._reader.leading_cols == 0 and is_index_col(
- self.index_col # type: ignore[has-type]
- ):
- (
- index_names,
- # error: Cannot determine type of 'names'
- self.names, # type: ignore[has-type]
- self.index_col,
- ) = self._clean_index_names(
- # error: Cannot determine type of 'names'
- self.names, # type: ignore[has-type]
- # error: Cannot determine type of 'index_col'
- self.index_col, # type: ignore[has-type]
- )
+ # error: Cannot determine type of 'index_col'
+ if self._reader.leading_cols == 0 and is_index_col(
+ self.index_col # type: ignore[has-type]
+ ):
+ (
+ index_names,
+ # error: Cannot determine type of 'names'
+ self.names, # type: ignore[has-type]
+ self.index_col,
+ ) = self._clean_index_names(
+ # error: Cannot determine type of 'names'
+ self.names, # type: ignore[has-type]
+ # error: Cannot determine type of 'index_col'
+ self.index_col, # type: ignore[has-type]
+ )
- if self.index_names is None:
- self.index_names = index_names
+ if self.index_names is None:
+ self.index_names = index_names
- if self._reader.header is None and not passed_names:
- assert self.index_names is not None
- self.index_names = [None] * len(self.index_names)
+ if self._reader.header is None and not passed_names:
+ assert self.index_names is not None
+ self.index_names = [None] * len(self.index_names)
self._implicit_index = self._reader.leading_cols > 0
@@ -273,9 +272,6 @@ def read(
names = self.names # type: ignore[has-type]
if self._reader.leading_cols:
- if self._has_complex_date_col:
- raise NotImplementedError("file structure not yet supported")
-
# implicit index, no index names
arrays = []
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index 485680d9de48c..d79e0c34edaab 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -22,14 +22,10 @@
from pandas import (
DataFrame,
Index,
- Timestamp,
compat,
)
import pandas._testing as tm
-from pandas.io.parsers import TextFileReader
-from pandas.io.parsers.c_parser_wrapper import CParserWrapper
-
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
@@ -38,53 +34,6 @@
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
-def test_override_set_noconvert_columns():
- # see gh-17351
- #
- # Usecols needs to be sorted in _set_noconvert_columns based
- # on the test_usecols_with_parse_dates test from test_usecols.py
- class MyTextFileReader(TextFileReader):
- def __init__(self) -> None:
- self._currow = 0
- self.squeeze = False
-
- class MyCParserWrapper(CParserWrapper):
- def _set_noconvert_columns(self):
- if self.usecols_dtype == "integer":
- # self.usecols is a set, which is documented as unordered
- # but in practice, a CPython set of integers is sorted.
- # In other implementations this assumption does not hold.
- # The following code simulates a different order, which
- # before GH 17351 would cause the wrong columns to be
- # converted via the parse_dates parameter
- self.usecols = list(self.usecols)
- self.usecols.reverse()
- return CParserWrapper._set_noconvert_columns(self)
-
- data = """a,b,c,d,e
-0,1,2014-01-01,09:00,4
-0,1,2014-01-02,10:00,4"""
-
- parse_dates = [[1, 2]]
- cols = {
- "a": [0, 0],
- "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
- }
- expected = DataFrame(cols, columns=["c_d", "a"])
-
- parser = MyTextFileReader()
- parser.options = {
- "usecols": [0, 2, 3],
- "parse_dates": parse_dates,
- "delimiter": ",",
- }
- parser.engine = "c"
- parser._engine = MyCParserWrapper(StringIO(data), **parser.options)
-
- result = parser.read()
- tm.assert_frame_equal(result, expected)
-
-
def test_read_csv_local(all_parsers, csv1):
prefix = "file:///" if compat.is_platform_windows() else "file://"
parser = all_parsers
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 96ff06ceafa3b..37d20b8b52d68 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -602,45 +602,6 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)
-@xfail_pyarrow
-def test_date_parser_resolution_if_not_ns(all_parsers):
- # see gh-10245
- parser = all_parsers
- data = """\
-date,time,prn,rxstatus
-2013-11-03,19:00:00,126,00E80000
-2013-11-03,19:00:00,23,00E80000
-2013-11-03,19:00:00,13,00E80000
-"""
-
- def date_parser(dt, time):
- try:
- arr = dt + "T" + time
- except TypeError:
- # dt & time are date/time objects
- arr = [datetime.combine(d, t) for d, t in zip(dt, time)]
- return np.array(arr, dtype="datetime64[s]")
-
- result = parser.read_csv_check_warnings(
- FutureWarning,
- "use 'date_format' instead",
- StringIO(data),
- date_parser=date_parser,
- parse_dates={"datetime": ["date", "time"]},
- index_col=["datetime", "prn"],
- )
-
- datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]")
- expected = DataFrame(
- data={"rxstatus": ["00E80000"] * 3},
- index=MultiIndex.from_arrays(
- [datetimes, [126, 23, 13]],
- names=["datetime", "prn"],
- ),
- )
- tm.assert_frame_equal(result, expected)
-
-
def test_parse_date_column_with_empty_string(all_parsers):
# see gh-6428
parser = all_parsers
@@ -1092,33 +1053,6 @@ def test_parse_dates_dict_format(all_parsers):
tm.assert_frame_equal(result, expected)
-@pytest.mark.parametrize(
- "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})]
-)
-def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates):
- # GH#51240
- parser = all_parsers
- data = """a,b
-31-,12-2019
-31-,12-2020"""
-
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- result = parser.read_csv(
- StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates
- )
- expected = DataFrame(
- {
- key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")],
- }
- )
- tm.assert_frame_equal(result, expected)
-
-
@xfail_pyarrow # object dtype index
def test_parse_dates_dict_format_index(all_parsers):
# GH#51240
diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py
index ab98857e0c178..0cf3fe894c916 100644
--- a/pandas/tests/io/parser/usecols/test_parse_dates.py
+++ b/pandas/tests/io/parser/usecols/test_parse_dates.py
@@ -26,42 +26,6 @@
)
-@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
-def test_usecols_with_parse_dates(all_parsers, usecols):
- # see gh-9755
- data = """a,b,c,d,e
-0,1,2014-01-01,09:00,4
-0,1,2014-01-02,10:00,4"""
- parser = all_parsers
- parse_dates = [[1, 2]]
-
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
-
- cols = {
- "a": [0, 0],
- "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
- }
- expected = DataFrame(cols, columns=["c_d", "a"])
- if parser.engine == "pyarrow":
- with pytest.raises(ValueError, match=_msg_pyarrow_requires_names):
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- parser.read_csv(
- StringIO(data), usecols=usecols, parse_dates=parse_dates
- )
- return
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- result = parser.read_csv(
- StringIO(data), usecols=usecols, parse_dates=parse_dates
- )
- tm.assert_frame_equal(result, expected)
-
-
@skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns
def test_usecols_with_parse_dates2(all_parsers):
# see gh-13604
@@ -121,75 +85,3 @@ def test_usecols_with_parse_dates3(all_parsers):
result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates)
tm.assert_frame_equal(result, expected)
-
-
-def test_usecols_with_parse_dates4(all_parsers):
- data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
- usecols = list("abcdefghij")
- parse_dates = [[0, 1]]
- parser = all_parsers
-
- cols = {
- "a_b": "2016/09/21 1",
- "c": [1],
- "d": [2],
- "e": [3],
- "f": [4],
- "g": [5],
- "h": [6],
- "i": [7],
- "j": [8],
- }
- expected = DataFrame(cols, columns=["a_b"] + list("cdefghij"))
-
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- result = parser.read_csv(
- StringIO(data),
- usecols=usecols,
- parse_dates=parse_dates,
- )
- tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
-@pytest.mark.parametrize(
- "names",
- [
- list("abcde"), # Names span all columns in original data.
- list("acd"), # Names span only the selected columns.
- ],
-)
-def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request):
- # see gh-9755
- s = """0,1,2014-01-01,09:00,4
-0,1,2014-01-02,10:00,4"""
- parse_dates = [[1, 2]]
- parser = all_parsers
-
- if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0):
- mark = pytest.mark.xfail(
- reason="Length mismatch in some cases, UserWarning in other"
- )
- request.applymarker(mark)
-
- cols = {
- "a": [0, 0],
- "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
- }
- expected = DataFrame(cols, columns=["c_d", "a"])
-
- depr_msg = (
- "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
- )
- with tm.assert_produces_warning(
- FutureWarning, match=depr_msg, check_stacklevel=False
- ):
- result = parser.read_csv(
- StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
- )
- tm.assert_frame_equal(result, expected)
From 1d7d7131c4f1f799f7b1f3dec15cba692a0c4efa Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 7 May 2024 14:34:26 -0700
Subject: [PATCH 03/12] Remove unnecessary reference
---
pandas/io/parsers/base_parser.py | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index c442fceec84da..bc886cad05b94 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -1212,11 +1212,9 @@ def _isindex(colspec):
isinstance(index_names, list) and colspec in index_names
)
- orig_names = columns
-
for colspec in parse_spec:
if isinstance(colspec, int) and colspec not in data_dict:
- colspec = orig_names[colspec]
+ colspec = columns[colspec]
if _isindex(colspec):
continue
elif dtype_backend == "pyarrow":
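The loop above resolves integer entries in ``parse_spec`` positionally against ``columns``; spelled out as a toy illustration (the names are invented):

    columns = ["a", "b", "c"]
    parse_spec = [2, "a"]
    resolved = [columns[spec] if isinstance(spec, int) else spec for spec in parse_spec]
    assert resolved == ["c", "a"]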
From def098b8713935110e79eb938424f59459dc98ed Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 7 May 2024 14:35:20 -0700
Subject: [PATCH 04/12] inline function
---
pandas/io/parsers/base_parser.py | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index bc886cad05b94..284762227bc05 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -1207,15 +1207,12 @@ def _process_date_conversion(
columns,
dtype_backend=lib.no_default,
) -> tuple[dict, list]:
- def _isindex(colspec):
- return (isinstance(index_col, list) and colspec in index_col) or (
- isinstance(index_names, list) and colspec in index_names
- )
-
for colspec in parse_spec:
if isinstance(colspec, int) and colspec not in data_dict:
colspec = columns[colspec]
- if _isindex(colspec):
+ if (isinstance(index_col, list) and colspec in index_col) or (
+ isinstance(index_names, list) and colspec in index_names
+ ):
continue
elif dtype_backend == "pyarrow":
import pyarrow as pa
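The inlined condition asks a single question: is this column already claimed as an index (by position or by name) and therefore exempt from date conversion? Standalone, with invented values:

    index_col = ["dt"]
    index_names = None
    colspec = "dt"
    skip = (isinstance(index_col, list) and colspec in index_col) or (
        isinstance(index_names, list) and colspec in index_names
    )
    assert skip  # index columns are skipped by the conversion loop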
From ec963a2ee4ea698811ae4304a314ba48ab8623cb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 7 May 2024 14:47:15 -0700
Subject: [PATCH 05/12] Remove os.remove
---
doc/source/user_guide/io.rst | 6 ------
1 file changed, 6 deletions(-)
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 5b91b1c048f39..04307c92a4d45 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -847,12 +847,6 @@ Performance-wise, you should try these methods of parsing dates in order:
then use ``to_datetime``.
-.. ipython:: python
- :suppress:
-
- os.remove("tmp.csv")
-
-
.. _io.csv.mixed_timezones:
Parsing a CSV with mixed timezones
From 26d14502369c5b2bbf243d9b9c4fd1b0caf6552d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 7 May 2024 15:20:08 -0700
Subject: [PATCH 06/12] Address html and xml tests
---
pandas/tests/io/test_html.py | 14 -------
pandas/tests/io/xml/test_xml_dtypes.py | 54 +-------------------------
2 files changed, 1 insertion(+), 67 deletions(-)
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index f16f3a2a5c775..594c1d02b94cc 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -1050,20 +1050,6 @@ def test_parse_dates_list(self, flavor_read_html):
res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0)
tm.assert_frame_equal(df, res[0])
- def test_parse_dates_combine(self, flavor_read_html):
- raw_dates = Series(date_range("1/1/2001", periods=10))
- df = DataFrame(
- {
- "date": raw_dates.map(lambda x: str(x.date())),
- "time": raw_dates.map(lambda x: str(x.time())),
- }
- )
- res = flavor_read_html(
- StringIO(df.to_html()), parse_dates={"datetime": [1, 2]}, index_col=1
- )
- newdf = DataFrame({"datetime": raw_dates})
- tm.assert_frame_equal(newdf, res[0])
-
def test_wikipedia_states_table(self, datapath, flavor_read_html):
data = datapath("io", "data", "html", "wikipedia_states.html")
assert os.path.isfile(data), f"{data!r} is not a file"
diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py
index a85576ff13f5c..1f290a673c554 100644
--- a/pandas/tests/io/xml/test_xml_dtypes.py
+++ b/pandas/tests/io/xml/test_xml_dtypes.py
@@ -378,58 +378,6 @@ def test_parse_dates_true(parser):
tm.assert_frame_equal(df_iter, df_expected)
-def test_parse_dates_dictionary(parser):
-    xml = """<?xml version='1.0' encoding='utf-8'?>
-<data>
-  <row>
-    <shape>square</shape>
-    <degrees>360</degrees>
-    <sides>4.0</sides>
-    <year>2020</year>
-    <month>12</month>
-    <day>31</day>
-  </row>
-  <row>
-    <shape>circle</shape>
-    <degrees>360</degrees>
-    <sides/>
-    <year>2021</year>
-    <month>12</month>
-    <day>31</day>
-  </row>
-  <row>
-    <shape>triangle</shape>
-    <degrees>180</degrees>
-    <sides>3.0</sides>
-    <year>2022</year>
-    <month>12</month>
-    <day>31</day>
-  </row>
-</data>"""
-
- df_result = read_xml(
- StringIO(xml), parse_dates={"date_end": ["year", "month", "day"]}, parser=parser
- )
- df_iter = read_xml_iterparse(
- xml,
- parser=parser,
- parse_dates={"date_end": ["year", "month", "day"]},
- iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]},
- )
-
- df_expected = DataFrame(
- {
- "date_end": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]),
- "shape": ["square", "circle", "triangle"],
- "degrees": [360, 360, 180],
- "sides": [4.0, float("nan"), 3.0],
- }
- )
-
- tm.assert_frame_equal(df_result, df_expected)
- tm.assert_frame_equal(df_iter, df_expected)
-
-
def test_day_first_parse_dates(parser):
xml = """\
@@ -480,6 +428,6 @@ def test_day_first_parse_dates(parser):
def test_wrong_parse_dates_type(xml_books, parser, iterparse):
with pytest.raises(
- TypeError, match=("Only booleans, lists, and dictionaries are accepted")
+ TypeError, match="Only booleans and lists are accepted are accepted"
):
read_xml(xml_books, parse_dates={"date"}, parser=parser, iterparse=iterparse)
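With the dictionary form removed, ``read_xml`` accepts only booleans and lists for ``parse_dates``. A minimal sketch (the XML document and column names are invented):

    from io import StringIO

    import pandas as pd

    xml = "<data><row><when>2020-12-31</when><v>1</v></row></data>"
    df = pd.read_xml(StringIO(xml), parse_dates=["when"])   # list: accepted
    # pd.read_xml(StringIO(xml), parse_dates={"when"})      # set or dict: TypeError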
From 0fba554afa9ee2d312b33852a76c56f550954327 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 7 May 2024 15:39:01 -0700
Subject: [PATCH 07/12] Typo
---
pandas/tests/io/xml/test_xml_dtypes.py | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py
index 1f290a673c554..96ef50f9d7149 100644
--- a/pandas/tests/io/xml/test_xml_dtypes.py
+++ b/pandas/tests/io/xml/test_xml_dtypes.py
@@ -427,7 +427,5 @@ def test_day_first_parse_dates(parser):
def test_wrong_parse_dates_type(xml_books, parser, iterparse):
- with pytest.raises(
- TypeError, match="Only booleans and lists are accepted are accepted"
- ):
+ with pytest.raises(TypeError, match="Only booleans and lists are accepted"):
read_xml(xml_books, parse_dates={"date"}, parser=parser, iterparse=iterparse)
From a736a348d2f60a3e8d7e073c4154b3fc50948619 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 7 May 2024 19:16:57 -0700
Subject: [PATCH 08/12] Simplify _process_date_conversion
---
pandas/io/parsers/arrow_parser_wrapper.py | 4 ++--
pandas/io/parsers/base_parser.py | 25 ++++++++++++-----------
pandas/io/parsers/c_parser_wrapper.py | 8 +++-----
pandas/io/parsers/python_parser.py | 2 +-
4 files changed, 19 insertions(+), 20 deletions(-)
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index f8263a65ef5c7..8b6f7d5750ffe 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -174,8 +174,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
self.names = list(range(num_cols - len(self.names))) + self.names
multi_index_named = False
frame.columns = self.names
- # we only need the frame not the names
- _, frame = self._do_date_conversions(frame.columns, frame)
+
+ frame = self._do_date_conversions(frame.columns, frame)
if self.index_col is not None:
index_to_set = self.index_col.copy()
for i, item in enumerate(self.index_col):
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 284762227bc05..c217c08adaea7 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -814,25 +814,23 @@ def _do_date_conversions(
self,
names: Index,
data: DataFrame,
- ) -> tuple[Sequence[Hashable] | Index, DataFrame]: ...
+ ) -> DataFrame: ...
@overload
def _do_date_conversions(
self,
names: Sequence[Hashable],
data: Mapping[Hashable, ArrayLike],
- ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]: ...
+ ) -> Mapping[Hashable, ArrayLike]: ...
@final
def _do_date_conversions(
self,
names: Sequence[Hashable] | Index,
data: Mapping[Hashable, ArrayLike] | DataFrame,
- ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
- # returns data, columns
-
+ ) -> Mapping[Hashable, ArrayLike] | DataFrame:
if isinstance(self.parse_dates, list):
- data, names = _process_date_conversion(
+ return _process_date_conversion(
data,
self._date_conv,
self.parse_dates,
@@ -842,7 +840,7 @@ def _do_date_conversions(
dtype_backend=self.dtype_backend,
)
- return names, data
+ return data
@final
def _check_data_length(
@@ -1199,14 +1197,14 @@ def converter(*date_cols, col: Hashable):
def _process_date_conversion(
- data_dict,
+ data_dict: Mapping[Hashable, ArrayLike] | DataFrame,
converter: Callable,
parse_spec: list,
index_col,
index_names,
- columns,
+ columns: Sequence[Hashable] | Index,
dtype_backend=lib.no_default,
-) -> tuple[dict, list]:
+) -> Mapping[Hashable, ArrayLike] | DataFrame:
for colspec in parse_spec:
if isinstance(colspec, int) and colspec not in data_dict:
colspec = columns[colspec]
@@ -1226,9 +1224,12 @@ def _process_date_conversion(
# Pyarrow engine returns Series which we need to convert to
# numpy array before converter, its a no-op for other parsers
- data_dict[colspec] = converter(np.asarray(data_dict[colspec]), col=colspec)
+ result = converter(np.asarray(data_dict[colspec]), col=colspec)
+ # error: Unsupported target for indexed assignment
+ # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame")
+ data_dict[colspec] = result # type: ignore[index]
- return data_dict, columns
+ return data_dict
def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool):
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index 0740978bbbc3c..4de626288aa41 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -302,12 +302,10 @@ def read(
data_tups = sorted(data.items())
data = {k: v for k, (i, v) in zip(names, data_tups)}
- column_names, date_data = self._do_date_conversions(names, data)
+ date_data = self._do_date_conversions(names, data)
# maybe create a mi on the columns
- column_names = self._maybe_make_multi_index_columns(
- column_names, self.col_names
- )
+ column_names = self._maybe_make_multi_index_columns(names, self.col_names)
else:
# rename dict keys
@@ -330,7 +328,7 @@ def read(
data = {k: v for k, (i, v) in zip(names, data_tups)}
- names, date_data = self._do_date_conversions(names, data)
+ date_data = self._do_date_conversions(names, data)
index, column_names = self._make_index(date_data, alldata, names)
return index, column_names, date_data
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 5b4139edc19a9..f7d2aa2419429 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -289,7 +289,7 @@ def read(
data, columns = self._exclude_implicit_index(alldata)
conv_data = self._convert_data(data)
- columns, conv_data = self._do_date_conversions(columns, conv_data)
+ conv_data = self._do_date_conversions(columns, conv_data)
index, result_columns = self._make_index(
conv_data, alldata, columns, indexnamerow
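Across the arrow, C, and Python engines the simplification is identical: ``_do_date_conversions`` returns only the converted data, because combining columns (the one operation that could change the column names) no longer exists. The call sites reduce accordingly, sketched:

    # before: names, data = self._do_date_conversions(names, data)
    # after:  data = self._do_date_conversions(names, data)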
From 546957e200bef00f876f3497d6e20a289deec828 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 8 May 2024 10:23:33 -0700
Subject: [PATCH 09/12] Remove _get_complex_date_index
---
pandas/io/parsers/base_parser.py | 28 ----------------------------
1 file changed, 28 deletions(-)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index a0afde1700ce0..2b2d5a3d2d4d6 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -381,34 +381,6 @@ def ix(col):
return index
- @final
- def _get_complex_date_index(self, data, col_names):
- def _get_name(icol):
- if isinstance(icol, str):
- return icol
-
- if col_names is None:
- raise ValueError(f"Must supply column order to use {icol!s} as index")
-
- for i, c in enumerate(col_names):
- if i == icol:
- return c
-
- to_remove = []
- index = []
- for idx in self.index_col:
- name = _get_name(idx)
- to_remove.append(name)
- index.append(data[name])
-
- # remove index items from content and columns, don't pop in
- # loop
- for c in sorted(to_remove, reverse=True):
- data.pop(c)
- col_names.remove(c)
-
- return index
-
@final
def _clean_mapping(self, mapping):
"""converts col numbers to names"""
From faf4c6f782ce9e7359eaf999a940000e426ee9ff Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 8 May 2024 12:18:56 -0700
Subject: [PATCH 10/12] Remove concat arrays for csv
---
asv_bench/benchmarks/io/parsers.py | 25 +------
pandas/_libs/tslibs/parsing.pyi | 3 -
pandas/_libs/tslibs/parsing.pyx | 81 ----------------------
pandas/io/parsers/base_parser.py | 14 ++--
pandas/tests/io/parser/test_parse_dates.py | 14 ----
5 files changed, 6 insertions(+), 131 deletions(-)
diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py
index 1078837a8e395..d3fd5075a4707 100644
--- a/asv_bench/benchmarks/io/parsers.py
+++ b/asv_bench/benchmarks/io/parsers.py
@@ -1,10 +1,5 @@
-import numpy as np
-
try:
- from pandas._libs.tslibs.parsing import (
- _does_string_look_like_datetime,
- concat_date_cols,
- )
+ from pandas._libs.tslibs.parsing import _does_string_look_like_datetime
except ImportError:
# Avoid whole benchmark suite import failure on asv (currently 0.4)
pass
@@ -20,21 +15,3 @@ def setup(self, value):
def time_check_datetimes(self, value):
for obj in self.objects:
_does_string_look_like_datetime(obj)
-
-
-class ConcatDateCols:
- params = ([1234567890, "AAAA"], [1, 2])
- param_names = ["value", "dim"]
-
- def setup(self, value, dim):
- count_elem = 10000
- if dim == 1:
- self.object = (np.array([value] * count_elem),)
- if dim == 2:
- self.object = (
- np.array([value] * count_elem),
- np.array([value] * count_elem),
- )
-
- def time_check_concat(self, value, dim):
- concat_date_cols(self.object)
diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi
index 40394f915d4b0..845bd9a5a5635 100644
--- a/pandas/_libs/tslibs/parsing.pyi
+++ b/pandas/_libs/tslibs/parsing.pyi
@@ -27,7 +27,4 @@ def guess_datetime_format(
dt_str: str,
dayfirst: bool | None = ...,
) -> str | None: ...
-def concat_date_cols(
- date_cols: tuple,
-) -> npt.NDArray[np.object_]: ...
def get_rule_month(source: str) -> str: ...
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
index 85ef3fd93ff09..3d930dab9a949 100644
--- a/pandas/_libs/tslibs/parsing.pyx
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ -29,11 +29,6 @@ import numpy as np
cimport numpy as cnp
from numpy cimport (
- PyArray_GETITEM,
- PyArray_ITER_DATA,
- PyArray_ITER_NEXT,
- PyArray_IterNew,
- flatiter,
float64_t,
int64_t,
)
@@ -75,8 +70,6 @@ import_pandas_datetime()
from pandas._libs.tslibs.strptime import array_strptime
-from pandas._libs.tslibs.util cimport is_array
-
cdef extern from "pandas/portable.h":
int getdigit_ascii(char c, int default) nogil
@@ -1132,80 +1125,6 @@ cdef object convert_to_unicode(object item, bint keep_trivial_numbers):
return item
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def concat_date_cols(tuple date_cols) -> np.ndarray:
- """
- Concatenates elements from numpy arrays in `date_cols` into strings.
-
- Parameters
- ----------
- date_cols : tuple[ndarray]
-
- Returns
- -------
- arr_of_rows : ndarray[object]
-
- Examples
- --------
- >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object)
- >>> times=np.array(['11:20', '10:45'], dtype=object)
- >>> result = concat_date_cols((dates, times))
- >>> result
- array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object)
- """
- cdef:
- Py_ssize_t rows_count = 0, col_count = len(date_cols)
- Py_ssize_t col_idx, row_idx
- list list_to_join
- cnp.ndarray[object] iters
- object[::1] iters_view
- flatiter it
- cnp.ndarray[object] result
- object[::1] result_view
-
- if col_count == 0:
- return np.zeros(0, dtype=object)
-
- if not all(is_array(array) for array in date_cols):
- raise ValueError("not all elements from date_cols are numpy arrays")
-
- rows_count = min(len(array) for array in date_cols)
- result = np.zeros(rows_count, dtype=object)
- result_view = result
-
- if col_count == 1:
- array = date_cols[0]
- it = PyArray_IterNew(array)
- for row_idx in range(rows_count):
- item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
- result_view[row_idx] = convert_to_unicode(item, True)
- PyArray_ITER_NEXT(it)
- else:
- # create fixed size list - more efficient memory allocation
- list_to_join = [None] * col_count
- iters = np.zeros(col_count, dtype=object)
-
- # create memoryview of iters ndarray, that will contain some
- # flatiter's for each array in `date_cols` - more efficient indexing
- iters_view = iters
- for col_idx, array in enumerate(date_cols):
- iters_view[col_idx] = PyArray_IterNew(array)
-
- # array elements that are on the same line are converted to one string
- for row_idx in range(rows_count):
- for col_idx, array in enumerate(date_cols):
- # this cast is needed, because we did not find a way
- # to efficiently store `flatiter` type objects in ndarray
- it = iters_view[col_idx]
- item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
- list_to_join[col_idx] = convert_to_unicode(item, False)
- PyArray_ITER_NEXT(it)
- result_view[row_idx] = " ".join(list_to_join)
-
- return result
-
-
cpdef str get_rule_month(str source):
"""
Return starting month of given freq, default is December.
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 2b2d5a3d2d4d6..c6cc85b9f722b 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -22,7 +22,6 @@
)
import pandas._libs.ops as libops
from pandas._libs.parsers import STR_NA_VALUES
-from pandas._libs.tslibs import parsing
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
ParserError,
@@ -32,7 +31,6 @@
from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.common import (
- ensure_object,
is_bool_dtype,
is_dict_like,
is_extension_array_dtype,
@@ -1047,17 +1045,15 @@ def _make_date_converter(
cache_dates: bool = True,
date_format: dict[Hashable, str] | str | None = None,
):
- def converter(*date_cols, col: Hashable):
- if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm":
- return date_cols[0]
- # TODO: Can we remove concat_date_cols after deprecation of parsing
- # multiple cols?
- strs = parsing.concat_date_cols(date_cols)
+ def converter(date_col, col: Hashable):
+ if date_col.dtype.kind in "Mm":
+ return date_col
+
date_fmt = (
date_format.get(col) if isinstance(date_format, dict) else date_format
)
- str_objs = ensure_object(strs)
+ str_objs = lib.ensure_string_array(date_col)
try:
result = tools.to_datetime(
str_objs,
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index e21503c04d3a2..a0c7fd8df7f52 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -14,8 +14,6 @@
import pytest
import pytz
-from pandas._libs.tslibs import parsing
-
import pandas as pd
from pandas import (
DataFrame,
@@ -39,18 +37,6 @@
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
-@pytest.mark.parametrize("container", [list, tuple, Index, Series])
-@pytest.mark.parametrize("dim", [1, 2])
-def test_concat_date_col_fail(container, dim):
- msg = "not all elements from date_cols are numpy arrays"
- value = "19990127"
-
- date_cols = tuple(container([value]) for _ in range(dim))
-
- with pytest.raises(ValueError, match=msg):
- parsing.concat_date_cols(date_cols)
-
-
def test_date_col_as_index_col(all_parsers):
data = """\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
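With ``concat_date_cols`` gone, the converter receives exactly one column. Roughly what the simplified path now does for a string column, as a sketch under assumed inputs rather than the internal code:

    import numpy as np

    import pandas as pd

    date_col = np.array(["2013-11-03 19:00:00", "2013-11-04 19:00:00"], dtype=object)
    result = pd.to_datetime(date_col, format="%Y-%m-%d %H:%M:%S", cache=True)
    # columns that are already datetime64/timedelta64 are returned unchanged earlier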
From 42691e4a74ce60374602c7e140d0b1f2d6d59f9f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 8 May 2024 13:47:32 -0700
Subject: [PATCH 11/12] Unexfail test
---
pandas/tests/io/parser/test_parse_dates.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index a0c7fd8df7f52..3bb3d793606e1 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -293,7 +293,6 @@ def test_bad_date_parse_with_warning(all_parsers, cache, value):
)
-@xfail_pyarrow
def test_parse_dates_empty_string(all_parsers):
# see gh-2263
parser = all_parsers
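The un-xfailed test exercises empty date fields; the behavior it asserts, roughly (the data here is illustrative of gh-2263, not copied from the test):

    from io import StringIO

    import pandas as pd

    data = "Date,test\n2012-01-01,1\n,2"
    df = pd.read_csv(StringIO(data), parse_dates=["Date"])
    # the empty field parses to NaT rather than raising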
From 64fc335fbe6a5048f3452e16b0abf3bcc9f7e021 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 8 May 2024 16:04:28 -0700
Subject: [PATCH 12/12] Remove convert to unicode
---
pandas/_libs/tslibs/parsing.pyx | 42 +--------------------------------
1 file changed, 1 insertion(+), 41 deletions(-)
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
index 3d930dab9a949..c448a7e7c01b5 100644
--- a/pandas/_libs/tslibs/parsing.pyx
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ -7,7 +7,6 @@ import warnings
from pandas.util._exceptions import find_stack_level
-cimport cython
from cpython.datetime cimport (
datetime,
datetime_new,
@@ -18,7 +17,6 @@ from cpython.datetime cimport (
from datetime import timezone
-from cpython.object cimport PyObject_Str
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
from cython cimport Py_ssize_t
from libc.string cimport strchr
@@ -28,10 +26,7 @@ import_datetime()
import numpy as np
cimport numpy as cnp
-from numpy cimport (
- float64_t,
- int64_t,
-)
+from numpy cimport int64_t
cnp.import_array()
@@ -1090,41 +1085,6 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst) noexcept:
)
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef object convert_to_unicode(object item, bint keep_trivial_numbers):
- """
- Convert `item` to str.
-
- Parameters
- ----------
- item : object
- keep_trivial_numbers : bool
- if True, then conversion (to string from integer/float zero)
- is not performed
-
- Returns
- -------
- str or int or float
- """
- cdef:
- float64_t float_item
-
- if keep_trivial_numbers:
- if isinstance(item, int):
- if item == 0:
- return item
- elif isinstance(item, float):
- float_item = item
- if float_item == 0.0 or float_item != float_item:
- return item
-
- if not isinstance(item, str):
- item = PyObject_Str(item)
-
- return item
-
-
cpdef str get_rule_month(str source):
"""
Return starting month of given freq, default is December.