From d24b57ab425bf879661ccd039453f213a695d254 Mon Sep 17 00:00:00 2001 From: Satheesh Kumar Mohan Date: Fri, 28 Feb 2020 08:55:38 +0530 Subject: [PATCH 01/11] BUG: parse_dates may have columns not in dataframe read_csv will raise ValueError when columns used for parse_dates are not found in the dataframe. --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/parsers.py | 55 +++++++++++++++++++++- pandas/tests/io/parser/test_parse_dates.py | 30 ++++++++++++ 3 files changed, 85 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0f18a1fd81815..3ea177fbe1ab4 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -296,6 +296,7 @@ I/O ``coerce_timestamps``; following pyarrow's default allows writing nanosecond timestamps with ``version="2.0"`` (:issue:`31652`). - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) +- `read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` is missing in the dataframe. (:issue:`31251`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8a3ad6cb45b57..1cbc518f69e6b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -6,10 +6,11 @@ import csv import datetime from io import BufferedIOBase, RawIOBase, StringIO, TextIOWrapper +from itertools import chain import re import sys from textwrap import fill -from typing import Any, Dict, Set +from typing import Any, Dict, List, Set import warnings import numpy as np @@ -1419,6 +1420,56 @@ def __init__(self, kwds): # keep references to file handles opened by the parser itself self.handles = [] + def _validate_parse_dates_presence(self, columns: List[str]): + """ + Check if parse_dates are in columns. + + if user has provided names for parse_dates, check if those columns + are available. 
+ + Parameters + ---------- + columns : list + list of names of the dataframe. + + Raises + ------ + ValueError + If column to parse_date is not in dataframe. + + """ + if isinstance(self.parse_dates, list): + # a column in parse_dates could be represented + # ColReference = Union[int, str] + # DateGroups = List[ColReference] + # ParseDates = Union[ DateGroups, List[DateGroups], + # Dict[ColReference, DateGroups]] + cols_needed = [] + for col in self.parse_dates: + if isinstance(col, list): + cols_needed.extend(col) + else: + cols_needed.append(col) + elif isinstance(self.parse_dates, dict): + cols_needed = list(chain(*self.parse_dates.values())) + else: + cols_needed = [] + + # get only columns that are references using names (str), not by index + missing_cols = ", ".join( + sorted( + { + col + for col in cols_needed + if isinstance(col, str) and col not in columns + } + ) + ) + if missing_cols: + raise ValueError( + f"Missing column provided to 'parse_dates': '{missing_cols}'" + ) + def close(self): for f in self.handles: f.close() @@ -1938,6 +1989,7 @@ def __init__(self, src, **kwds): if len(self.names) < len(usecols): _validate_usecols_names(usecols, self.names) + self._validate_parse_dates_presence(self.names) self._set_noconvert_columns() self.orig_names = self.names @@ -2308,6 +2360,7 @@ def __init__(self, f, **kwds): if self.index_names is None: self.index_names = index_names + self._validate_parse_dates_presence(self.columns) if self.parse_dates: self._no_thousands_columns = self._set_no_thousands_columns() else: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 31573e4e6ecce..6f7a1d3d5e351 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1516,3 +1516,33 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti assert except_out_dateutil == except_in_dateutil assert result == expected + + +@pytest.mark.parametrize( 
+ "names, usecols, parse_dates, missing_cols", + [ + (None, ["val"], ["date", "time"], "date, time"), + (None, ["val"], [0, "time"], "time"), + (None, ["val"], [["date", "time"]], "date, time"), + (None, ["val"], [[0, "time"]], "time"), + (None, ["val"], {"date": [0, "time"]}, "time"), + (None, ["val"], {"date": ["date", "time"]}, "date, time"), + (None, ["val"], [["date", "time"], "date"], "date, time"), + (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"), + ( + ["date1", "time1", "temperature"], + ["date1", "temperature"], + ["date1", "time"], + "time", + ), + ], +) +def test_missing_column(all_parsers, names, usecols, parse_dates, missing_cols): + """GH31251 column names provided in parse_dates could be missing.""" + parser = all_parsers + content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") + msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" + with pytest.raises(ValueError, match=msg): + parser.read_csv( + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates, + ) From 3a99b39f45bc8d74a3e67011fbd3dbfc0ae07437 Mon Sep 17 00:00:00 2001 From: Satheesh Kumar Mohan Date: Fri, 28 Feb 2020 22:01:29 +0530 Subject: [PATCH 02/11] add return annotation. --- pandas/io/parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1cbc518f69e6b..81177d4c10f30 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1420,11 +1420,11 @@ def __init__(self, kwds): # keep references to file handles opened by the parser itself self.handles = [] - def _validate_parse_dates_presence(self, columns: List[str]): + def _validate_parse_dates_presence(self, columns: List[str]) -> None: """ Check if parse_dates are in columns. - if user has provided names for parse_dates, check if those columns + If user has provided names for parse_dates, check if those columns are available. 
Parameters From 78ff312a2592ec8b3be1ac90f3986ed876782d1d Mon Sep 17 00:00:00 2001 From: Satheesh Kumar Mohan Date: Sat, 29 Feb 2020 09:15:05 +0530 Subject: [PATCH 03/11] use chain.from_iterable to read parse_dates --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/io/parsers.py | 7 +------ pandas/tests/io/parser/test_parse_dates.py | 4 ++-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 3ea177fbe1ab4..a0a232276510e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -296,7 +296,7 @@ I/O ``coerce_timestamps``; following pyarrow's default allows writing nanosecond timestamps with ``version="2.0"`` (:issue:`31652`). - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) -- `read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` is missing in the dataframe. 
(:issue:`31251`) +- `read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` are missing in the dataframe (:issue:`31251`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 81177d4c10f30..590f664fff964 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1444,12 +1444,7 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: # DateGroups = List[ColReference] # ParseDates = Union[ DateGroups, List[DateGroups], # Dict[ColReference, DateGroups]] - cols_needed = [] - for col in self.parse_dates: - if isinstance(col, list): - cols_needed.extend(col) - else: - cols_needed.append(col) + cols_needed = chain.from_iterable([col if isinstance(col, list) else [col] for col in self.parse_dates ]) elif isinstance(self.parse_dates, dict): cols_needed = list(chain(*self.parse_dates.values())) else: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 6f7a1d3d5e351..051382e1e527a 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1537,8 +1537,8 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti ), ], ) -def test_missing_column(all_parsers, names, usecols, parse_dates, missing_cols): - """GH31251 column names provided in parse_dates could be missing.""" +def test_missing_parse_dates_column_raises(all_parsers, names, usecols, parse_dates, missing_cols): + # gh-31251 column names provided in parse_dates could be missing. parser = all_parsers content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" From 007c992bff320776b9bed11187ffe2973b5950b6 Mon Sep 17 00:00:00 2001 From: Satheesh Kumar Mohan Date: Sat, 29 Feb 2020 09:25:50 +0530 Subject: [PATCH 04/11] break long lines. 
--- pandas/io/parsers.py | 4 +++- pandas/tests/io/parser/test_parse_dates.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 590f664fff964..1428099b13c71 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1444,7 +1444,9 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: # DateGroups = List[ColReference] # ParseDates = Union[ DateGroups, List[DateGroups], # Dict[ColReference, DateGroups]] - cols_needed = chain.from_iterable([col if isinstance(col, list) else [col] for col in self.parse_dates ]) + cols_needed = chain.from_iterable( + [col if isinstance(col, list) else [col] for col in self.parse_dates] + ) elif isinstance(self.parse_dates, dict): cols_needed = list(chain(*self.parse_dates.values())) else: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 051382e1e527a..2fcac6fa57cf8 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1537,7 +1537,9 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti ), ], ) -def test_missing_parse_dates_column_raises(all_parsers, names, usecols, parse_dates, missing_cols): +def test_missing_parse_dates_column_raises( + all_parsers, names, usecols, parse_dates, missing_cols +): # gh-31251 column names provided in parse_dates could be missing. 
parser = all_parsers content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") From 7f1cd6945e308146d280c358b2db6ef047ac8446 Mon Sep 17 00:00:00 2001 From: Satheesh Kumar Mohan Date: Sat, 29 Feb 2020 17:47:30 +0530 Subject: [PATCH 05/11] fixing typing mistake in cols_needed --- pandas/io/parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1428099b13c71..c03b2edfcd732 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1448,9 +1448,9 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: [col if isinstance(col, list) else [col] for col in self.parse_dates] ) elif isinstance(self.parse_dates, dict): - cols_needed = list(chain(*self.parse_dates.values())) + cols_needed = chain(*self.parse_dates.values()) else: - cols_needed = [] + cols_needed = chain() # get only columns that are references using names (str), not by index missing_cols = ", ".join( From 110f594fce222cfd5e2c65623f6063c27b389506 Mon Sep 17 00:00:00 2001 From: Satheesh Kumar Mohan Date: Tue, 3 Mar 2020 09:33:26 +0530 Subject: [PATCH 06/11] add func reference for read_csv in whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a0a232276510e..bef692411424e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -296,7 +296,7 @@ I/O ``coerce_timestamps``; following pyarrow's default allows writing nanosecond timestamps with ``version="2.0"`` (:issue:`31652`). 
- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) -- `read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` are missing in the dataframe (:issue:`31251`) +- :func:`read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` are missing in the dataframe (:issue:`31251`) Plotting ^^^^^^^^ From 1536b77f86ca8bb8f7ec80b89f766fbc14ab9273 Mon Sep 17 00:00:00 2001 From: Satheesh Kumar Mohan Date: Wed, 4 Mar 2020 20:26:52 +0530 Subject: [PATCH 07/11] docstring fix in whatsnew. --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index bef692411424e..0bc35fbc7a65c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -296,7 +296,7 @@ I/O ``coerce_timestamps``; following pyarrow's default allows writing nanosecond timestamps with ``version="2.0"`` (:issue:`31652`). 
- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) -- :func:`read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` are missing in the dataframe (:issue:`31251`) +- :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the Dataframe (:issue:`31251`) Plotting ^^^^^^^^ From ee4f3fb9df8e7f352affa2ff3a0f965678ebd56b Mon Sep 17 00:00:00 2001 From: Satheesh Kumar Mohan Date: Wed, 11 Mar 2020 18:13:58 +0530 Subject: [PATCH 08/11] import itertools directly --- pandas/io/parsers.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2b051d8d940b3..f8d0d6b7be585 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,8 +5,8 @@ from collections import abc, defaultdict import csv import datetime -from itertools import chain from io import StringIO, TextIOWrapper +import itertools import re import sys from textwrap import fill @@ -1432,7 +1432,7 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: Parameters ---------- columns : list - list of names of the dataframe. + List of names of the dataframe. 
Raises ------ @@ -1444,15 +1444,15 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: # a column in parse_dates could be represented # ColReference = Union[int, str] # DateGroups = List[ColReference] - # ParseDates = Union[ DateGroups, List[DateGroups], + # ParseDates = Union[DateGroups, List[DateGroups], # Dict[ColReference, DateGroups]] - cols_needed = chain.from_iterable( - [col if isinstance(col, list) else [col] for col in self.parse_dates] + cols_needed = itertools.chain.from_iterable( + col if isinstance(col, list) else [col] for col in self.parse_dates ) elif isinstance(self.parse_dates, dict): - cols_needed = chain(*self.parse_dates.values()) + cols_needed = itertools.chain(*self.parse_dates.values()) else: - cols_needed = chain() + cols_needed = itertools.chain() # get only columns that are references using names (str), not by index missing_cols = ", ".join( From 633e481da3e14a7c9eb16fb4362c6cb11ac027e1 Mon Sep 17 00:00:00 2001 From: Satheesh Kumar Mohan Date: Wed, 11 Mar 2020 18:20:49 +0530 Subject: [PATCH 09/11] typing hint for cols_needed --- pandas/io/parsers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f8d0d6b7be585..df0342d01d16b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -10,7 +10,7 @@ import re import sys from textwrap import fill -from typing import Any, Dict, List, Set +from typing import Any, Dict, List, Set, Iterable import warnings import numpy as np @@ -1440,6 +1440,7 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: If column to parse_date is not in dataframe. 
""" + cols_needed: Iterable if isinstance(self.parse_dates, list): # a column in parse_dates could be represented # ColReference = Union[int, str] @@ -1452,7 +1453,7 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: elif isinstance(self.parse_dates, dict): cols_needed = itertools.chain(*self.parse_dates.values()) else: - cols_needed = itertools.chain() + cols_needed = [] # get only columns that are references using names (str), not by index missing_cols = ", ".join( From 537f4df09af3b2551da3b3d85c3711d993d12e40 Mon Sep 17 00:00:00 2001 From: Satheesh Kumar Mohan Date: Wed, 11 Mar 2020 18:56:21 +0530 Subject: [PATCH 10/11] sort import statemeents --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index df0342d01d16b..d29f180038933 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -10,7 +10,7 @@ import re import sys from textwrap import fill -from typing import Any, Dict, List, Set, Iterable +from typing import Any, Dict, Iterable, List, Set import warnings import numpy as np From 337efcd0825216682be37be1fdc3b43b99201c20 Mon Sep 17 00:00:00 2001 From: Satheesh Kumar Mohan Date: Mon, 16 Mar 2020 21:48:30 +0530 Subject: [PATCH 11/11] use is_dict_like & is_list_like --- pandas/io/parsers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d29f180038933..648c986460560 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -35,6 +35,7 @@ ensure_str, is_bool_dtype, is_categorical_dtype, + is_dict_like, is_dtype_equal, is_extension_array_dtype, is_file_like, @@ -1441,17 +1442,17 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None: """ cols_needed: Iterable - if isinstance(self.parse_dates, list): + if is_dict_like(self.parse_dates): + cols_needed = itertools.chain(*self.parse_dates.values()) + elif is_list_like(self.parse_dates): # a column in 
parse_dates could be represented # ColReference = Union[int, str] # DateGroups = List[ColReference] # ParseDates = Union[DateGroups, List[DateGroups], # Dict[ColReference, DateGroups]] cols_needed = itertools.chain.from_iterable( - col if isinstance(col, list) else [col] for col in self.parse_dates + col if is_list_like(col) else [col] for col in self.parse_dates ) - elif isinstance(self.parse_dates, dict): - cols_needed = itertools.chain(*self.parse_dates.values()) else: cols_needed = []