From d24b57ab425bf879661ccd039453f213a695d254 Mon Sep 17 00:00:00 2001
From: Satheesh Kumar Mohan <sathyz@gmail.com>
Date: Fri, 28 Feb 2020 08:55:38 +0530
Subject: [PATCH 01/11] BUG: parse_dates may have columns not in dataframe

read_csv will raise ValueError when columns used for parse_dates are not found in the dataframe.
---
 doc/source/whatsnew/v1.1.0.rst             |  1 +
 pandas/io/parsers.py                       | 55 +++++++++++++++++++++-
 pandas/tests/io/parser/test_parse_dates.py | 30 ++++++++++++
 3 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 0f18a1fd81815..3ea177fbe1ab4 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -296,6 +296,7 @@ I/O
   ``coerce_timestamps``; following pyarrow's default allows writing nanosecond
   timestamps with ``version="2.0"`` (:issue:`31652`).
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
+- `read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` is missing in the dataframe. (:issue:`31251`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 8a3ad6cb45b57..1cbc518f69e6b 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -6,10 +6,11 @@
 import csv
 import datetime
 from io import BufferedIOBase, RawIOBase, StringIO, TextIOWrapper
+from itertools import chain
 import re
 import sys
 from textwrap import fill
-from typing import Any, Dict, Set
+from typing import Any, Dict, List, Set
 import warnings
 
 import numpy as np
@@ -1419,6 +1420,56 @@ def __init__(self, kwds):
         # keep references to file handles opened by the parser itself
         self.handles = []
 
+    def _validate_parse_dates_presence(self, columns: List[str]):
+        """
+        Check if parse_dates are in columns.
+
+        if user has provided names for parse_dates, check if those columns
+        are available.
+
+        Parameters
+        ----------
+        columns : list
+            list of names of the dataframe.
+
+        Raises
+        ------
+        ValueError
+            If column to parse_date is not in dataframe.
+
+        """
+        if isinstance(self.parse_dates, list):
+            # a column in parse_dates could be represented
+            # ColReference = Union[int, str]
+            # DateGroups = List[ColReference]
+            # ParseDates = Union[ DateGroups, List[DateGroups],
+            #     Dict[ColReference, DateGroups]]
+            cols_needed = []
+            for col in self.parse_dates:
+                if isinstance(col, list):
+                    cols_needed.extend(col)
+                else:
+                    cols_needed.append(col)
+        elif isinstance(self.parse_dates, dict):
+            cols_needed = list(chain(*self.parse_dates.values()))
+        else:
+            cols_needed = []
+
+        # get only columns that are references using names (str), not by index
+        missing_cols = ", ".join(
+            sorted(
+                {
+                    col
+                    for col in cols_needed
+                    if isinstance(col, str) and col not in columns
+                }
+            )
+        )
+        if missing_cols:
+            raise ValueError(
+                f"Missing column provided to 'parse_dates': '{missing_cols}'"
+            )
+
     def close(self):
         for f in self.handles:
             f.close()
@@ -1938,6 +1989,7 @@ def __init__(self, src, **kwds):
             if len(self.names) < len(usecols):
                 _validate_usecols_names(usecols, self.names)
 
+        self._validate_parse_dates_presence(self.names)
         self._set_noconvert_columns()
 
         self.orig_names = self.names
@@ -2308,6 +2360,7 @@ def __init__(self, f, **kwds):
             if self.index_names is None:
                 self.index_names = index_names
 
+        self._validate_parse_dates_presence(self.columns)
         if self.parse_dates:
             self._no_thousands_columns = self._set_no_thousands_columns()
         else:
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 31573e4e6ecce..6f7a1d3d5e351 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1516,3 +1516,33 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti
 
     assert except_out_dateutil == except_in_dateutil
     assert result == expected
+
+
+@pytest.mark.parametrize(
+    "names, usecols, parse_dates, missing_cols",
+    [
+        (None, ["val"], ["date", "time"], "date, time"),
+        (None, ["val"], [0, "time"], "time"),
+        (None, ["val"], [["date", "time"]], "date, time"),
+        (None, ["val"], [[0, "time"]], "time"),
+        (None, ["val"], {"date": [0, "time"]}, "time"),
+        (None, ["val"], {"date": ["date", "time"]}, "date, time"),
+        (None, ["val"], [["date", "time"], "date"], "date, time"),
+        (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"),
+        (
+            ["date1", "time1", "temperature"],
+            ["date1", "temperature"],
+            ["date1", "time"],
+            "time",
+        ),
+    ],
+)
+def test_missing_column(all_parsers, names, usecols, parse_dates, missing_cols):
+    """GH31251 column names provided in parse_dates could be missing."""
+    parser = all_parsers
+    content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n")
+    msg = f"Missing column provided to 'parse_dates': '{missing_cols}'"
+    with pytest.raises(ValueError, match=msg):
+        parser.read_csv(
+            content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates,
+        )

From 3a99b39f45bc8d74a3e67011fbd3dbfc0ae07437 Mon Sep 17 00:00:00 2001
From: Satheesh Kumar Mohan <sathyz@gmail.com>
Date: Fri, 28 Feb 2020 22:01:29 +0530
Subject: [PATCH 02/11] add return annotation.

---
 pandas/io/parsers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 1cbc518f69e6b..81177d4c10f30 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1420,11 +1420,11 @@ def __init__(self, kwds):
         # keep references to file handles opened by the parser itself
         self.handles = []
 
-    def _validate_parse_dates_presence(self, columns: List[str]):
+    def _validate_parse_dates_presence(self, columns: List[str]) -> None:
         """
         Check if parse_dates are in columns.
 
-        if user has provided names for parse_dates, check if those columns
+        If user has provided names for parse_dates, check if those columns
         are available.
 
         Parameters

From 78ff312a2592ec8b3be1ac90f3986ed876782d1d Mon Sep 17 00:00:00 2001
From: Satheesh Kumar Mohan <sathyz@gmail.com>
Date: Sat, 29 Feb 2020 09:15:05 +0530
Subject: [PATCH 03/11] use chain.from_iterable to read parse_dates

---
 doc/source/whatsnew/v1.1.0.rst             | 2 +-
 pandas/io/parsers.py                       | 7 +------
 pandas/tests/io/parser/test_parse_dates.py | 4 ++--
 3 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 3ea177fbe1ab4..a0a232276510e 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -296,7 +296,7 @@ I/O
   ``coerce_timestamps``; following pyarrow's default allows writing nanosecond
   timestamps with ``version="2.0"`` (:issue:`31652`).
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
-- `read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` is missing in the dataframe. (:issue:`31251`)
+- `read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` are missing in the dataframe (:issue:`31251`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 81177d4c10f30..590f664fff964 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1444,12 +1444,7 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None:
             # DateGroups = List[ColReference]
             # ParseDates = Union[ DateGroups, List[DateGroups],
             #     Dict[ColReference, DateGroups]]
-            cols_needed = []
-            for col in self.parse_dates:
-                if isinstance(col, list):
-                    cols_needed.extend(col)
-                else:
-                    cols_needed.append(col)
+            cols_needed = chain.from_iterable([col if isinstance(col, list) else [col] for col in self.parse_dates ])
         elif isinstance(self.parse_dates, dict):
             cols_needed = list(chain(*self.parse_dates.values()))
         else:
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 6f7a1d3d5e351..051382e1e527a 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1537,8 +1537,8 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti
         ),
     ],
 )
-def test_missing_column(all_parsers, names, usecols, parse_dates, missing_cols):
-    """GH31251 column names provided in parse_dates could be missing."""
+def test_missing_parse_dates_column_raises(all_parsers, names, usecols, parse_dates, missing_cols):
+    # gh-31251 column names provided in parse_dates could be missing.
     parser = all_parsers
     content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n")
     msg = f"Missing column provided to 'parse_dates': '{missing_cols}'"

From 007c992bff320776b9bed11187ffe2973b5950b6 Mon Sep 17 00:00:00 2001
From: Satheesh Kumar Mohan <sathyz@gmail.com>
Date: Sat, 29 Feb 2020 09:25:50 +0530
Subject: [PATCH 04/11] break long lines.

---
 pandas/io/parsers.py                       | 4 +++-
 pandas/tests/io/parser/test_parse_dates.py | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 590f664fff964..1428099b13c71 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1444,7 +1444,9 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None:
             # DateGroups = List[ColReference]
             # ParseDates = Union[ DateGroups, List[DateGroups],
             #     Dict[ColReference, DateGroups]]
-            cols_needed = chain.from_iterable([col if isinstance(col, list) else [col] for col in self.parse_dates ])
+            cols_needed = chain.from_iterable(
+                [col if isinstance(col, list) else [col] for col in self.parse_dates]
+            )
         elif isinstance(self.parse_dates, dict):
             cols_needed = list(chain(*self.parse_dates.values()))
         else:
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 051382e1e527a..2fcac6fa57cf8 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1537,7 +1537,9 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti
         ),
     ],
 )
-def test_missing_parse_dates_column_raises(all_parsers, names, usecols, parse_dates, missing_cols):
+def test_missing_parse_dates_column_raises(
+    all_parsers, names, usecols, parse_dates, missing_cols
+):
     # gh-31251 column names provided in parse_dates could be missing.
     parser = all_parsers
     content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n")

From 7f1cd6945e308146d280c358b2db6ef047ac8446 Mon Sep 17 00:00:00 2001
From: Satheesh Kumar Mohan <sathyz@gmail.com>
Date: Sat, 29 Feb 2020 17:47:30 +0530
Subject: [PATCH 05/11] fixing typing mistake in cols_needed

---
 pandas/io/parsers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 1428099b13c71..c03b2edfcd732 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1448,9 +1448,9 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None:
                 [col if isinstance(col, list) else [col] for col in self.parse_dates]
             )
         elif isinstance(self.parse_dates, dict):
-            cols_needed = list(chain(*self.parse_dates.values()))
+            cols_needed = chain(*self.parse_dates.values())
         else:
-            cols_needed = []
+            cols_needed = chain()
 
         # get only columns that are references using names (str), not by index
         missing_cols = ", ".join(

From 110f594fce222cfd5e2c65623f6063c27b389506 Mon Sep 17 00:00:00 2001
From: Satheesh Kumar Mohan <sathyz@gmail.com>
Date: Tue, 3 Mar 2020 09:33:26 +0530
Subject: [PATCH 06/11] add func reference for read_csv in whatsnew entry

---
 doc/source/whatsnew/v1.1.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index a0a232276510e..bef692411424e 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -296,7 +296,7 @@ I/O
   ``coerce_timestamps``; following pyarrow's default allows writing nanosecond
   timestamps with ``version="2.0"`` (:issue:`31652`).
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
-- `read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` are missing in the dataframe (:issue:`31251`)
+- :func:`read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` are missing in the dataframe (:issue:`31251`)
 
 Plotting
 ^^^^^^^^

From 1536b77f86ca8bb8f7ec80b89f766fbc14ab9273 Mon Sep 17 00:00:00 2001
From: Satheesh Kumar Mohan <sathyz@gmail.com>
Date: Wed, 4 Mar 2020 20:26:52 +0530
Subject: [PATCH 07/11] docstring fix in whatsnew.

---
 doc/source/whatsnew/v1.1.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index bef692411424e..0bc35fbc7a65c 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -296,7 +296,7 @@ I/O
   ``coerce_timestamps``; following pyarrow's default allows writing nanosecond
   timestamps with ``version="2.0"`` (:issue:`31652`).
 - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`)
-- :func:`read_csv` will raise a ``ValueError`` when the columns passed in `parse_dates` are missing in the dataframe (:issue:`31251`)
+- :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the DataFrame (:issue:`31251`)
 
 Plotting
 ^^^^^^^^

From ee4f3fb9df8e7f352affa2ff3a0f965678ebd56b Mon Sep 17 00:00:00 2001
From: Satheesh Kumar Mohan <sathyz@gmail.com>
Date: Wed, 11 Mar 2020 18:13:58 +0530
Subject: [PATCH 08/11] import itertools directly

---
 pandas/io/parsers.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 2b051d8d940b3..f8d0d6b7be585 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -5,8 +5,8 @@
 from collections import abc, defaultdict
 import csv
 import datetime
-from itertools import chain
 from io import StringIO, TextIOWrapper
+import itertools
 import re
 import sys
 from textwrap import fill
@@ -1432,7 +1432,7 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None:
         Parameters
         ----------
         columns : list
-            list of names of the dataframe.
+            List of names of the dataframe.
 
         Raises
         ------
@@ -1444,15 +1444,15 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None:
             # a column in parse_dates could be represented
             # ColReference = Union[int, str]
             # DateGroups = List[ColReference]
-            # ParseDates = Union[ DateGroups, List[DateGroups],
+            # ParseDates = Union[DateGroups, List[DateGroups],
             #     Dict[ColReference, DateGroups]]
-            cols_needed = chain.from_iterable(
-                [col if isinstance(col, list) else [col] for col in self.parse_dates]
+            cols_needed = itertools.chain.from_iterable(
+                col if isinstance(col, list) else [col] for col in self.parse_dates
             )
         elif isinstance(self.parse_dates, dict):
-            cols_needed = chain(*self.parse_dates.values())
+            cols_needed = itertools.chain(*self.parse_dates.values())
         else:
-            cols_needed = chain()
+            cols_needed = itertools.chain()
 
         # get only columns that are references using names (str), not by index
         missing_cols = ", ".join(

From 633e481da3e14a7c9eb16fb4362c6cb11ac027e1 Mon Sep 17 00:00:00 2001
From: Satheesh Kumar Mohan <sathyz@gmail.com>
Date: Wed, 11 Mar 2020 18:20:49 +0530
Subject: [PATCH 09/11] typing hint for cols_needed

---
 pandas/io/parsers.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index f8d0d6b7be585..df0342d01d16b 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -10,7 +10,7 @@
 import re
 import sys
 from textwrap import fill
-from typing import Any, Dict, List, Set
+from typing import Any, Dict, List, Set, Iterable
 import warnings
 
 import numpy as np
@@ -1440,6 +1440,7 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None:
             If column to parse_date is not in dataframe.
 
         """
+        cols_needed: Iterable
         if isinstance(self.parse_dates, list):
             # a column in parse_dates could be represented
             # ColReference = Union[int, str]
@@ -1452,7 +1453,7 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None:
         elif isinstance(self.parse_dates, dict):
             cols_needed = itertools.chain(*self.parse_dates.values())
         else:
-            cols_needed = itertools.chain()
+            cols_needed = []
 
         # get only columns that are references using names (str), not by index
         missing_cols = ", ".join(

From 537f4df09af3b2551da3b3d85c3711d993d12e40 Mon Sep 17 00:00:00 2001
From: Satheesh Kumar Mohan <sathyz@gmail.com>
Date: Wed, 11 Mar 2020 18:56:21 +0530
Subject: [PATCH 10/11] sort import statements

---
 pandas/io/parsers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index df0342d01d16b..d29f180038933 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -10,7 +10,7 @@
 import re
 import sys
 from textwrap import fill
-from typing import Any, Dict, List, Set, Iterable
+from typing import Any, Dict, Iterable, List, Set
 import warnings
 
 import numpy as np

From 337efcd0825216682be37be1fdc3b43b99201c20 Mon Sep 17 00:00:00 2001
From: Satheesh Kumar Mohan <sathyz@gmail.com>
Date: Mon, 16 Mar 2020 21:48:30 +0530
Subject: [PATCH 11/11] use is_dict_like & is_list_like

---
 pandas/io/parsers.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index d29f180038933..648c986460560 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -35,6 +35,7 @@
     ensure_str,
     is_bool_dtype,
     is_categorical_dtype,
+    is_dict_like,
     is_dtype_equal,
     is_extension_array_dtype,
     is_file_like,
@@ -1441,17 +1442,17 @@ def _validate_parse_dates_presence(self, columns: List[str]) -> None:
 
         """
         cols_needed: Iterable
-        if isinstance(self.parse_dates, list):
+        if is_dict_like(self.parse_dates):
+            cols_needed = itertools.chain(*self.parse_dates.values())
+        elif is_list_like(self.parse_dates):
             # a column in parse_dates could be represented
             # ColReference = Union[int, str]
             # DateGroups = List[ColReference]
             # ParseDates = Union[DateGroups, List[DateGroups],
             #     Dict[ColReference, DateGroups]]
             cols_needed = itertools.chain.from_iterable(
-                col if isinstance(col, list) else [col] for col in self.parse_dates
+                col if is_list_like(col) else [col] for col in self.parse_dates
             )
-        elif isinstance(self.parse_dates, dict):
-            cols_needed = itertools.chain(*self.parse_dates.values())
         else:
             cols_needed = []