From 57ccd4cb5d0b63da32d13e405ddec1b16607f4b9 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Thu, 25 Nov 2021 23:57:03 +0100
Subject: [PATCH 1/5] BUG: read_csv not replacing nans before date conversion

---
 doc/source/whatsnew/v1.4.0.rst             |  2 +
 pandas/io/parsers/base_parser.py           | 16 +++++++-
 pandas/io/parsers/python_parser.py         |  4 +-
 pandas/tests/io/parser/test_parse_dates.py | 45 ++++++++++++++++++++--
 pandas/tests/io/xml/__init__.py            |  0
 5 files changed, 60 insertions(+), 7 deletions(-)
 create mode 100644 pandas/tests/io/xml/__init__.py

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 39e3894f86302..7da3599225fb0 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -661,6 +661,8 @@ I/O
 - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`)
 - Bug in :class:`ExcelWriter`, where ``engine_kwargs`` were not passed through to all engines (:issue:`43442`)
 - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`)
+- Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`)
+- Bug in :func:`read_csv` not replacing values recognized as missing (e.g. those given in ``na_values``) with ``np.nan`` before attempting date conversion (:issue:`26203`)
 - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`)
 - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
 - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 25a89d1c57006..a02054ecda82e 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -139,6 +139,7 @@ def __init__(self, kwds):
         self.col_names = None
 
         self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
+        self._parse_date_cols: Iterable = []
         self.date_parser = kwds.pop("date_parser", None)
         self.dayfirst = kwds.pop("dayfirst", False)
         self.keep_date_col = kwds.pop("keep_date_col", False)
@@ -237,7 +238,7 @@ def _open_handles(
             errors=kwds.get("encoding_errors", "strict"),
         )
 
-    def _validate_parse_dates_presence(self, columns: list[str]) -> None:
+    def _validate_parse_dates_presence(self, columns: list[str]) -> Iterable:
         """
         Check if parse_dates are in columns.
 
@@ -271,6 +272,8 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
         else:
             cols_needed = []
 
+        cols_needed = list(cols_needed)
+
         # get only columns that are references using names (str), not by index
         missing_cols = ", ".join(
             sorted(
@@ -285,6 +288,11 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
             raise ValueError(
                 f"Missing column provided to 'parse_dates': '{missing_cols}'"
             )
+        # Convert positions to actual column names
+        return [
+            col if (isinstance(col, str) or col in columns) else columns[col]
+            for col in cols_needed
+        ]
 
     def close(self):
         if self.handles is not None:
@@ -556,6 +564,12 @@ def _convert_to_ndarrays(
             else:
                 col_na_values, col_na_fvalues = set(), set()
 
+            if c in self._parse_date_cols:
+                mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
+                np.putmask(values, mask, np.nan)
+                result[c] = values
+                continue
+
             if conv_f is not None:
                 # conv_f applied to data before inference
                 if cast_type is not None:
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index f5420618c0235..7c92dc41ecd1f 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -152,7 +152,7 @@ def __init__(
         if self._col_indices is None:
             self._col_indices = list(range(len(self.columns)))
 
-        self._validate_parse_dates_presence(self.columns)
+        self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
         no_thousands_columns: set[int] | None = None
         if self.parse_dates:
             no_thousands_columns = self._set_noconvert_dtype_columns(
@@ -277,9 +277,9 @@ def read(self, rows=None):
         alldata = self._rows_to_cols(content)
         data, columns = self._exclude_implicit_index(alldata)
 
+        data = self._convert_data(data)
         columns, data = self._do_date_conversions(columns, data)
 
-        data = self._convert_data(data)
         index, columns = self._make_index(data, alldata, columns, indexnamerow)
 
         return index, columns, data
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 10d260bad215e..a61f3bc03028c 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -286,8 +286,6 @@ def date_parser(*date_cols):
 
     if not keep_date_col:
         expected = expected.drop(["X1", "X2", "X3"], axis=1)
-    elif parser.engine == "python":
-        expected["X1"] = expected["X1"].astype(np.int64)
 
     # Python can sometimes be flaky about how
     # the aggregated columns are entered, so
@@ -425,8 +423,6 @@ def test_multiple_date_col(all_parsers, keep_date_col):
 
     if not keep_date_col:
         expected = expected.drop(["X1", "X2", "X3"], axis=1)
-    elif parser.engine == "python":
-        expected["X1"] = expected["X1"].astype(np.int64)
 
     tm.assert_frame_equal(result, expected)
 
@@ -1907,3 +1903,44 @@ def test_dayfirst_warnings():
             index_col="date",
         ).index
     tm.assert_index_equal(expected, res8)
+
+
+@skip_pyarrow
+def test_infer_first_column_as_index(all_parsers):
+    # GH#11019
+    parser = all_parsers
+    data = "a,b,c\n1970-01-01,2,3,4"
+    result = parser.read_csv(StringIO(data), parse_dates=["a"])
+    expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"])
+    tm.assert_frame_equal(result, expected)
+
+
+@skip_pyarrow
+def test_replace_nans_before_parsing_dates(all_parsers):
+    # GH#26203
+    parser = all_parsers
+    data = """Test
+2012-10-01
+0
+2015-05-15
+#
+2017-09-09
+"""
+    result = parser.read_csv(
+        StringIO(data),
+        na_values={"Test": ["#", "0"]},
+        parse_dates=["Test"],
+        date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d"),
+    )
+    expected = DataFrame(
+        {
+            "Test": [
+                Timestamp("2012-10-01"),
+                pd.NaT,
+                Timestamp("2015-05-15"),
+                pd.NaT,
+                Timestamp("2017-09-09"),
+            ]
+        }
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/xml/__init__.py b/pandas/tests/io/xml/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d

From 72d7937fd3f05662418503abb21d03cf343f7c38 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 26 Nov 2021 01:26:38 +0100
Subject: [PATCH 2/5] Remove mypy ignore

---
 pandas/tests/io/xml/test_xml.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index a99f66336bf22..ecb040b28e4ca 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -681,9 +681,7 @@ def test_names_option_wrong_type(datapath, parser):
     filename = datapath("io", "data", "xml", "books.xml")
 
     with pytest.raises(TypeError, match=("is not a valid type for names")):
-        read_xml(
-            filename, names="Col1, Col2, Col3", parser=parser  # type: ignore[arg-type]
-        )
+        read_xml(filename, names="Col1, Col2, Col3", parser=parser)
 
 
 # ENCODING

From 965aec8c5bcdec76a03b7edcd8af3ee30b7147e9 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 26 Nov 2021 16:09:36 +0100
Subject: [PATCH 3/5] Remove init file

---
 pandas/tests/io/xml/__init__.py | 0
 pandas/tests/io/xml/test_xml.py | 4 +++-
 2 files changed, 3 insertions(+), 1 deletion(-)
 delete mode 100644 pandas/tests/io/xml/__init__.py

diff --git a/pandas/tests/io/xml/__init__.py b/pandas/tests/io/xml/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index ecb040b28e4ca..a99f66336bf22 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -681,7 +681,9 @@ def test_names_option_wrong_type(datapath, parser):
     filename = datapath("io", "data", "xml", "books.xml")
 
     with pytest.raises(TypeError, match=("is not a valid type for names")):
-        read_xml(filename, names="Col1, Col2, Col3", parser=parser)
+        read_xml(
+            filename, names="Col1, Col2, Col3", parser=parser  # type: ignore[arg-type]
+        )
 
 
 # ENCODING

From 383505fd87490bc5551d58b559f952d39fc5742f Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 26 Nov 2021 16:24:32 +0100
Subject: [PATCH 4/5] Add comment

---
 pandas/io/parsers/base_parser.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index a02054ecda82e..5f99489d0795f 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -565,6 +565,8 @@ def _convert_to_ndarrays(
                 col_na_values, col_na_fvalues = set(), set()
 
             if c in self._parse_date_cols:
+                # GH#26203 Do not convert columns which get converted to dates
+                # but replace nans to ensure to_datetime works
                 mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
                 np.putmask(values, mask, np.nan)
                 result[c] = values

From 4e24a68d394adc8b9a74b668a68850eb25c3f543 Mon Sep 17 00:00:00 2001
From: phofl <patrick_hoefler@gmx.net>
Date: Fri, 26 Nov 2021 16:40:26 +0100
Subject: [PATCH 5/5] Add returns section

---
 pandas/io/parsers/base_parser.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 5f99489d0795f..1b6acfbbe4e4a 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -250,6 +250,11 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> Iterable:
         columns : list
             List of names of the dataframe.
 
+        Returns
+        -------
+        The names of the columns which will get parsed later if a dict or list
+        is given as specification.
+
         Raises
         ------
         ValueError