BUG: read_csv not replacing nans before date conversion (#44620)

phofl · web-flow · commit 9d55b50cc1e8 · 2021-11-26T18:02:01.000+01:00
* BUG: read_csv not replacing nans before date convrsion

* Remove mypy ignore

* Remove init file

* Add comment

* Add returns section
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -663,6 +663,8 @@ I/O
 - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`)
 - Bug in :class:`ExcelWriter`, where ``engine_kwargs`` were not passed through to all engines (:issue:`43442`)
 - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`)
+- Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`)
+- Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`)
 - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`)
 - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
 - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -139,6 +139,7 @@ def __init__(self, kwds):
         self.col_names = None
 
         self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
+        self._parse_date_cols: Iterable = []
         self.date_parser = kwds.pop("date_parser", None)
         self.dayfirst = kwds.pop("dayfirst", False)
         self.keep_date_col = kwds.pop("keep_date_col", False)
@@ -237,7 +238,7 @@ def _open_handles(
             errors=kwds.get("encoding_errors", "strict"),
         )
 
-    def _validate_parse_dates_presence(self, columns: list[str]) -> None:
+    def _validate_parse_dates_presence(self, columns: list[str]) -> Iterable:
         """
         Check if parse_dates are in columns.
 
@@ -249,6 +250,11 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
         columns : list
             List of names of the dataframe.
 
+        Returns
+        -------
+        The names of the columns which will get parsed later if a dict or list
+        is given as specification.
+
         Raises
         ------
         ValueError
@@ -271,6 +277,8 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
         else:
             cols_needed = []
 
+        cols_needed = list(cols_needed)
+
         # get only columns that are references using names (str), not by index
         missing_cols = ", ".join(
             sorted(
@@ -285,6 +293,11 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
             raise ValueError(
                 f"Missing column provided to 'parse_dates': '{missing_cols}'"
             )
+        # Convert positions to actual column names
+        return [
+            col if (isinstance(col, str) or col in columns) else columns[col]
+            for col in cols_needed
+        ]
 
     def close(self):
         if self.handles is not None:
@@ -556,6 +569,14 @@ def _convert_to_ndarrays(
             else:
                 col_na_values, col_na_fvalues = set(), set()
 
+            if c in self._parse_date_cols:
+                # GH#26203 Do not convert columns which get converted to dates
+                # but replace nans to ensure to_datetime works
+                mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
+                np.putmask(values, mask, np.nan)
+                result[c] = values
+                continue
+
             if conv_f is not None:
                 # conv_f applied to data before inference
                 if cast_type is not None:
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -152,7 +152,7 @@ def __init__(
         if self._col_indices is None:
             self._col_indices = list(range(len(self.columns)))
 
-        self._validate_parse_dates_presence(self.columns)
+        self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
         no_thousands_columns: set[int] | None = None
         if self.parse_dates:
             no_thousands_columns = self._set_noconvert_dtype_columns(
@@ -277,9 +277,9 @@ def read(self, rows=None):
         alldata = self._rows_to_cols(content)
         data, columns = self._exclude_implicit_index(alldata)
 
+        data = self._convert_data(data)
         columns, data = self._do_date_conversions(columns, data)
 
-        data = self._convert_data(data)
         index, columns = self._make_index(data, alldata, columns, indexnamerow)
 
         return index, columns, data
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
@@ -286,8 +286,6 @@ def date_parser(*date_cols):
 
     if not keep_date_col:
         expected = expected.drop(["X1", "X2", "X3"], axis=1)
-    elif parser.engine == "python":
-        expected["X1"] = expected["X1"].astype(np.int64)
 
     # Python can sometimes be flaky about how
     # the aggregated columns are entered, so
@@ -425,8 +423,6 @@ def test_multiple_date_col(all_parsers, keep_date_col):
 
     if not keep_date_col:
         expected = expected.drop(["X1", "X2", "X3"], axis=1)
-    elif parser.engine == "python":
-        expected["X1"] = expected["X1"].astype(np.int64)
 
     tm.assert_frame_equal(result, expected)
 
@@ -1907,3 +1903,44 @@ def test_dayfirst_warnings():
             index_col="date",
         ).index
     tm.assert_index_equal(expected, res8)
+
+
+@skip_pyarrow
+def test_infer_first_column_as_index(all_parsers):
+    # GH#11019
+    parser = all_parsers
+    data = "a,b,c\n1970-01-01,2,3,4"
+    result = parser.read_csv(StringIO(data), parse_dates=["a"])
+    expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"])
+    tm.assert_frame_equal(result, expected)
+
+
+@skip_pyarrow
+def test_replace_nans_before_parsing_dates(all_parsers):
+    # GH#26203
+    parser = all_parsers
+    data = """Test
+2012-10-01
+0
+2015-05-15
+#
+2017-09-09
+"""
+    result = parser.read_csv(
+        StringIO(data),
+        na_values={"Test": ["#", "0"]},
+        parse_dates=["Test"],
+        date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d"),
+    )
+    expected = DataFrame(
+        {
+            "Test": [
+                Timestamp("2012-10-01"),
+                pd.NaT,
+                Timestamp("2015-05-15"),
+                pd.NaT,
+                Timestamp("2017-09-09"),
+            ]
+        }
+    )
+    tm.assert_frame_equal(result, expected)