From 57ccd4cb5d0b63da32d13e405ddec1b16607f4b9 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 25 Nov 2021 23:57:03 +0100 Subject: [PATCH 1/5] BUG: read_csv not replacing nans before date conversion --- doc/source/whatsnew/v1.4.0.rst | 2 + pandas/io/parsers/base_parser.py | 16 +++++++- pandas/io/parsers/python_parser.py | 4 +- pandas/tests/io/parser/test_parse_dates.py | 45 ++++++++++++++++++++-- pandas/tests/io/xml/__init__.py | 0 5 files changed, 60 insertions(+), 7 deletions(-) create mode 100644 pandas/tests/io/xml/__init__.py diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 39e3894f86302..7da3599225fb0 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -661,6 +661,8 @@ I/O - Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`) - Bug in :class:`ExcelWriter`, where ``engine_kwargs`` were not passed through to all engines (:issue:`43442`) - Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`) +- Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`) +- Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`) - Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from an nullable integer type (:issue:`44079`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. 
(:issue:`39465`) - Bug in :func:`read_csv` when passing simultaneously a parser in ``date_parser`` and ``parse_dates=False``, the parsing was still called (:issue:`44366`) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 25a89d1c57006..a02054ecda82e 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -139,6 +139,7 @@ def __init__(self, kwds): self.col_names = None self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) + self._parse_date_cols: Iterable = [] self.date_parser = kwds.pop("date_parser", None) self.dayfirst = kwds.pop("dayfirst", False) self.keep_date_col = kwds.pop("keep_date_col", False) @@ -237,7 +238,7 @@ def _open_handles( errors=kwds.get("encoding_errors", "strict"), ) - def _validate_parse_dates_presence(self, columns: list[str]) -> None: + def _validate_parse_dates_presence(self, columns: list[str]) -> Iterable: """ Check if parse_dates are in columns. @@ -271,6 +272,8 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None: else: cols_needed = [] + cols_needed = list(cols_needed) + # get only columns that are references using names (str), not by index missing_cols = ", ".join( sorted( @@ -285,6 +288,11 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None: raise ValueError( f"Missing column provided to 'parse_dates': '{missing_cols}'" ) + # Convert positions to actual column names + return [ + col if (isinstance(col, str) or col in columns) else columns[col] + for col in cols_needed + ] def close(self): if self.handles is not None: @@ -556,6 +564,12 @@ def _convert_to_ndarrays( else: col_na_values, col_na_fvalues = set(), set() + if c in self._parse_date_cols: + mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) + np.putmask(values, mask, np.nan) + result[c] = values + continue + if conv_f is not None: # conv_f applied to data before inference if cast_type is not None: diff --git 
a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index f5420618c0235..7c92dc41ecd1f 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -152,7 +152,7 @@ def __init__( if self._col_indices is None: self._col_indices = list(range(len(self.columns))) - self._validate_parse_dates_presence(self.columns) + self._parse_date_cols = self._validate_parse_dates_presence(self.columns) no_thousands_columns: set[int] | None = None if self.parse_dates: no_thousands_columns = self._set_noconvert_dtype_columns( @@ -277,9 +277,9 @@ def read(self, rows=None): alldata = self._rows_to_cols(content) data, columns = self._exclude_implicit_index(alldata) + data = self._convert_data(data) columns, data = self._do_date_conversions(columns, data) - data = self._convert_data(data) index, columns = self._make_index(data, alldata, columns, indexnamerow) return index, columns, data diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 10d260bad215e..a61f3bc03028c 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -286,8 +286,6 @@ def date_parser(*date_cols): if not keep_date_col: expected = expected.drop(["X1", "X2", "X3"], axis=1) - elif parser.engine == "python": - expected["X1"] = expected["X1"].astype(np.int64) # Python can sometimes be flaky about how # the aggregated columns are entered, so @@ -425,8 +423,6 @@ def test_multiple_date_col(all_parsers, keep_date_col): if not keep_date_col: expected = expected.drop(["X1", "X2", "X3"], axis=1) - elif parser.engine == "python": - expected["X1"] = expected["X1"].astype(np.int64) tm.assert_frame_equal(result, expected) @@ -1907,3 +1903,44 @@ def test_dayfirst_warnings(): index_col="date", ).index tm.assert_index_equal(expected, res8) + + +@skip_pyarrow +def test_infer_first_column_as_index(all_parsers): + # GH#11019 + parser = all_parsers + data = "a,b,c\n1970-01-01,2,3,4" + 
result = parser.read_csv(StringIO(data), parse_dates=["a"]) + expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"]) + tm.assert_frame_equal(result, expected) + + +@skip_pyarrow +def test_replace_nans_before_parsing_dates(all_parsers): + # GH#26203 + parser = all_parsers + data = """Test +2012-10-01 +0 +2015-05-15 +# +2017-09-09 +""" + result = parser.read_csv( + StringIO(data), + na_values={"Test": ["#", "0"]}, + parse_dates=["Test"], + date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d"), + ) + expected = DataFrame( + { + "Test": [ + Timestamp("2012-10-01"), + pd.NaT, + Timestamp("2015-05-15"), + pd.NaT, + Timestamp("2017-09-09"), + ] + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/xml/__init__.py b/pandas/tests/io/xml/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 72d7937fd3f05662418503abb21d03cf343f7c38 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 26 Nov 2021 01:26:38 +0100 Subject: [PATCH 2/5] Remove mypy ignore --- pandas/tests/io/xml/test_xml.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index a99f66336bf22..ecb040b28e4ca 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -681,9 +681,7 @@ def test_names_option_wrong_type(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(TypeError, match=("is not a valid type for names")): - read_xml( - filename, names="Col1, Col2, Col3", parser=parser # type: ignore[arg-type] - ) + read_xml(filename, names="Col1, Col2, Col3", parser=parser) # ENCODING From 965aec8c5bcdec76a03b7edcd8af3ee30b7147e9 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 26 Nov 2021 16:09:36 +0100 Subject: [PATCH 3/5] Remove init file --- pandas/tests/io/xml/__init__.py | 0 pandas/tests/io/xml/test_xml.py | 4 +++- 2 files changed, 3 insertions(+), 1 deletion(-) delete mode 100644 
pandas/tests/io/xml/__init__.py diff --git a/pandas/tests/io/xml/__init__.py b/pandas/tests/io/xml/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index ecb040b28e4ca..a99f66336bf22 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -681,7 +681,9 @@ def test_names_option_wrong_type(datapath, parser): filename = datapath("io", "data", "xml", "books.xml") with pytest.raises(TypeError, match=("is not a valid type for names")): - read_xml(filename, names="Col1, Col2, Col3", parser=parser) + read_xml( + filename, names="Col1, Col2, Col3", parser=parser # type: ignore[arg-type] + ) # ENCODING From 383505fd87490bc5551d58b559f952d39fc5742f Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 26 Nov 2021 16:24:32 +0100 Subject: [PATCH 4/5] Add comment --- pandas/io/parsers/base_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index a02054ecda82e..5f99489d0795f 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -565,6 +565,8 @@ def _convert_to_ndarrays( col_na_values, col_na_fvalues = set(), set() if c in self._parse_date_cols: + # GH#26203 Do not convert columns which get converted to dates + # but replace nans to ensure to_datetime works mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) np.putmask(values, mask, np.nan) result[c] = values From 4e24a68d394adc8b9a74b668a68850eb25c3f543 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 26 Nov 2021 16:40:26 +0100 Subject: [PATCH 5/5] Add returns section --- pandas/io/parsers/base_parser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 5f99489d0795f..1b6acfbbe4e4a 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -250,6 +250,11 @@ def 
_validate_parse_dates_presence(self, columns: list[str]) -> Iterable: columns : list List of names of the dataframe. + Returns + ------- + The names of the columns which will get parsed later if a dict or list + is given as specification. + Raises ------ ValueError