Skip to content

Commit 9d55b50

Browse files
authored
BUG: read_csv not replacing nans before date conversion (#44620)
* BUG: read_csv not replacing nans before date conversion * Remove mypy ignore * Remove init file * Add comment * Add returns section
1 parent a9a37d6 commit 9d55b50

File tree

4 files changed

+67
-7
lines changed

4 files changed

+67
-7
lines changed

doc/source/whatsnew/v1.4.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -663,6 +663,8 @@ I/O
663663
- Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`)
664664
- Bug in :class:`ExcelWriter`, where ``engine_kwargs`` were not passed through to all engines (:issue:`43442`)
665665
- Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`)
666+
- Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`)
667+
- Bug in :func:`read_csv` not replacing missing-value markers (e.g. those given in ``na_values``) with ``np.nan`` before attempting date conversion (:issue:`26203`)
666668
- Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from a nullable integer type (:issue:`44079`)
667669
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
668670
- Bug in :func:`read_csv` where the parser passed in ``date_parser`` was still called even when ``parse_dates=False`` was passed at the same time (:issue:`44366`)

pandas/io/parsers/base_parser.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ def __init__(self, kwds):
139139
self.col_names = None
140140

141141
self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
142+
self._parse_date_cols: Iterable = []
142143
self.date_parser = kwds.pop("date_parser", None)
143144
self.dayfirst = kwds.pop("dayfirst", False)
144145
self.keep_date_col = kwds.pop("keep_date_col", False)
@@ -237,7 +238,7 @@ def _open_handles(
237238
errors=kwds.get("encoding_errors", "strict"),
238239
)
239240

240-
def _validate_parse_dates_presence(self, columns: list[str]) -> None:
241+
def _validate_parse_dates_presence(self, columns: list[str]) -> Iterable:
241242
"""
242243
Check if parse_dates are in columns.
243244
@@ -249,6 +250,11 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
249250
columns : list
250251
List of names of the dataframe.
251252
253+
Returns
254+
-------
255+
The names of the columns which will get parsed later if a dict or list
256+
is given as specification.
257+
252258
Raises
253259
------
254260
ValueError
@@ -271,6 +277,8 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
271277
else:
272278
cols_needed = []
273279

280+
cols_needed = list(cols_needed)
281+
274282
# get only columns that are references using names (str), not by index
275283
missing_cols = ", ".join(
276284
sorted(
@@ -285,6 +293,11 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
285293
raise ValueError(
286294
f"Missing column provided to 'parse_dates': '{missing_cols}'"
287295
)
296+
# Convert positions to actual column names
297+
return [
298+
col if (isinstance(col, str) or col in columns) else columns[col]
299+
for col in cols_needed
300+
]
288301

289302
def close(self):
290303
if self.handles is not None:
@@ -556,6 +569,14 @@ def _convert_to_ndarrays(
556569
else:
557570
col_na_values, col_na_fvalues = set(), set()
558571

572+
if c in self._parse_date_cols:
573+
# GH#26203 Do not convert columns which get converted to dates
574+
# but replace nans to ensure to_datetime works
575+
mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
576+
np.putmask(values, mask, np.nan)
577+
result[c] = values
578+
continue
579+
559580
if conv_f is not None:
560581
# conv_f applied to data before inference
561582
if cast_type is not None:

pandas/io/parsers/python_parser.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def __init__(
152152
if self._col_indices is None:
153153
self._col_indices = list(range(len(self.columns)))
154154

155-
self._validate_parse_dates_presence(self.columns)
155+
self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
156156
no_thousands_columns: set[int] | None = None
157157
if self.parse_dates:
158158
no_thousands_columns = self._set_noconvert_dtype_columns(
@@ -277,9 +277,9 @@ def read(self, rows=None):
277277
alldata = self._rows_to_cols(content)
278278
data, columns = self._exclude_implicit_index(alldata)
279279

280+
data = self._convert_data(data)
280281
columns, data = self._do_date_conversions(columns, data)
281282

282-
data = self._convert_data(data)
283283
index, columns = self._make_index(data, alldata, columns, indexnamerow)
284284

285285
return index, columns, data

pandas/tests/io/parser/test_parse_dates.py

+41-4
Original file line numberDiff line numberDiff line change
@@ -286,8 +286,6 @@ def date_parser(*date_cols):
286286

287287
if not keep_date_col:
288288
expected = expected.drop(["X1", "X2", "X3"], axis=1)
289-
elif parser.engine == "python":
290-
expected["X1"] = expected["X1"].astype(np.int64)
291289

292290
# Python can sometimes be flaky about how
293291
# the aggregated columns are entered, so
@@ -425,8 +423,6 @@ def test_multiple_date_col(all_parsers, keep_date_col):
425423

426424
if not keep_date_col:
427425
expected = expected.drop(["X1", "X2", "X3"], axis=1)
428-
elif parser.engine == "python":
429-
expected["X1"] = expected["X1"].astype(np.int64)
430426

431427
tm.assert_frame_equal(result, expected)
432428

@@ -1907,3 +1903,44 @@ def test_dayfirst_warnings():
19071903
index_col="date",
19081904
).index
19091905
tm.assert_index_equal(expected, res8)
1906+
1907+
1908+
@skip_pyarrow
1909+
def test_infer_first_column_as_index(all_parsers):
1910+
# GH#11019
1911+
parser = all_parsers
1912+
data = "a,b,c\n1970-01-01,2,3,4"
1913+
result = parser.read_csv(StringIO(data), parse_dates=["a"])
1914+
expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"])
1915+
tm.assert_frame_equal(result, expected)
1916+
1917+
1918+
@skip_pyarrow
1919+
def test_replace_nans_before_parsing_dates(all_parsers):
1920+
# GH#26203
1921+
parser = all_parsers
1922+
data = """Test
1923+
2012-10-01
1924+
0
1925+
2015-05-15
1926+
#
1927+
2017-09-09
1928+
"""
1929+
result = parser.read_csv(
1930+
StringIO(data),
1931+
na_values={"Test": ["#", "0"]},
1932+
parse_dates=["Test"],
1933+
date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d"),
1934+
)
1935+
expected = DataFrame(
1936+
{
1937+
"Test": [
1938+
Timestamp("2012-10-01"),
1939+
pd.NaT,
1940+
Timestamp("2015-05-15"),
1941+
pd.NaT,
1942+
Timestamp("2017-09-09"),
1943+
]
1944+
}
1945+
)
1946+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)