BUG: read_csv not replacing nans before date conversion #44620

Merged 5 commits on Nov 26, 2021
Changes from 3 commits
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -661,6 +661,8 @@ I/O
- Bug in dumping/loading a :class:`DataFrame` with ``yaml.dump(frame)`` (:issue:`42748`)
- Bug in :class:`ExcelWriter`, where ``engine_kwargs`` were not passed through to all engines (:issue:`43442`)
- Bug in :func:`read_csv` raising ``ValueError`` when ``parse_dates`` was used with ``MultiIndex`` columns (:issue:`8991`)
- Bug in :func:`read_csv` converting columns to numeric after date parsing failed (:issue:`11019`)
- Bug in :func:`read_csv` not replacing ``NaN`` values with ``np.nan`` before attempting date conversion (:issue:`26203`)
- Bug in :func:`read_csv` raising ``AttributeError`` when attempting to read a .csv file and infer index column dtype from a nullable integer type (:issue:`44079`)
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` with ``compression`` set to ``'zip'`` no longer create a zip file containing a file ending with ".zip". Instead, they try to infer the inner file name more smartly. (:issue:`39465`)
- Bug in :func:`read_csv` where passing a parser in ``date_parser`` together with ``parse_dates=False`` still triggered date parsing (:issue:`44366`)
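As a quick illustration of the two new entries, here is a minimal usage sketch mirroring the tests added at the end of this PR (the data strings are copied from those tests, and the commented results follow the test expectations):

import pandas as pd
from io import StringIO

# GH 11019: a parse_dates column that cannot be parsed as dates is left as
# strings instead of being converted to numeric afterwards (the extra leading
# value makes the first column the index here).
data = "a,b,c\n1970-01-01,2,3,4"
result = pd.read_csv(StringIO(data), parse_dates=["a"])
# result["a"] holds the string "2" (object dtype); the index is ["1970-01-01"]

# GH 26203: values listed in na_values are replaced with NaN before the
# date_parser runs, so the parser never sees markers such as "#" or "0".
data = "Test\n2012-10-01\n0\n2015-05-15\n#\n2017-09-09\n"
result = pd.read_csv(
    StringIO(data),
    na_values={"Test": ["#", "0"]},
    parse_dates=["Test"],
    date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d"),
)
# result["Test"] -> [2012-10-01, NaT, 2015-05-15, NaT, 2017-09-09]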
16 changes: 15 additions & 1 deletion pandas/io/parsers/base_parser.py
@@ -139,6 +139,7 @@ def __init__(self, kwds):
self.col_names = None

self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
self._parse_date_cols: Iterable = []
self.date_parser = kwds.pop("date_parser", None)
self.dayfirst = kwds.pop("dayfirst", False)
self.keep_date_col = kwds.pop("keep_date_col", False)
@@ -237,7 +238,7 @@ def _open_handles(
errors=kwds.get("encoding_errors", "strict"),
)

def _validate_parse_dates_presence(self, columns: list[str]) -> None:
def _validate_parse_dates_presence(self, columns: list[str]) -> Iterable:
"""
Check if parse_dates are in columns.

@@ -271,6 +272,8 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
else:
cols_needed = []

cols_needed = list(cols_needed)

# get only columns that are referenced by name (str), not by index
missing_cols = ", ".join(
sorted(
@@ -285,6 +288,11 @@ def _validate_parse_dates_presence(self, columns: list[str]) -> None:
raise ValueError(
f"Missing column provided to 'parse_dates': '{missing_cols}'"
)
# Convert positions to actual column names
return [
col if (isinstance(col, str) or col in columns) else columns[col]
for col in cols_needed
]

def close(self):
if self.handles is not None:
@@ -556,6 +564,12 @@ def _convert_to_ndarrays(
else:
col_na_values, col_na_fvalues = set(), set()

if c in self._parse_date_cols:
mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
np.putmask(values, mask, np.nan)
result[c] = values
continue

if conv_f is not None:
# conv_f applied to data before inference
if cast_type is not None:
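Taken together, the additions in this file amount to two small ideas. A standalone sketch of both, with made-up helper names (map_parse_dates_to_names, mask_na_markers) standing in for the parser internals, which use algorithms.isin and the parser's own attributes:

import numpy as np

def map_parse_dates_to_names(cols_needed, columns):
    # parse_dates entries given as positions are translated to column names,
    # so later steps can check membership by name (mirrors the new return
    # value of _validate_parse_dates_presence)
    return [
        col if (isinstance(col, str) or col in columns) else columns[col]
        for col in cols_needed
    ]

def mask_na_markers(values, na_markers):
    # for a parse_dates column, configured NA markers are replaced with NaN
    # up front and the column skips the usual numeric conversion, so the
    # date parser receives NaN instead of strings like "#" or "0"
    values = np.asarray(values, dtype=object)
    mask = np.array([v in na_markers for v in values], dtype=bool)
    np.putmask(values, mask, np.nan)
    return values

print(map_parse_dates_to_names([0, "x"], ["Test", "x"]))      # ['Test', 'x']
print(mask_na_markers(["2012-10-01", "#", "0"], {"#", "0"}))  # ['2012-10-01' nan nan]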
4 changes: 2 additions & 2 deletions pandas/io/parsers/python_parser.py
@@ -152,7 +152,7 @@ def __init__(
if self._col_indices is None:
self._col_indices = list(range(len(self.columns)))

self._validate_parse_dates_presence(self.columns)
self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
no_thousands_columns: set[int] | None = None
if self.parse_dates:
no_thousands_columns = self._set_noconvert_dtype_columns(
@@ -277,9 +277,9 @@ def read(self, rows=None):
alldata = self._rows_to_cols(content)
data, columns = self._exclude_implicit_index(alldata)

data = self._convert_data(data)
columns, data = self._do_date_conversions(columns, data)

data = self._convert_data(data)
index, columns = self._make_index(data, alldata, columns, indexnamerow)

return index, columns, data
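The key change in this hunk is that the call to _convert_data now runs after _do_date_conversions: date columns are parsed first from the raw string data, and only then does the generic type conversion run, so a column whose date parsing fails is no longer coerced to numeric afterwards (GH 11019). A rough, self-contained sketch of the new ordering, with simplified stand-ins for the two parser methods:

import pandas as pd

def do_date_conversions(data, parse_dates):
    # stand-in for _do_date_conversions: try to parse the configured columns,
    # leaving them untouched when parsing fails
    for col in parse_dates:
        try:
            data[col] = pd.to_datetime(data[col], format="%Y-%m-%d")
        except (ValueError, TypeError):
            pass
    return data

def convert_data(data, parse_dates):
    # stand-in for _convert_data: type inference for the non-date columns only
    for col in data:
        if col not in parse_dates:
            data[col] = pd.to_numeric(data[col])
    return data

data = {"a": ["1970-01-01", "5"], "b": ["2", "3"]}
data = do_date_conversions(data, parse_dates=["a"])   # dates first
data = convert_data(data, parse_dates=["a"])          # then the rest
# "a" stays as strings because parsing "5" failed; "b" becomes numeric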
45 changes: 41 additions & 4 deletions pandas/tests/io/parser/test_parse_dates.py
@@ -286,8 +286,6 @@ def date_parser(*date_cols):

if not keep_date_col:
expected = expected.drop(["X1", "X2", "X3"], axis=1)
elif parser.engine == "python":
expected["X1"] = expected["X1"].astype(np.int64)

# Python can sometimes be flaky about how
# the aggregated columns are entered, so
@@ -425,8 +423,6 @@ def test_multiple_date_col(all_parsers, keep_date_col):

if not keep_date_col:
expected = expected.drop(["X1", "X2", "X3"], axis=1)
elif parser.engine == "python":
expected["X1"] = expected["X1"].astype(np.int64)

tm.assert_frame_equal(result, expected)

@@ -1907,3 +1903,44 @@ def test_dayfirst_warnings():
index_col="date",
).index
tm.assert_index_equal(expected, res8)


@skip_pyarrow
def test_infer_first_column_as_index(all_parsers):
# GH#11019
parser = all_parsers
data = "a,b,c\n1970-01-01,2,3,4"
result = parser.read_csv(StringIO(data), parse_dates=["a"])
expected = DataFrame({"a": "2", "b": 3, "c": 4}, index=["1970-01-01"])
tm.assert_frame_equal(result, expected)


@skip_pyarrow
def test_replace_nans_before_parsing_dates(all_parsers):
# GH#26203
parser = all_parsers
data = """Test
2012-10-01
0
2015-05-15
#
2017-09-09
"""
result = parser.read_csv(
StringIO(data),
na_values={"Test": ["#", "0"]},
parse_dates=["Test"],
date_parser=lambda x: pd.to_datetime(x, format="%Y-%m-%d"),
)
expected = DataFrame(
{
"Test": [
Timestamp("2012-10-01"),
pd.NaT,
Timestamp("2015-05-15"),
pd.NaT,
Timestamp("2017-09-09"),
]
}
)
tm.assert_frame_equal(result, expected)