diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 088168fd8a008..6044f25ca5147 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -79,6 +79,7 @@ I/O - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`) +- Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`) - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) Plotting diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 438192b4b0a59..ae79d70d4cf0a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1249,6 +1249,8 @@ def __init__(self, kwds): self.na_values = kwds.get('na_values') self.na_fvalues = kwds.get('na_fvalues') + self.na_filter = kwds.get('na_filter', False) + self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') self.as_recarray = kwds.get('as_recarray', False) @@ -1424,7 +1426,6 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): elif not self._has_complex_date_col: index = self._get_simple_index(alldata, columns) index = self._agg_index(index) - elif self._has_complex_date_col: if not self._name_processed: (self.index_names, _, @@ -1504,8 +1505,12 @@ def _agg_index(self, index, try_parse_dates=True): if try_parse_dates and self._should_parse_dates(i): arr = self._date_conv(arr) - col_na_values = self.na_values - col_na_fvalues = self.na_fvalues + if self.na_filter: + col_na_values = self.na_values + col_na_fvalues = self.na_fvalues + else: + col_na_values = set() + col_na_fvalues = set() if isinstance(self.na_values, dict): col_name = self.index_names[i] @@ -2060,8 +2065,6 @@ def __init__(self, f, **kwds): self.names_passed = kwds['names'] or None - self.na_filter = kwds['na_filter'] - self.has_index_names = False if 'has_index_names' in kwds: self.has_index_names = kwds['has_index_names'] diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index 7fbf174e19eee..8dc599b42ddc7 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -312,3 +312,21 @@ def test_empty_na_values_no_default_with_index(self): out = self.read_csv(StringIO(data), keep_default_na=False, index_col=0) tm.assert_frame_equal(out, expected) + + def test_no_na_filter_on_index(self): + # see gh-5239 + data = "a,b,c\n1,,3\n4,5,6" + + # Don't parse NA-values in index when na_filter=False. + out = self.read_csv(StringIO(data), index_col=[1], na_filter=False) + + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, + index=Index(["", "5"], name="b")) + tm.assert_frame_equal(out, expected) + + # Parse NA-values in index when na_filter=True. + out = self.read_csv(StringIO(data), index_col=[1], na_filter=True) + + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, + index=Index([np.nan, 5.0], name="b")) + tm.assert_frame_equal(out, expected)