Skip to content

Commit d2c86a9

Browse files
gfyoung authored and
TomAugspurger committed
BUG: Don't parse NA-values in index when requested (#18127)
Closes gh-5239. (cherry picked from commit c176a3c)
1 parent 9a30b86 commit d2c86a9

File tree

3 files changed

+27
-5
lines changed

3 files changed

+27
-5
lines changed

doc/source/whatsnew/v0.21.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ I/O
7777

7878
- Bug in :class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects.
7979
- Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
80+
- Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`)
8081
- Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`)
8182

8283

pandas/io/parsers.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -1231,6 +1231,8 @@ def __init__(self, kwds):
12311231

12321232
self.na_values = kwds.get('na_values')
12331233
self.na_fvalues = kwds.get('na_fvalues')
1234+
self.na_filter = kwds.get('na_filter', False)
1235+
12341236
self.true_values = kwds.get('true_values')
12351237
self.false_values = kwds.get('false_values')
12361238
self.as_recarray = kwds.get('as_recarray', False)
@@ -1404,7 +1406,6 @@ def _make_index(self, data, alldata, columns, indexnamerow=False):
14041406
elif not self._has_complex_date_col:
14051407
index = self._get_simple_index(alldata, columns)
14061408
index = self._agg_index(index)
1407-
14081409
elif self._has_complex_date_col:
14091410
if not self._name_processed:
14101411
(self.index_names, _,
@@ -1487,8 +1488,12 @@ def _agg_index(self, index, try_parse_dates=True):
14871488
if (try_parse_dates and self._should_parse_dates(i)):
14881489
arr = self._date_conv(arr)
14891490

1490-
col_na_values = self.na_values
1491-
col_na_fvalues = self.na_fvalues
1491+
if self.na_filter:
1492+
col_na_values = self.na_values
1493+
col_na_fvalues = self.na_fvalues
1494+
else:
1495+
col_na_values = set()
1496+
col_na_fvalues = set()
14921497

14931498
if isinstance(self.na_values, dict):
14941499
col_name = self.index_names[i]
@@ -2043,8 +2048,6 @@ def __init__(self, f, **kwds):
20432048

20442049
self.names_passed = kwds['names'] or None
20452050

2046-
self.na_filter = kwds['na_filter']
2047-
20482051
self.has_index_names = False
20492052
if 'has_index_names' in kwds:
20502053
self.has_index_names = kwds['has_index_names']

pandas/tests/io/parser/na_values.py

+18
Original file line numberDiff line numberDiff line change
@@ -312,3 +312,21 @@ def test_empty_na_values_no_default_with_index(self):
312312
out = self.read_csv(StringIO(data), keep_default_na=False, index_col=0)
313313

314314
tm.assert_frame_equal(out, expected)
315+
316+
def test_no_na_filter_on_index(self):
317+
# see gh-5239
318+
data = "a,b,c\n1,,3\n4,5,6"
319+
320+
# Don't parse NA-values in index when na_filter=False.
321+
out = self.read_csv(StringIO(data), index_col=[1], na_filter=False)
322+
323+
expected = DataFrame({"a": [1, 4], "c": [3, 6]},
324+
index=Index(["", "5"], name="b"))
325+
tm.assert_frame_equal(out, expected)
326+
327+
# Parse NA-values in index when na_filter=True.
328+
out = self.read_csv(StringIO(data), index_col=[1], na_filter=True)
329+
330+
expected = DataFrame({"a": [1, 4], "c": [3, 6]},
331+
index=Index([np.nan, 5.0], name="b"))
332+
tm.assert_frame_equal(out, expected)

0 commit comments

Comments
 (0)