Skip to content

Commit 186fd34

Browse files
committed
ENH: add support for na_filter in Python engine
1 parent 9e7bfdd commit 186fd34

File tree

6 files changed

+37
-13
lines changed

6 files changed

+37
-13
lines changed

doc/source/whatsnew/v0.18.2.txt

+1
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,7 @@ Other enhancements
7575
pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30)
7676

7777
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
78+
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
7879

7980
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
8081
- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)

pandas/io/parsers.py

+9-3
Original file line number | Diff line number | Diff line change
@@ -425,7 +425,6 @@ def _read(filepath_or_buffer, kwds):
425425
_c_unsupported = set(['skip_footer'])
426426
_python_unsupported = set([
427427
'as_recarray',
428-
'na_filter',
429428
'compact_ints',
430429
'use_unsigned',
431430
'low_memory',
@@ -1188,8 +1187,13 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
11881187
result = {}
11891188
for c, values in compat.iteritems(dct):
11901189
conv_f = None if converters is None else converters.get(c, None)
1191-
col_na_values, col_na_fvalues = _get_na_values(c, na_values,
1192-
na_fvalues)
1190+
1191+
if self.na_filter:
1192+
col_na_values, col_na_fvalues = _get_na_values(
1193+
c, na_values, na_fvalues)
1194+
else:
1195+
col_na_values, col_na_fvalues = set(), set()
1196+
11931197
coerce_type = True
11941198
if conv_f is not None:
11951199
try:
@@ -1634,6 +1638,8 @@ def __init__(self, f, **kwds):
16341638

16351639
self.names_passed = kwds['names'] or None
16361640

1641+
self.na_filter = kwds['na_filter']
1642+
16371643
self.has_index_names = False
16381644
if 'has_index_names' in kwds:
16391645
self.has_index_names = kwds['has_index_names']

pandas/io/tests/parser/c_parser_only.py

-6
Original file line number | Diff line number | Diff line change
@@ -61,12 +61,6 @@ def test_delim_whitespace_custom_terminator(self):
6161
columns=['a', 'b', 'c'])
6262
tm.assert_frame_equal(df, expected)
6363

64-
def test_parse_dates_empty_string(self):
65-
# see gh-2263
66-
s = StringIO("Date, test\n2012-01-01, 1\n,2")
67-
result = self.read_csv(s, parse_dates=["Date"], na_filter=False)
68-
self.assertTrue(result['Date'].isnull()[1])
69-
7064
def test_dtype_and_names_error(self):
7165
# see gh-8833: passing both dtype and names
7266
# resulting in an error reporting issue

pandas/io/tests/parser/common.py

+2-4
Original file line number | Diff line number | Diff line change
@@ -1319,10 +1319,8 @@ def test_inf_parsing(self):
13191319
df = self.read_csv(StringIO(data), index_col=0)
13201320
tm.assert_almost_equal(df['A'].values, expected.values)
13211321

1322-
if self.engine == 'c':
1323-
# TODO: remove condition when 'na_filter' is supported for Python
1324-
df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
1325-
tm.assert_almost_equal(df['A'].values, expected.values)
1322+
df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
1323+
tm.assert_almost_equal(df['A'].values, expected.values)
13261324

13271325
def test_raise_on_no_columns(self):
13281326
# single newline

pandas/io/tests/parser/na_values.py

+18
Original file line number | Diff line number | Diff line change
@@ -223,3 +223,21 @@ def test_na_values_keep_default(self):
223223
'Three': ['None', 'two', 'None', 'nan', 'five', '',
224224
'seven']})
225225
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
226+
227+
def test_na_values_na_filter_override(self):
228+
data = """\
229+
A,B
230+
1,A
231+
nan,B
232+
3,C
233+
"""
234+
235+
expected = DataFrame([[1, 'A'], [np.nan, np.nan], [3, 'C']],
236+
columns=['A', 'B'])
237+
out = self.read_csv(StringIO(data), na_values=['B'], na_filter=True)
238+
tm.assert_frame_equal(out, expected)
239+
240+
expected = DataFrame([['1', 'A'], ['nan', 'B'], ['3', 'C']],
241+
columns=['A', 'B'])
242+
out = self.read_csv(StringIO(data), na_values=['B'], na_filter=False)
243+
tm.assert_frame_equal(out, expected)

pandas/io/tests/parser/parse_dates.py

+7
Original file line number | Diff line number | Diff line change
@@ -467,3 +467,10 @@ def test_read_with_parse_dates_invalid_type(self):
467467
StringIO(data), parse_dates=np.array([4, 5]))
468468
tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv,
469469
StringIO(data), parse_dates=set([1, 3, 3]))
470+
471+
def test_parse_dates_empty_string(self):
472+
# see gh-2263
473+
data = "Date, test\n2012-01-01, 1\n,2"
474+
result = self.read_csv(StringIO(data), parse_dates=["Date"],
475+
na_filter=False)
476+
self.assertTrue(result['Date'].isnull()[1])

0 commit comments

Comments
 (0)