Skip to content

ENH: add support for na_filter in Python engine #13321

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ Other enhancements
pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30)

- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you combine this with the previous point in one sentence?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought we leave separate issues on separate lines?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not that important! (and I don't know if we have a rule about it)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough. Unless there are more objections about this, I'll leave as is.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we try to put separate issues on separate lines when they were closed by different PRs, but yes, we occasionally do bunch them up if they are repetitive.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So...how about this one?


- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)
Expand Down
12 changes: 9 additions & 3 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,6 @@ def _read(filepath_or_buffer, kwds):
_c_unsupported = set(['skip_footer'])
_python_unsupported = set([
'as_recarray',
'na_filter',
'compact_ints',
'use_unsigned',
'low_memory',
Expand Down Expand Up @@ -1188,8 +1187,13 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
result = {}
for c, values in compat.iteritems(dct):
conv_f = None if converters is None else converters.get(c, None)
col_na_values, col_na_fvalues = _get_na_values(c, na_values,
na_fvalues)

if self.na_filter:
col_na_values, col_na_fvalues = _get_na_values(
c, na_values, na_fvalues)
else:
col_na_values, col_na_fvalues = set(), set()

coerce_type = True
if conv_f is not None:
try:
Expand Down Expand Up @@ -1634,6 +1638,8 @@ def __init__(self, f, **kwds):

self.names_passed = kwds['names'] or None

self.na_filter = kwds['na_filter']

self.has_index_names = False
if 'has_index_names' in kwds:
self.has_index_names = kwds['has_index_names']
Expand Down
6 changes: 0 additions & 6 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,6 @@ def test_delim_whitespace_custom_terminator(self):
columns=['a', 'b', 'c'])
tm.assert_frame_equal(df, expected)

def test_parse_dates_empty_string(self):
# see gh-2263
s = StringIO("Date, test\n2012-01-01, 1\n,2")
result = self.read_csv(s, parse_dates=["Date"], na_filter=False)
self.assertTrue(result['Date'].isnull()[1])

def test_dtype_and_names_error(self):
# see gh-8833: passing both dtype and names
# resulting in an error reporting issue
Expand Down
6 changes: 2 additions & 4 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1319,10 +1319,8 @@ def test_inf_parsing(self):
df = self.read_csv(StringIO(data), index_col=0)
tm.assert_almost_equal(df['A'].values, expected.values)

if self.engine == 'c':
# TODO: remove condition when 'na_filter' is supported for Python
df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
tm.assert_almost_equal(df['A'].values, expected.values)
df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
tm.assert_almost_equal(df['A'].values, expected.values)

def test_raise_on_no_columns(self):
# single newline
Expand Down
18 changes: 18 additions & 0 deletions pandas/io/tests/parser/na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,21 @@ def test_na_values_keep_default(self):
'Three': ['None', 'two', 'None', 'nan', 'five', '',
'seven']})
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

def test_na_values_na_filter_override(self):
data = """\
A,B
1,A
nan,B
3,C
"""

expected = DataFrame([[1, 'A'], [np.nan, np.nan], [3, 'C']],
columns=['A', 'B'])
out = self.read_csv(StringIO(data), na_values=['B'], na_filter=True)
tm.assert_frame_equal(out, expected)

expected = DataFrame([['1', 'A'], ['nan', 'B'], ['3', 'C']],
columns=['A', 'B'])
out = self.read_csv(StringIO(data), na_values=['B'], na_filter=False)
tm.assert_frame_equal(out, expected)
7 changes: 7 additions & 0 deletions pandas/io/tests/parser/parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,3 +467,10 @@ def test_read_with_parse_dates_invalid_type(self):
StringIO(data), parse_dates=np.array([4, 5]))
tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv,
StringIO(data), parse_dates=set([1, 3, 3]))

def test_parse_dates_empty_string(self):
# see gh-2263
data = "Date, test\n2012-01-01, 1\n,2"
result = self.read_csv(StringIO(data), parse_dates=["Date"],
na_filter=False)
self.assertTrue(result['Date'].isnull()[1])