Patch summary (commentary only; patch tools skip text before the first
"diff --git" header):

  * Removes 'na_filter' from _python_unsupported in pandas/io/parsers.py and
    records the option in the v0.18.2 whatsnew (issue 13321).
  * The parser __init__ shown in the third parsers.py hunk stores
    kwds['na_filter'] on self; _convert_to_ndarrays then uses empty NA sets
    for every column when self.na_filter is false instead of calling
    _get_na_values.
  * Tests: test_parse_dates_empty_string moves from c_parser_only.py to
    parse_dates.py, the engine == 'c' guard in test_inf_parsing is dropped,
    and test_na_values_na_filter_override is added.

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 2b67aca1dcf74..be38adb96e403 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -75,6 +75,7 @@ Other enhancements
      pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30)
 
 - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
+- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
 - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
 - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index bf4083f61155c..394fe1a98880a 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -425,7 +425,6 @@ def _read(filepath_or_buffer, kwds):
 _c_unsupported = set(['skip_footer'])
 _python_unsupported = set([
     'as_recarray',
-    'na_filter',
     'compact_ints',
     'use_unsigned',
     'low_memory',
@@ -1188,8 +1187,13 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
         result = {}
         for c, values in compat.iteritems(dct):
             conv_f = None if converters is None else converters.get(c, None)
-            col_na_values, col_na_fvalues = _get_na_values(c, na_values,
-                                                           na_fvalues)
+
+            if self.na_filter:
+                col_na_values, col_na_fvalues = _get_na_values(
+                    c, na_values, na_fvalues)
+            else:
+                col_na_values, col_na_fvalues = set(), set()
+
             coerce_type = True
             if conv_f is not None:
                 try:
@@ -1634,6 +1638,8 @@ def __init__(self, f, **kwds):
 
         self.names_passed = kwds['names'] or None
 
+        self.na_filter = kwds['na_filter']
+
         self.has_index_names = False
         if 'has_index_names' in kwds:
             self.has_index_names = kwds['has_index_names']
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 9dde669c9d39d..00c4e0a1c022b 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -61,12 +61,6 @@ def test_delim_whitespace_custom_terminator(self):
                              columns=['a', 'b', 'c'])
         tm.assert_frame_equal(df, expected)
 
-    def test_parse_dates_empty_string(self):
-        # see gh-2263
-        s = StringIO("Date, test\n2012-01-01, 1\n,2")
-        result = self.read_csv(s, parse_dates=["Date"], na_filter=False)
-        self.assertTrue(result['Date'].isnull()[1])
-
     def test_dtype_and_names_error(self):
         # see gh-8833: passing both dtype and names
         # resulting in an error reporting issue
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 2e3c102948cfa..44892dc17c47b 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -1319,10 +1319,8 @@ def test_inf_parsing(self):
         df = self.read_csv(StringIO(data), index_col=0)
         tm.assert_almost_equal(df['A'].values, expected.values)
 
-        if self.engine == 'c':
-            # TODO: remove condition when 'na_filter' is supported for Python
-            df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
-            tm.assert_almost_equal(df['A'].values, expected.values)
+        df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
+        tm.assert_almost_equal(df['A'].values, expected.values)
 
     def test_raise_on_no_columns(self):
         # single newline
diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py
index 4705fd08af2b4..d826ae536c6cc 100644
--- a/pandas/io/tests/parser/na_values.py
+++ b/pandas/io/tests/parser/na_values.py
@@ -223,3 +223,21 @@ def test_na_values_keep_default(self):
                         'Three': ['None', 'two', 'None', 'nan', 'five', '',
                                   'seven']})
         tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
+
+    def test_na_values_na_filter_override(self):
+        data = """\
+A,B
+1,A
+nan,B
+3,C
+"""
+
+        expected = DataFrame([[1, 'A'], [np.nan, np.nan], [3, 'C']],
+                             columns=['A', 'B'])
+        out = self.read_csv(StringIO(data), na_values=['B'], na_filter=True)
+        tm.assert_frame_equal(out, expected)
+
+        expected = DataFrame([['1', 'A'], ['nan', 'B'], ['3', 'C']],
+                             columns=['A', 'B'])
+        out = self.read_csv(StringIO(data), na_values=['B'], na_filter=False)
+        tm.assert_frame_equal(out, expected)
diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/io/tests/parser/parse_dates.py
index ec368bb358ad5..01816bde66120 100644
--- a/pandas/io/tests/parser/parse_dates.py
+++ b/pandas/io/tests/parser/parse_dates.py
@@ -467,3 +467,10 @@ def test_read_with_parse_dates_invalid_type(self):
                               StringIO(data), parse_dates=np.array([4, 5]))
         tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv,
                               StringIO(data), parse_dates=set([1, 3, 3]))
+
+    def test_parse_dates_empty_string(self):
+        # see gh-2263
+        data = "Date, test\n2012-01-01, 1\n,2"
+        result = self.read_csv(StringIO(data), parse_dates=["Date"],
+                               na_filter=False)
+        self.assertTrue(result['Date'].isnull()[1])