diff --git a/doc/source/io.rst b/doc/source/io.rst index 2f29e390c0ba1..ae04996b4fddf 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -214,8 +214,20 @@ na_values : scalar, str, list-like, or dict, default ``None`` for a list of the values interpreted as NaN by default. keep_default_na : boolean, default ``True`` - If na_values are specified and keep_default_na is ``False`` the default NaN - values are overridden, otherwise they're appended to. + Whether or not to include the default NaN values when parsing the data. + Depending on whether `na_values` is passed in, the behavior is as follows: + + * If `keep_default_na` is True, and `na_values` are specified, `na_values` + is appended to the default NaN values used for parsing. + * If `keep_default_na` is True, and `na_values` are not specified, only + the default NaN values are used for parsing. + * If `keep_default_na` is False, and `na_values` are specified, only + the NaN values specified in `na_values` are used for parsing. + * If `keep_default_na` is False, and `na_values` are not specified, no + strings will be parsed as NaN. + + Note that if `na_filter` is passed in as False, the `keep_default_na` and + `na_values` parameters will be ignored. na_filter : boolean, default ``True`` Detect missing value markers (empty strings and the value of na_values). In data without any NAs, passing ``na_filter=False`` can improve the performance diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 853d5cee11cd1..326673a54acfa 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -463,6 +463,7 @@ I/O - :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. 
If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) - Bug in :func:`read_msgpack` with a non existent file is passed in Python 2 (:issue:`15296`) - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`) +- Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`) - Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`) - Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`) - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index cf63b5083885e..5efe2147f6f8e 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -288,7 +288,7 @@ cdef class TextReader: object file_handle, na_fvalues object true_values, false_values object handle - bint na_filter, verbose, has_usecols, has_mi_columns + bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns int64_t parser_start list clocks char *c_encoding @@ -352,6 +352,8 @@ cdef class TextReader: na_filter=True, na_values=None, na_fvalues=None, + keep_default_na=True, + true_values=None, false_values=None, allow_leading_cols=True, @@ -378,8 +380,8 @@ cdef class TextReader: self.parser = parser_new() self.parser.chunksize = tokenize_chunksize - self.mangle_dupe_cols=mangle_dupe_cols - self.tupleize_cols=tupleize_cols + self.mangle_dupe_cols = mangle_dupe_cols + self.tupleize_cols = tupleize_cols # For timekeeping self.clocks = [] @@ -477,6 +479,7 @@ cdef class TextReader: self.true_set = kset_from_list(self.true_values) self.false_set = kset_from_list(self.false_values) + 
self.keep_default_na = keep_default_na self.converters = converters self.na_filter = na_filter @@ -1299,7 +1302,10 @@ cdef class TextReader: elif i in self.na_values: key = i else: # No na_values provided for this column. - return _NA_VALUES, set() + if self.keep_default_na: + return _NA_VALUES, set() + + return list(), set() values = self.na_values[key] if values is not None and not isinstance(values, list): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 150fccde81a60..1ba687541eecf 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -149,8 +149,20 @@ NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + """'. keep_default_na : bool, default True - If na_values are specified and keep_default_na is False the default NaN - values are overridden, otherwise they're appended to. + Whether or not to include the default NaN values when parsing the data. + Depending on whether `na_values` is passed in, the behavior is as follows: + + * If `keep_default_na` is True, and `na_values` are specified, `na_values` + is appended to the default NaN values used for parsing. + * If `keep_default_na` is True, and `na_values` are not specified, only + the default NaN values are used for parsing. + * If `keep_default_na` is False, and `na_values` are specified, only + the NaN values specified in `na_values` are used for parsing. + * If `keep_default_na` is False, and `na_values` are not specified, no + strings will be parsed as NaN. + + Note that if `na_filter` is passed in as False, the `keep_default_na` and + `na_values` parameters will be ignored. na_filter : boolean, default True Detect missing value markers (empty strings and the value of na_values). 
In data without any NAs, passing na_filter=False can improve the performance @@ -910,9 +922,6 @@ def _clean_options(self, options, engine): na_values = options['na_values'] skiprows = options['skiprows'] - # really delete this one - keep_default_na = result.pop('keep_default_na') - _validate_header_arg(options['header']) depr_warning = '' @@ -957,6 +966,7 @@ def _clean_options(self, options, engine): converters = {} # Converting values to NA + keep_default_na = options['keep_default_na'] na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) # handle skiprows; this is internally handled by the @@ -1225,6 +1235,7 @@ def __init__(self, kwds): self.na_values = kwds.get('na_values') self.na_fvalues = kwds.get('na_fvalues') self.na_filter = kwds.get('na_filter', False) + self.keep_default_na = kwds.get('keep_default_na', True) self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') @@ -1487,7 +1498,8 @@ def _agg_index(self, index, try_parse_dates=True): col_name = self.index_names[i] if col_name is not None: col_na_values, col_na_fvalues = _get_na_values( - col_name, self.na_values, self.na_fvalues) + col_name, self.na_values, self.na_fvalues, + self.keep_default_na) arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) arrays.append(arr) @@ -1510,7 +1522,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( - c, na_values, na_fvalues) + c, na_values, na_fvalues, self.keep_default_na) else: col_na_values, col_na_fvalues = set(), set() @@ -3097,16 +3109,23 @@ def _clean_na_values(na_values, keep_default_na=True): na_values = set() na_fvalues = set() elif isinstance(na_values, dict): - na_values = na_values.copy() # Prevent aliasing. - if keep_default_na: - for k, v in compat.iteritems(na_values): - if not is_list_like(v): - v = [v] + old_na_values = na_values.copy() + na_values = {} # Prevent aliasing. 
+ + # Convert the values in the na_values dictionary + # into array-likes for further use. This is also + # where we append the default NaN values, provided + # that `keep_default_na=True`. + for k, v in compat.iteritems(old_na_values): + if not is_list_like(v): + v = [v] + + if keep_default_na: v = set(v) | _NA_VALUES - na_values[k] = v - na_fvalues = dict( - (k, _floatify_na_values(v)) for k, v in na_values.items() # noqa - ) + + na_values[k] = v + na_fvalues = dict((k, _floatify_na_values(v)) + for k, v in na_values.items()) else: if not is_list_like(na_values): na_values = [na_values] @@ -3225,12 +3244,38 @@ def _stringify_na_values(na_values): return set(result) -def _get_na_values(col, na_values, na_fvalues): +def _get_na_values(col, na_values, na_fvalues, keep_default_na): + """ + Get the NaN values for a given column. + + Parameters + ---------- + col : str + The name of the column. + na_values : array-like, dict + The object listing the NaN values as strings. + na_fvalues : array-like, dict + The object listing the NaN values as floats. + keep_default_na : bool + If `na_values` is a dict, and the column is not mapped in the + dictionary, whether to return the default NaN values or the empty set. + + Returns + ------- + nan_tuple : A length-two tuple composed of + + 1) na_values : the string NaN values for that column. + 2) na_fvalues : the float NaN values for that column. 
+ """ + if isinstance(na_values, dict): if col in na_values: return na_values[col], na_fvalues[col] else: - return _NA_VALUES, set() + if keep_default_na: + return _NA_VALUES, set() + + return set(), set() else: return na_values, na_fvalues diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index f8906d5a1f7ba..d2c3f82e95c4d 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -224,6 +224,45 @@ def test_na_values_keep_default(self): 'seven']}) tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + def test_no_keep_default_na_dict_na_values(self): + # see gh-19227 + data = "a,b\n,2" + + df = self.read_csv(StringIO(data), na_values={"b": ["2"]}, + keep_default_na=False) + expected = DataFrame({"a": [""], "b": [np.nan]}) + tm.assert_frame_equal(df, expected) + + # Scalar values shouldn't cause the parsing to crash or fail. + data = "a,b\n1,2" + + df = self.read_csv(StringIO(data), na_values={"b": 2}, + keep_default_na=False) + expected = DataFrame({"a": [1], "b": [np.nan]}) + tm.assert_frame_equal(df, expected) + + data = """\ +113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008 +729639,"qwer","",asdfkj,466.681,,252.373 +""" + expected = DataFrame({0: [np.nan, 729639.0], + 1: [np.nan, "qwer"], + 2: ["/blaha", np.nan], + 3: ["kjsdkj", "asdfkj"], + 4: [412.166, 466.681], + 5: ["225.874", ""], + 6: [np.nan, 252.373]}) + + df = self.read_csv(StringIO(data), header=None, keep_default_na=False, + na_values={2: "", 6: "214.008", + 1: "blah", 0: 113125}) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(data), header=None, keep_default_na=False, + na_values={2: "", 6: "214.008", + 1: "blah", 0: "113125"}) + tm.assert_frame_equal(df, expected) + def test_na_values_na_filter_override(self): data = """\ A,B