BUG: Patch handling of keep_default_na=False #19260

Merged: 1 commit merged on Jan 18, 2018
16 changes: 14 additions & 2 deletions doc/source/io.rst
@@ -214,8 +214,20 @@ na_values : scalar, str, list-like, or dict, default ``None``
for a list of the values interpreted as NaN by default.

keep_default_na : boolean, default ``True``
If na_values are specified and keep_default_na is ``False`` the default NaN
values are overridden, otherwise they're appended to.
Whether or not to include the default NaN values when parsing the data.
Depending on whether `na_values` is passed in, the behavior is as follows:
Contributor:
nice!


* If `keep_default_na` is True, and `na_values` are specified, `na_values`
is appended to the default NaN values used for parsing.
* If `keep_default_na` is True, and `na_values` are not specified, only
the default NaN values are used for parsing.
* If `keep_default_na` is False, and `na_values` are specified, only
the NaN values specified in `na_values` are used for parsing.
* If `keep_default_na` is False, and `na_values` are not specified, no
strings will be parsed as NaN.

Note that if `na_filter` is passed in as False, the `keep_default_na` and
`na_values` parameters will be ignored.
na_filter : boolean, default ``True``
Detect missing value markers (empty strings and the value of na_values). In
data without any NAs, passing ``na_filter=False`` can improve the performance
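The four `keep_default_na` / `na_values` combinations documented above can be seen with a short sketch against the public `read_csv` API. The data and the "foo" marker here are made up for illustration, not taken from the PR:

from io import StringIO

import pandas as pd

data = "a,b\nNA,foo\n,bar"

# keep_default_na=True (default), na_values given: "foo" is added to the defaults.
pd.read_csv(StringIO(data), na_values=["foo"])      # "NA", "" and "foo" -> NaN

# keep_default_na=True, no na_values: only the built-in markers are used.
pd.read_csv(StringIO(data))                          # "NA" and "" -> NaN

# keep_default_na=False, na_values given: only the listed markers are used.
pd.read_csv(StringIO(data), keep_default_na=False,
            na_values=["foo"])                       # only "foo" -> NaN

# keep_default_na=False, no na_values: nothing is parsed as NaN.
pd.read_csv(StringIO(data), keep_default_na=False)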
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
@@ -463,6 +463,7 @@ I/O
- :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`)
- Bug in :func:`read_msgpack` when a non-existent file is passed in Python 2 (:issue:`15296`)
- Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
- Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`)
- Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`)
- Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
- Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`)
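A hedged illustration of the gh-19227 case this whatsnew entry describes, mirroring the new test added later in this PR:

from io import StringIO

import numpy as np
import pandas as pd

data = "a,b\n,2"
df = pd.read_csv(StringIO(data), keep_default_na=False, na_values={"b": ["2"]})
# After the fix: the empty string in "a" is kept as "", while "2" in "b"
# becomes NaN, i.e. df matches pd.DataFrame({"a": [""], "b": [np.nan]}).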
14 changes: 10 additions & 4 deletions pandas/_libs/parsers.pyx
@@ -288,7 +288,7 @@ cdef class TextReader:
object file_handle, na_fvalues
object true_values, false_values
object handle
bint na_filter, verbose, has_usecols, has_mi_columns
bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
int64_t parser_start
list clocks
char *c_encoding
@@ -352,6 +352,8 @@ cdef class TextReader:
na_filter=True,
na_values=None,
na_fvalues=None,
keep_default_na=True,

true_values=None,
false_values=None,
allow_leading_cols=True,
@@ -378,8 +380,8 @@
self.parser = parser_new()
self.parser.chunksize = tokenize_chunksize

self.mangle_dupe_cols=mangle_dupe_cols
self.tupleize_cols=tupleize_cols
self.mangle_dupe_cols = mangle_dupe_cols
self.tupleize_cols = tupleize_cols

# For timekeeping
self.clocks = []
@@ -477,6 +479,7 @@ cdef class TextReader:
self.true_set = kset_from_list(self.true_values)
self.false_set = kset_from_list(self.false_values)

self.keep_default_na = keep_default_na
self.converters = converters
self.na_filter = na_filter

@@ -1299,7 +1302,10 @@ cdef class TextReader:
elif i in self.na_values:
key = i
else: # No na_values provided for this column.
return _NA_VALUES, set()
if self.keep_default_na:
return _NA_VALUES, set()

return list(), set()

values = self.na_values[key]
if values is not None and not isinstance(values, list):
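With `keep_default_na` now threaded through to the Cython `TextReader`, the C and Python engines should agree on how a dict of `na_values` is handled. A hedged sketch (the data mirrors the new test below; the comparison itself is not part of the PR):

from io import StringIO

import pandas as pd

data = "a,b\n,2"
kwargs = dict(keep_default_na=False, na_values={"b": ["2"]})

c_df = pd.read_csv(StringIO(data), engine="c", **kwargs)
py_df = pd.read_csv(StringIO(data), engine="python", **kwargs)
# Both frames should keep "" in column "a" and parse "2" in column "b" as NaN.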
81 changes: 63 additions & 18 deletions pandas/io/parsers.py
@@ -149,8 +149,20 @@
NaN: '""" + fill("', '".join(sorted(_NA_VALUES)),
70, subsequent_indent=" ") + """'.
keep_default_na : bool, default True
If na_values are specified and keep_default_na is False the default NaN
values are overridden, otherwise they're appended to.
Whether or not to include the default NaN values when parsing the data.
Depending on whether `na_values` is passed in, the behavior is as follows:

* If `keep_default_na` is True, and `na_values` are specified, `na_values`
is appended to the default NaN values used for parsing.
* If `keep_default_na` is True, and `na_values` are not specified, only
the default NaN values are used for parsing.
* If `keep_default_na` is False, and `na_values` are specified, only
the NaN values specified in `na_values` are used for parsing.
* If `keep_default_na` is False, and `na_values` are not specified, no
strings will be parsed as NaN.

Note that if `na_filter` is passed in as False, the `keep_default_na` and
`na_values` parameters will be ignored.
na_filter : boolean, default True
Detect missing value markers (empty strings and the value of na_values). In
data without any NAs, passing na_filter=False can improve the performance
@@ -910,9 +922,6 @@ def _clean_options(self, options, engine):
na_values = options['na_values']
skiprows = options['skiprows']

# really delete this one
keep_default_na = result.pop('keep_default_na')

_validate_header_arg(options['header'])

depr_warning = ''
@@ -957,6 +966,7 @@ def _clean_options(self, options, engine):
converters = {}

# Converting values to NA
keep_default_na = options['keep_default_na']
na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

# handle skiprows; this is internally handled by the
@@ -1225,6 +1235,7 @@ def __init__(self, kwds):
self.na_values = kwds.get('na_values')
self.na_fvalues = kwds.get('na_fvalues')
self.na_filter = kwds.get('na_filter', False)
self.keep_default_na = kwds.get('keep_default_na', True)

self.true_values = kwds.get('true_values')
self.false_values = kwds.get('false_values')
@@ -1487,7 +1498,8 @@ def _agg_index(self, index, try_parse_dates=True):
col_name = self.index_names[i]
if col_name is not None:
col_na_values, col_na_fvalues = _get_na_values(
col_name, self.na_values, self.na_fvalues)
col_name, self.na_values, self.na_fvalues,
self.keep_default_na)

arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
arrays.append(arr)
@@ -1510,7 +1522,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,

if self.na_filter:
col_na_values, col_na_fvalues = _get_na_values(
c, na_values, na_fvalues)
c, na_values, na_fvalues, self.keep_default_na)
else:
col_na_values, col_na_fvalues = set(), set()

@@ -3097,16 +3109,23 @@ def _clean_na_values(na_values, keep_default_na=True):
na_values = set()
na_fvalues = set()
elif isinstance(na_values, dict):
na_values = na_values.copy() # Prevent aliasing.
if keep_default_na:
for k, v in compat.iteritems(na_values):
if not is_list_like(v):
v = [v]
old_na_values = na_values.copy()
na_values = {} # Prevent aliasing.

# Convert the values in the na_values dictionary
# into array-likes for further use. This is also
# where we append the default NaN values, provided
# that `keep_default_na=True`.
for k, v in compat.iteritems(old_na_values):
if not is_list_like(v):
v = [v]

if keep_default_na:
v = set(v) | _NA_VALUES
na_values[k] = v
na_fvalues = dict(
(k, _floatify_na_values(v)) for k, v in na_values.items() # noqa
)

na_values[k] = v
Contributor:
this is an anti-pattern to modify the dict that you are iterating. can you create a new one here?

Member Author (@gfyoung), Jan 16, 2018:

@jreback: I realize that, but I think that's why, up above, someone wrote na_values = na_values.copy(). The reference to the original input is dropped and a "new" dictionary is created. Do you just want me to change the assigned variable name?

Contributor:
yeah you should create a new empty dict and then assign to it (it doesn't have to be a copy). iterating and modifying is a no-no.

Member Author:
I've decided to create a copy because at the bottom, we return na_values and na_fvalues regardless of the logic branch. However, I'll iterate over the old_na_values instead.
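A minimal, generic sketch of the pattern discussed in this thread (plain dict code, not the pandas source):

# Flagged in review: reassigning values in the dict being iterated over.
# CPython only raises if keys are added or removed during iteration, but the
# reviewer treats any mutation of the iterated dict as an anti-pattern.
na_values = {"a": "x", "b": ["y"]}
# for k, v in na_values.items():
#     na_values[k] = [v]        # mutates the dict being iterated

# The approach settled on above: copy first, then build the new mapping.
old_na_values = na_values.copy()
na_values = {}
for k, v in old_na_values.items():
    na_values[k] = v if isinstance(v, list) else [v]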

na_fvalues = dict((k, _floatify_na_values(v))
for k, v in na_values.items())
else:
if not is_list_like(na_values):
na_values = [na_values]
@@ -3225,12 +3244,38 @@ def _stringify_na_values(na_values):
return set(result)


def _get_na_values(col, na_values, na_fvalues):
def _get_na_values(col, na_values, na_fvalues, keep_default_na):
"""
Get the NaN values for a given column.

Parameters
----------
col : str
The name of the column.
na_values : array-like, dict
The object listing the NaN values as strings.
na_fvalues : array-like, dict
The object listing the NaN values as floats.
keep_default_na : bool
If `na_values` is a dict, and the column is not mapped in the
dictionary, whether to return the default NaN values or the empty set.

Returns
-------
nan_tuple : A length-two tuple composed of

1) na_values : the string NaN values for that column.
2) na_fvalues : the float NaN values for that column.
"""

if isinstance(na_values, dict):
if col in na_values:
return na_values[col], na_fvalues[col]
else:
return _NA_VALUES, set()
if keep_default_na:
return _NA_VALUES, set()

return set(), set()
else:
return na_values, na_fvalues

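A hedged usage sketch of the fallback documented in _get_na_values above. Both names are private pandas.io.parsers internals at this point in the codebase, so the import is illustrative rather than a supported API:

from pandas.io.parsers import _NA_VALUES, _get_na_values

na_values = {"a": {"missing"}}   # per-column string NA markers
na_fvalues = {"a": set()}        # per-column float NA markers

_get_na_values("a", na_values, na_fvalues, keep_default_na=False)
# -> ({'missing'}, set())        column is mapped, its own values are used
_get_na_values("b", na_values, na_fvalues, keep_default_na=True)
# -> (_NA_VALUES, set())         column not mapped, defaults kept
_get_na_values("b", na_values, na_fvalues, keep_default_na=False)
# -> (set(), set())              column not mapped, defaults dropped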
39 changes: 39 additions & 0 deletions pandas/tests/io/parser/na_values.py
@@ -224,6 +224,45 @@ def test_na_values_keep_default(self):
'seven']})
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

def test_no_keep_default_na_dict_na_values(self):
# see gh-19227
data = "a,b\n,2"

df = self.read_csv(StringIO(data), na_values={"b": ["2"]},
keep_default_na=False)
expected = DataFrame({"a": [""], "b": [np.nan]})
tm.assert_frame_equal(df, expected)

# Scalar values shouldn't cause the parsing to crash or fail.
data = "a,b\n1,2"

df = self.read_csv(StringIO(data), na_values={"b": 2},
keep_default_na=False)
expected = DataFrame({"a": [1], "b": [np.nan]})
tm.assert_frame_equal(df, expected)

data = """\
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
729639,"qwer","",asdfkj,466.681,,252.373
"""
expected = DataFrame({0: [np.nan, 729639.0],
1: [np.nan, "qwer"],
2: ["/blaha", np.nan],
3: ["kjsdkj", "asdfkj"],
4: [412.166, 466.681],
5: ["225.874", ""],
6: [np.nan, 252.373]})

df = self.read_csv(StringIO(data), header=None, keep_default_na=False,
na_values={2: "", 6: "214.008",
1: "blah", 0: 113125})
tm.assert_frame_equal(df, expected)

df = self.read_csv(StringIO(data), header=None, keep_default_na=False,
na_values={2: "", 6: "214.008",
1: "blah", 0: "113125"})
tm.assert_frame_equal(df, expected)

def test_na_values_na_filter_override(self):
data = """\
A,B