-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: Patch handling of keep_default_na=False #19260
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -149,8 +149,20 @@ | |
NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), | ||
70, subsequent_indent=" ") + """'. | ||
keep_default_na : bool, default True | ||
If na_values are specified and keep_default_na is False the default NaN | ||
values are overridden, otherwise they're appended to. | ||
Whether or not to include the default NaN values when parsing the data. | ||
Depending on whether `na_values` is passed in, the behavior is as follows: | ||
|
||
* If `keep_default_na` is True, and `na_values` are specified, `na_values` | ||
is appended to the default NaN values used for parsing. | ||
* If `keep_default_na` is True, and `na_values` are not specified, only | ||
the default NaN values are used for parsing. | ||
* If `keep_default_na` is False, and `na_values` are specified, only | ||
the NaN values specified in `na_values` are used for parsing. | ||
* If `keep_default_na` is False, and `na_values` are not specified, no | ||
strings will be parsed as NaN. | ||
|
||
Note that if `na_filter` is passed in as False, the `keep_default_na` and | ||
`na_values` parameters will be ignored. | ||
na_filter : boolean, default True | ||
Detect missing value markers (empty strings and the value of na_values). In | ||
data without any NAs, passing na_filter=False can improve the performance | ||
|
@@ -910,9 +922,6 @@ def _clean_options(self, options, engine): | |
na_values = options['na_values'] | ||
skiprows = options['skiprows'] | ||
|
||
# really delete this one | ||
keep_default_na = result.pop('keep_default_na') | ||
|
||
_validate_header_arg(options['header']) | ||
|
||
depr_warning = '' | ||
|
@@ -957,6 +966,7 @@ def _clean_options(self, options, engine): | |
converters = {} | ||
|
||
# Converting values to NA | ||
keep_default_na = options['keep_default_na'] | ||
na_values, na_fvalues = _clean_na_values(na_values, keep_default_na) | ||
|
||
# handle skiprows; this is internally handled by the | ||
|
@@ -1225,6 +1235,7 @@ def __init__(self, kwds): | |
self.na_values = kwds.get('na_values') | ||
self.na_fvalues = kwds.get('na_fvalues') | ||
self.na_filter = kwds.get('na_filter', False) | ||
self.keep_default_na = kwds.get('keep_default_na', True) | ||
|
||
self.true_values = kwds.get('true_values') | ||
self.false_values = kwds.get('false_values') | ||
|
@@ -1487,7 +1498,8 @@ def _agg_index(self, index, try_parse_dates=True): | |
col_name = self.index_names[i] | ||
if col_name is not None: | ||
col_na_values, col_na_fvalues = _get_na_values( | ||
col_name, self.na_values, self.na_fvalues) | ||
col_name, self.na_values, self.na_fvalues, | ||
self.keep_default_na) | ||
|
||
arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues) | ||
arrays.append(arr) | ||
|
@@ -1510,7 +1522,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, | |
|
||
if self.na_filter: | ||
col_na_values, col_na_fvalues = _get_na_values( | ||
c, na_values, na_fvalues) | ||
c, na_values, na_fvalues, self.keep_default_na) | ||
else: | ||
col_na_values, col_na_fvalues = set(), set() | ||
|
||
|
@@ -3097,16 +3109,23 @@ def _clean_na_values(na_values, keep_default_na=True): | |
na_values = set() | ||
na_fvalues = set() | ||
elif isinstance(na_values, dict): | ||
na_values = na_values.copy() # Prevent aliasing. | ||
if keep_default_na: | ||
for k, v in compat.iteritems(na_values): | ||
if not is_list_like(v): | ||
v = [v] | ||
old_na_values = na_values.copy() | ||
na_values = {} # Prevent aliasing. | ||
|
||
# Convert the values in the na_values dictionary | ||
# into array-likes for further use. This is also | ||
# where we append the default NaN values, provided | ||
# that `keep_default_na=True`. | ||
for k, v in compat.iteritems(old_na_values): | ||
if not is_list_like(v): | ||
v = [v] | ||
|
||
if keep_default_na: | ||
v = set(v) | _NA_VALUES | ||
na_values[k] = v | ||
na_fvalues = dict( | ||
(k, _floatify_na_values(v)) for k, v in na_values.items() # noqa | ||
) | ||
|
||
na_values[k] = v | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is an anti-pattern to modify the dict that you are iterating over. Can you create a new one here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback : I realize that, but I think that's why, up above, someone wrote `na_values = na_values.copy()  # Prevent aliasing.`
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, you should create a new empty dict and then assign to it (it doesn't have to be a copy). Iterating and modifying is a no-no.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've decided to create a copy because, at the bottom, we return `na_values`. |
||
na_fvalues = dict((k, _floatify_na_values(v)) | ||
for k, v in na_values.items()) | ||
else: | ||
if not is_list_like(na_values): | ||
na_values = [na_values] | ||
|
@@ -3225,12 +3244,38 @@ def _stringify_na_values(na_values): | |
return set(result) | ||
|
||
|
||
def _get_na_values(col, na_values, na_fvalues): | ||
def _get_na_values(col, na_values, na_fvalues, keep_default_na): | ||
""" | ||
Get the NaN values for a given column. | ||
|
||
Parameters | ||
---------- | ||
col : str | ||
The name of the column. | ||
na_values : array-like, dict | ||
The object listing the NaN values as strings. | ||
na_fvalues : array-like, dict | ||
The object listing the NaN values as floats. | ||
keep_default_na : bool | ||
If `na_values` is a dict, and the column is not mapped in the | ||
dictionary, whether to return the default NaN values or the empty set. | ||
|
||
Returns | ||
------- | ||
nan_tuple : A length-two tuple composed of | ||
|
||
1) na_values : the string NaN values for that column. | ||
2) na_fvalues : the float NaN values for that column. | ||
""" | ||
|
||
if isinstance(na_values, dict): | ||
if col in na_values: | ||
return na_values[col], na_fvalues[col] | ||
else: | ||
return _NA_VALUES, set() | ||
if keep_default_na: | ||
return _NA_VALUES, set() | ||
|
||
return set(), set() | ||
else: | ||
return na_values, na_fvalues | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nice!