Skip to content

Commit c9532f0

Browse files
gfyoung authored and jreback committed
BUG: Patch handling of keep_default_na=False (#19260)
Patches very buggy behavior of keep_default_na=False whenever na_values is a dict:
* Respect keep_default_na for a column that doesn't exist in the na_values dictionary.
* Don't crash / break when an na_value is a scalar in the na_values dictionary.
In addition, clarifies documentation on the behavior of keep_default_na with respect to na_filter and na_values. Closes gh-19227.
1 parent 88d0238 commit c9532f0

File tree

5 files changed

+127
-24
lines changed

5 files changed

+127
-24
lines changed

doc/source/io.rst

+14-2
Original file line numberDiff line numberDiff line change
@@ -214,8 +214,20 @@ na_values : scalar, str, list-like, or dict, default ``None``
214214
for a list of the values interpreted as NaN by default.
215215

216216
keep_default_na : boolean, default ``True``
217-
If na_values are specified and keep_default_na is ``False`` the default NaN
218-
values are overridden, otherwise they're appended to.
217+
Whether or not to include the default NaN values when parsing the data.
218+
Depending on whether `na_values` is passed in, the behavior is as follows:
219+
220+
* If `keep_default_na` is True, and `na_values` are specified, `na_values`
221+
is appended to the default NaN values used for parsing.
222+
* If `keep_default_na` is True, and `na_values` are not specified, only
223+
the default NaN values are used for parsing.
224+
* If `keep_default_na` is False, and `na_values` are specified, only
225+
the NaN values specified in `na_values` are used for parsing.
226+
* If `keep_default_na` is False, and `na_values` are not specified, no
227+
strings will be parsed as NaN.
228+
229+
Note that if `na_filter` is passed in as False, the `keep_default_na` and
230+
`na_values` parameters will be ignored.
219231
na_filter : boolean, default ``True``
220232
Detect missing value markers (empty strings and the value of na_values). In
221233
data without any NAs, passing ``na_filter=False`` can improve the performance

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,7 @@ I/O
462462
- :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`)
463463
- Bug in :func:`read_msgpack` when a non-existent file is passed in Python 2 (:issue:`15296`)
464464
- Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
465+
- Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`)
465466
- Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`)
466467
- Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
467468
- Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`)

pandas/_libs/parsers.pyx

+10-4
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ cdef class TextReader:
288288
object file_handle, na_fvalues
289289
object true_values, false_values
290290
object handle
291-
bint na_filter, verbose, has_usecols, has_mi_columns
291+
bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
292292
int64_t parser_start
293293
list clocks
294294
char *c_encoding
@@ -352,6 +352,8 @@ cdef class TextReader:
352352
na_filter=True,
353353
na_values=None,
354354
na_fvalues=None,
355+
keep_default_na=True,
356+
355357
true_values=None,
356358
false_values=None,
357359
allow_leading_cols=True,
@@ -378,8 +380,8 @@ cdef class TextReader:
378380
self.parser = parser_new()
379381
self.parser.chunksize = tokenize_chunksize
380382

381-
self.mangle_dupe_cols=mangle_dupe_cols
382-
self.tupleize_cols=tupleize_cols
383+
self.mangle_dupe_cols = mangle_dupe_cols
384+
self.tupleize_cols = tupleize_cols
383385

384386
# For timekeeping
385387
self.clocks = []
@@ -477,6 +479,7 @@ cdef class TextReader:
477479
self.true_set = kset_from_list(self.true_values)
478480
self.false_set = kset_from_list(self.false_values)
479481

482+
self.keep_default_na = keep_default_na
480483
self.converters = converters
481484
self.na_filter = na_filter
482485

@@ -1299,7 +1302,10 @@ cdef class TextReader:
12991302
elif i in self.na_values:
13001303
key = i
13011304
else: # No na_values provided for this column.
1302-
return _NA_VALUES, set()
1305+
if self.keep_default_na:
1306+
return _NA_VALUES, set()
1307+
1308+
return list(), set()
13031309

13041310
values = self.na_values[key]
13051311
if values is not None and not isinstance(values, list):

pandas/io/parsers.py

+63-18
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,20 @@
149149
NaN: '""" + fill("', '".join(sorted(_NA_VALUES)),
150150
70, subsequent_indent=" ") + """'.
151151
keep_default_na : bool, default True
152-
If na_values are specified and keep_default_na is False the default NaN
153-
values are overridden, otherwise they're appended to.
152+
Whether or not to include the default NaN values when parsing the data.
153+
Depending on whether `na_values` is passed in, the behavior is as follows:
154+
155+
* If `keep_default_na` is True, and `na_values` are specified, `na_values`
156+
is appended to the default NaN values used for parsing.
157+
* If `keep_default_na` is True, and `na_values` are not specified, only
158+
the default NaN values are used for parsing.
159+
* If `keep_default_na` is False, and `na_values` are specified, only
160+
the NaN values specified in `na_values` are used for parsing.
161+
* If `keep_default_na` is False, and `na_values` are not specified, no
162+
strings will be parsed as NaN.
163+
164+
Note that if `na_filter` is passed in as False, the `keep_default_na` and
165+
`na_values` parameters will be ignored.
154166
na_filter : boolean, default True
155167
Detect missing value markers (empty strings and the value of na_values). In
156168
data without any NAs, passing na_filter=False can improve the performance
@@ -910,9 +922,6 @@ def _clean_options(self, options, engine):
910922
na_values = options['na_values']
911923
skiprows = options['skiprows']
912924

913-
# really delete this one
914-
keep_default_na = result.pop('keep_default_na')
915-
916925
_validate_header_arg(options['header'])
917926

918927
depr_warning = ''
@@ -957,6 +966,7 @@ def _clean_options(self, options, engine):
957966
converters = {}
958967

959968
# Converting values to NA
969+
keep_default_na = options['keep_default_na']
960970
na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)
961971

962972
# handle skiprows; this is internally handled by the
@@ -1225,6 +1235,7 @@ def __init__(self, kwds):
12251235
self.na_values = kwds.get('na_values')
12261236
self.na_fvalues = kwds.get('na_fvalues')
12271237
self.na_filter = kwds.get('na_filter', False)
1238+
self.keep_default_na = kwds.get('keep_default_na', True)
12281239

12291240
self.true_values = kwds.get('true_values')
12301241
self.false_values = kwds.get('false_values')
@@ -1487,7 +1498,8 @@ def _agg_index(self, index, try_parse_dates=True):
14871498
col_name = self.index_names[i]
14881499
if col_name is not None:
14891500
col_na_values, col_na_fvalues = _get_na_values(
1490-
col_name, self.na_values, self.na_fvalues)
1501+
col_name, self.na_values, self.na_fvalues,
1502+
self.keep_default_na)
14911503

14921504
arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
14931505
arrays.append(arr)
@@ -1510,7 +1522,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
15101522

15111523
if self.na_filter:
15121524
col_na_values, col_na_fvalues = _get_na_values(
1513-
c, na_values, na_fvalues)
1525+
c, na_values, na_fvalues, self.keep_default_na)
15141526
else:
15151527
col_na_values, col_na_fvalues = set(), set()
15161528

@@ -3097,16 +3109,23 @@ def _clean_na_values(na_values, keep_default_na=True):
30973109
na_values = set()
30983110
na_fvalues = set()
30993111
elif isinstance(na_values, dict):
3100-
na_values = na_values.copy() # Prevent aliasing.
3101-
if keep_default_na:
3102-
for k, v in compat.iteritems(na_values):
3103-
if not is_list_like(v):
3104-
v = [v]
3112+
old_na_values = na_values.copy()
3113+
na_values = {} # Prevent aliasing.
3114+
3115+
# Convert the values in the na_values dictionary
3116+
# into array-likes for further use. This is also
3117+
# where we append the default NaN values, provided
3118+
# that `keep_default_na=True`.
3119+
for k, v in compat.iteritems(old_na_values):
3120+
if not is_list_like(v):
3121+
v = [v]
3122+
3123+
if keep_default_na:
31053124
v = set(v) | _NA_VALUES
3106-
na_values[k] = v
3107-
na_fvalues = dict(
3108-
(k, _floatify_na_values(v)) for k, v in na_values.items() # noqa
3109-
)
3125+
3126+
na_values[k] = v
3127+
na_fvalues = dict((k, _floatify_na_values(v))
3128+
for k, v in na_values.items())
31103129
else:
31113130
if not is_list_like(na_values):
31123131
na_values = [na_values]
@@ -3225,12 +3244,38 @@ def _stringify_na_values(na_values):
32253244
return set(result)
32263245

32273246

3228-
def _get_na_values(col, na_values, na_fvalues):
3247+
def _get_na_values(col, na_values, na_fvalues, keep_default_na):
3248+
"""
3249+
Get the NaN values for a given column.
3250+
3251+
Parameters
3252+
----------
3253+
col : str
3254+
The name of the column.
3255+
na_values : array-like, dict
3256+
The object listing the NaN values as strings.
3257+
na_fvalues : array-like, dict
3258+
The object listing the NaN values as floats.
3259+
keep_default_na : bool
3260+
If `na_values` is a dict, and the column is not mapped in the
3261+
dictionary, whether to return the default NaN values or the empty set.
3262+
3263+
Returns
3264+
-------
3265+
nan_tuple : A length-two tuple composed of
3266+
3267+
1) na_values : the string NaN values for that column.
3268+
2) na_fvalues : the float NaN values for that column.
3269+
"""
3270+
32293271
if isinstance(na_values, dict):
32303272
if col in na_values:
32313273
return na_values[col], na_fvalues[col]
32323274
else:
3233-
return _NA_VALUES, set()
3275+
if keep_default_na:
3276+
return _NA_VALUES, set()
3277+
3278+
return set(), set()
32343279
else:
32353280
return na_values, na_fvalues
32363281

pandas/tests/io/parser/na_values.py

+39
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,45 @@ def test_na_values_keep_default(self):
224224
'seven']})
225225
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
226226

227+
def test_no_keep_default_na_dict_na_values(self):
228+
# see gh-19227
229+
data = "a,b\n,2"
230+
231+
df = self.read_csv(StringIO(data), na_values={"b": ["2"]},
232+
keep_default_na=False)
233+
expected = DataFrame({"a": [""], "b": [np.nan]})
234+
tm.assert_frame_equal(df, expected)
235+
236+
# Scalar values shouldn't cause the parsing to crash or fail.
237+
data = "a,b\n1,2"
238+
239+
df = self.read_csv(StringIO(data), na_values={"b": 2},
240+
keep_default_na=False)
241+
expected = DataFrame({"a": [1], "b": [np.nan]})
242+
tm.assert_frame_equal(df, expected)
243+
244+
data = """\
245+
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
246+
729639,"qwer","",asdfkj,466.681,,252.373
247+
"""
248+
expected = DataFrame({0: [np.nan, 729639.0],
249+
1: [np.nan, "qwer"],
250+
2: ["/blaha", np.nan],
251+
3: ["kjsdkj", "asdfkj"],
252+
4: [412.166, 466.681],
253+
5: ["225.874", ""],
254+
6: [np.nan, 252.373]})
255+
256+
df = self.read_csv(StringIO(data), header=None, keep_default_na=False,
257+
na_values={2: "", 6: "214.008",
258+
1: "blah", 0: 113125})
259+
tm.assert_frame_equal(df, expected)
260+
261+
df = self.read_csv(StringIO(data), header=None, keep_default_na=False,
262+
na_values={2: "", 6: "214.008",
263+
1: "blah", 0: "113125"})
264+
tm.assert_frame_equal(df, expected)
265+
227266
def test_na_values_na_filter_override(self):
228267
data = """\
229268
A,B

0 commit comments

Comments
 (0)