Skip to content

Commit 8ef3ae5

Browse files
committed
BUG: Patch handling of keep_default_na=False
Patches very buggy behavior of keep_default_na=False whenever na_values is a dict * Respect keep_default_na for column that doesn't exist in na_values dictionary * Don't crash / break when na_value is a scalar in the na_values dictionary. In addition, clarifies documentation on behavior of keep_default_na with respect to na_filter and na_values. Closes gh-19227.
1 parent 7208610 commit 8ef3ae5

File tree

5 files changed

+123
-23
lines changed

5 files changed

+123
-23
lines changed

doc/source/io.rst

+14-2
Original file line numberDiff line numberDiff line change
@@ -214,8 +214,20 @@ na_values : scalar, str, list-like, or dict, default ``None``
214214
for a list of the values interpreted as NaN by default.
215215

216216
keep_default_na : boolean, default ``True``
217-
If na_values are specified and keep_default_na is ``False`` the default NaN
218-
values are overridden, otherwise they're appended to.
217+
Whether or not to include the default NaN values when parsing the data.
218+
Provided that `na_filter` is True, depending on whether `na_values` is
219+
passed in, the behavior is as follows:
220+
221+
* If `keep_default_na` is True, and `na_values` are specified, `na_values`
222+
is appended to the default NaN values used for parsing.
223+
* If `keep_default_na` is True, and `na_values` are not specified, only
224+
the default NaN values are used for parsing.
225+
* If `keep_default_na` is False, and `na_values` are specified, only
225+
the NaN values specified in `na_values` are used for parsing.
227+
* If `keep_default_na` is False, and `na_values` are not specified, no
228+
strings will be parsed as NaN.
229+
230+
If `na_filter` is False, `keep_default_na` and `na_values` will be ignored.
219231
na_filter : boolean, default ``True``
220232
Detect missing value markers (empty strings and the value of na_values). In
221233
data without any NAs, passing ``na_filter=False`` can improve the performance

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,7 @@ I/O
452452
- :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`)
453453
- Bug in :func:`read_msgpack` when a non-existent file is passed in Python 2 (:issue:`15296`)
454454
- Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
455+
- Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`)
455456
- Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`)
456457
- Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`)
457458
- Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`)

pandas/_libs/parsers.pyx

+10-4
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ cdef class TextReader:
288288
object file_handle, na_fvalues
289289
object true_values, false_values
290290
object handle
291-
bint na_filter, verbose, has_usecols, has_mi_columns
291+
bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
292292
int64_t parser_start
293293
list clocks
294294
char *c_encoding
@@ -352,6 +352,8 @@ cdef class TextReader:
352352
na_filter=True,
353353
na_values=None,
354354
na_fvalues=None,
355+
keep_default_na=True,
356+
355357
true_values=None,
356358
false_values=None,
357359
allow_leading_cols=True,
@@ -378,8 +380,8 @@ cdef class TextReader:
378380
self.parser = parser_new()
379381
self.parser.chunksize = tokenize_chunksize
380382

381-
self.mangle_dupe_cols=mangle_dupe_cols
382-
self.tupleize_cols=tupleize_cols
383+
self.mangle_dupe_cols = mangle_dupe_cols
384+
self.tupleize_cols = tupleize_cols
383385

384386
# For timekeeping
385387
self.clocks = []
@@ -477,6 +479,7 @@ cdef class TextReader:
477479
self.true_set = kset_from_list(self.true_values)
478480
self.false_set = kset_from_list(self.false_values)
479481

482+
self.keep_default_na = keep_default_na
480483
self.converters = converters
481484
self.na_filter = na_filter
482485

@@ -1299,7 +1302,10 @@ cdef class TextReader:
12991302
elif i in self.na_values:
13001303
key = i
13011304
else: # No na_values provided for this column.
1302-
return _NA_VALUES, set()
1305+
if self.keep_default_na:
1306+
return _NA_VALUES, set()
1307+
1308+
return list(), set()
13031309

13041310
values = self.na_values[key]
13051311
if values is not None and not isinstance(values, list):

pandas/io/parsers.py

+57-17
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,20 @@
149149
NaN: '""" + fill("', '".join(sorted(_NA_VALUES)),
150150
70, subsequent_indent=" ") + """'.
151151
keep_default_na : bool, default True
152-
If na_values are specified and keep_default_na is False the default NaN
153-
values are overridden, otherwise they're appended to.
152+
Whether or not to include the default NaN values when parsing the data.
153+
Provided that `na_filter` is True, depending on whether `na_values` is
154+
passed in, the behavior is as follows:
155+
156+
* If `keep_default_na` is True, and `na_values` are specified, `na_values`
157+
is appended to the default NaN values used for parsing.
158+
* If `keep_default_na` is True, and `na_values` are not specified, only
159+
the default NaN values are used for parsing.
160+
* If `keep_default_na` is False, and `na_values` are specified, only
160+
the NaN values specified in `na_values` are used for parsing.
162+
* If `keep_default_na` is False, and `na_values` are not specified, no
163+
strings will be parsed as NaN.
164+
165+
If `na_filter` is False, `keep_default_na` and `na_values` will be ignored.
154166
na_filter : boolean, default True
155167
Detect missing value markers (empty strings and the value of na_values). In
156168
data without any NAs, passing na_filter=False can improve the performance
@@ -910,9 +922,6 @@ def _clean_options(self, options, engine):
910922
na_values = options['na_values']
911923
skiprows = options['skiprows']
912924

913-
# really delete this one
914-
keep_default_na = result.pop('keep_default_na')
915-
916925
_validate_header_arg(options['header'])
917926

918927
depr_warning = ''
@@ -957,6 +966,7 @@ def _clean_options(self, options, engine):
957966
converters = {}
958967

959968
# Converting values to NA
969+
keep_default_na = options['keep_default_na']
960970
na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)
961971

962972
# handle skiprows; this is internally handled by the
@@ -1225,6 +1235,7 @@ def __init__(self, kwds):
12251235
self.na_values = kwds.get('na_values')
12261236
self.na_fvalues = kwds.get('na_fvalues')
12271237
self.na_filter = kwds.get('na_filter', False)
1238+
self.keep_default_na = kwds.get('keep_default_na', True)
12281239

12291240
self.true_values = kwds.get('true_values')
12301241
self.false_values = kwds.get('false_values')
@@ -1487,7 +1498,8 @@ def _agg_index(self, index, try_parse_dates=True):
14871498
col_name = self.index_names[i]
14881499
if col_name is not None:
14891500
col_na_values, col_na_fvalues = _get_na_values(
1490-
col_name, self.na_values, self.na_fvalues)
1501+
col_name, self.na_values, self.na_fvalues,
1502+
self.keep_default_na)
14911503

14921504
arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
14931505
arrays.append(arr)
@@ -1510,7 +1522,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
15101522

15111523
if self.na_filter:
15121524
col_na_values, col_na_fvalues = _get_na_values(
1513-
c, na_values, na_fvalues)
1525+
c, na_values, na_fvalues, self.keep_default_na)
15141526
else:
15151527
col_na_values, col_na_fvalues = set(), set()
15161528

@@ -3098,15 +3110,17 @@ def _clean_na_values(na_values, keep_default_na=True):
30983110
na_fvalues = set()
30993111
elif isinstance(na_values, dict):
31003112
na_values = na_values.copy() # Prevent aliasing.
3101-
if keep_default_na:
3102-
for k, v in compat.iteritems(na_values):
3103-
if not is_list_like(v):
3104-
v = [v]
3113+
3114+
for k, v in compat.iteritems(na_values):
3115+
if not is_list_like(v):
3116+
v = [v]
3117+
3118+
if keep_default_na:
31053119
v = set(v) | _NA_VALUES
3106-
na_values[k] = v
3107-
na_fvalues = dict(
3108-
(k, _floatify_na_values(v)) for k, v in na_values.items() # noqa
3109-
)
3120+
3121+
na_values[k] = v
3122+
na_fvalues = dict((k, _floatify_na_values(v))
3123+
for k, v in na_values.items())
31103124
else:
31113125
if not is_list_like(na_values):
31123126
na_values = [na_values]
@@ -3225,12 +3239,38 @@ def _stringify_na_values(na_values):
32253239
return set(result)
32263240

32273241

3228-
def _get_na_values(col, na_values, na_fvalues):
3242+
def _get_na_values(col, na_values, na_fvalues, keep_default_na):
3243+
"""
3244+
Get the NaN values for a given column.
3245+
3246+
Parameters
3247+
----------
3248+
col : str
3249+
The name of the column.
3250+
na_values : array-like, dict
3251+
The object listing the NaN values as strings.
3252+
na_fvalues : array-like, dict
3253+
The object listing the NaN values as floats.
3254+
keep_default_na : bool
3255+
If `na_values` is a dict, and the column is not mapped in the
3256+
dictionary, whether to return the default NaN values or the empty set.
3257+
3258+
Returns
3259+
-------
3260+
nan_tuple : A length-two tuple composed of
3261+
3262+
1) na_values : the string NaN values for that column.
3263+
2) na_fvalues : the float NaN values for that column.
3264+
"""
3265+
32293266
if isinstance(na_values, dict):
32303267
if col in na_values:
32313268
return na_values[col], na_fvalues[col]
32323269
else:
3233-
return _NA_VALUES, set()
3270+
if keep_default_na:
3271+
return _NA_VALUES, set()
3272+
3273+
return set(), set()
32343274
else:
32353275
return na_values, na_fvalues
32363276

pandas/tests/io/parser/na_values.py

+41
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,47 @@ def test_na_values_keep_default(self):
224224
'seven']})
225225
tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
226226

227+
# see gh-19227: keep_default_na=False should be enforced
228+
# when na_values is a dictionary.
229+
data = "a,b\n,2"
230+
231+
df = self.read_csv(StringIO(data), na_values={"b": ["2"]},
232+
keep_default_na=False)
233+
expected = DataFrame({"a": [""], "b": [np.nan]})
234+
tm.assert_frame_equal(df, expected)
235+
236+
# see gh-19227: keep_default_na=False should not cause the parser
237+
# to crash or fail if a scalar is passed in as a value when
238+
# na_values is a dictionary.
239+
data = "a,b\n1,2"
240+
241+
df = self.read_csv(StringIO(data), na_values={"b": 2},
242+
keep_default_na=False)
243+
expected = DataFrame({"a": [1], "b": [np.nan]})
244+
tm.assert_frame_equal(df, expected)
245+
246+
data = """\
247+
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
248+
729639,"qwer","",asdfkj,466.681,,252.373
249+
"""
250+
expected = DataFrame({0: [np.nan, 729639.0],
251+
1: [np.nan, "qwer"],
252+
2: ["/blaha", np.nan],
253+
3: ["kjsdkj", "asdfkj"],
254+
4: [412.166, 466.681],
255+
5: ["225.874", ""],
256+
6: [np.nan, 252.373]})
257+
258+
df = self.read_csv(StringIO(data), header=None, keep_default_na=False,
259+
na_values={2: "", 6: "214.008",
260+
1: "blah", 0: 113125})
261+
tm.assert_frame_equal(df, expected)
262+
263+
df = self.read_csv(StringIO(data), header=None, keep_default_na=False,
264+
na_values={2: "", 6: "214.008",
265+
1: "blah", 0: "113125"})
266+
tm.assert_frame_equal(df, expected)
267+
227268
def test_na_values_na_filter_override(self):
228269
data = """\
229270
A,B

0 commit comments

Comments
 (0)