Skip to content

Commit 721be62

Browse files
gfyoung authored and jreback committed
BUG: Check for NaN after data conversion to numeric
Author: gfyoung <[email protected]>

Closes pandas-dev#13314 from gfyoung/nan-check-post-numeric-conversion and squashes the following commits:

07f0538 [gfyoung] BUG: Check for NaN after data conversion to numeric
1 parent 70be8a9 commit 721be62

File tree

4 files changed: 46 additions (+46) and 53 deletions (-53)

doc/source/whatsnew/v0.18.2.txt

+1
Original file line number | Diff line number | Diff line change
@@ -291,6 +291,7 @@ Bug Fixes
291291

292292

293293

294+
- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`)
294295
- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)
295296

296297

pandas/io/tests/parser/na_values.py

+25-52
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
import pandas.io.parsers as parsers
1212
import pandas.util.testing as tm
1313

14-
from pandas import DataFrame, MultiIndex, read_csv
14+
from pandas import DataFrame, MultiIndex
1515
from pandas.compat import StringIO, range
1616

1717

@@ -43,57 +43,30 @@ def test_detect_string_na(self):
4343
tm.assert_numpy_array_equal(df.values, expected)
4444

4545
def test_non_string_na_values(self):
46-
# see gh-3611, na_values that are not a string are an issue
47-
with tm.ensure_clean('__non_string_na_values__.csv') as path:
48-
df = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]})
49-
df.to_csv(path, sep=' ', index=False)
50-
result1 = self.read_csv(path, sep=' ', header=0,
51-
na_values=['-999.0', '-999'])
52-
result2 = self.read_csv(path, sep=' ', header=0,
53-
na_values=[-999, -999.0])
54-
result3 = self.read_csv(path, sep=' ', header=0,
55-
na_values=[-999.0, -999])
56-
tm.assert_frame_equal(result1, result2)
57-
tm.assert_frame_equal(result2, result3)
58-
59-
result4 = self.read_csv(
60-
path, sep=' ', header=0, na_values=['-999.0'])
61-
result5 = self.read_csv(
62-
path, sep=' ', header=0, na_values=['-999'])
63-
result6 = self.read_csv(
64-
path, sep=' ', header=0, na_values=[-999.0])
65-
result7 = self.read_csv(
66-
path, sep=' ', header=0, na_values=[-999])
67-
tm.assert_frame_equal(result4, result3)
68-
tm.assert_frame_equal(result5, result3)
69-
tm.assert_frame_equal(result6, result3)
70-
tm.assert_frame_equal(result7, result3)
71-
72-
good_compare = result3
73-
74-
# with an odd float format, so we can't match the string 999.0
75-
# exactly, but need float matching
76-
# TODO: change these to self.read_csv when Python bug is squashed
77-
df.to_csv(path, sep=' ', index=False, float_format='%.3f')
78-
result1 = read_csv(path, sep=' ', header=0,
79-
na_values=['-999.0', '-999'])
80-
result2 = read_csv(path, sep=' ', header=0,
81-
na_values=[-999.0, -999])
82-
tm.assert_frame_equal(result1, good_compare)
83-
tm.assert_frame_equal(result2, good_compare)
84-
85-
result3 = read_csv(path, sep=' ',
86-
header=0, na_values=['-999.0'])
87-
result4 = read_csv(path, sep=' ',
88-
header=0, na_values=['-999'])
89-
result5 = read_csv(path, sep=' ',
90-
header=0, na_values=[-999.0])
91-
result6 = read_csv(path, sep=' ',
92-
header=0, na_values=[-999])
93-
tm.assert_frame_equal(result3, good_compare)
94-
tm.assert_frame_equal(result4, good_compare)
95-
tm.assert_frame_equal(result5, good_compare)
96-
tm.assert_frame_equal(result6, good_compare)
46+
# see gh-3611: with an odd float format, we can't match
47+
# the string '-999.0' exactly but still need float matching
48+
nice = """A,B
49+
-999,1.2
50+
2,-999
51+
3,4.5
52+
"""
53+
ugly = """A,B
54+
-999,1.200
55+
2,-999.000
56+
3,4.500
57+
"""
58+
na_values_param = [['-999.0', '-999'],
59+
[-999, -999.0],
60+
[-999.0, -999],
61+
['-999.0'], ['-999'],
62+
[-999.0], [-999]]
63+
expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
64+
[3.0, 4.5]], columns=['A', 'B'])
65+
66+
for data in (nice, ugly):
67+
for na_values in na_values_param:
68+
out = self.read_csv(StringIO(data), na_values=na_values)
69+
tm.assert_frame_equal(out, expected)
9770

9871
def test_default_na_values(self):
9972
_NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',

pandas/src/inference.pyx

+7-1
Original file line number | Diff line number | Diff line change
@@ -596,7 +596,13 @@ def maybe_convert_numeric(object[:] values, set na_values,
596596
else:
597597
try:
598598
status = floatify(val, &fval, &maybe_int)
599-
floats[i] = fval
599+
600+
if fval in na_values:
601+
floats[i] = complexes[i] = nan
602+
seen_float = True
603+
else:
604+
floats[i] = fval
605+
600606
if not seen_float:
601607
if maybe_int:
602608
as_int = int(val)

pandas/tests/test_lib.py

+13
Original file line number | Diff line number | Diff line change
@@ -188,6 +188,9 @@ def test_isinf_scalar(self):
188188
self.assertFalse(lib.isneginf_scalar(1))
189189
self.assertFalse(lib.isneginf_scalar('a'))
190190

191+
192+
# tests related to functions imported from inference.pyx
193+
class TestInference(tm.TestCase):
191194
def test_maybe_convert_numeric_infinities(self):
192195
# see gh-13274
193196
infinities = ['inf', 'inF', 'iNf', 'Inf',
@@ -227,6 +230,16 @@ def test_maybe_convert_numeric_infinities(self):
227230
np.array(['foo_' + infinity], dtype=object),
228231
na_values, maybe_int)
229232

233+
def test_maybe_convert_numeric_post_floatify_nan(self):
234+
# see gh-13314
235+
data = np.array(['1.200', '-999.000', '4.500'], dtype=object)
236+
expected = np.array([1.2, np.nan, 4.5], dtype=np.float64)
237+
nan_values = set([-999, -999.0])
238+
239+
for coerce_type in (True, False):
240+
out = lib.maybe_convert_numeric(data, nan_values, coerce_type)
241+
tm.assert_numpy_array_equal(out, expected)
242+
230243

231244
class Testisscalar(tm.TestCase):
232245

0 commit comments

Comments
 (0)