
BUG: Check for NaN after data conversion to numeric #13314

Closed
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.2.txt
@@ -291,6 +291,7 @@ Bug Fixes



- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`)
- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)
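
A minimal sketch of the fixed behavior described in the entry above (the inline data and column names are invented for illustration; this snippet is not part of the diff):

```python
import pandas as pd
from io import StringIO

# The sentinel -999 is written with a float format in column B, so the
# token '-999.000' only matches an na_values entry after numeric conversion.
data = "A,B\n-999,1.2\n2,-999.000\n3,4.5\n"
df = pd.read_csv(StringIO(data), engine='python', na_values=[-999, -999.0])
print(df)
#      A    B
# 0  NaN  1.2
# 1  2.0  NaN
# 2  3.0  4.5
```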


77 changes: 25 additions & 52 deletions pandas/io/tests/parser/na_values.py
@@ -11,7 +11,7 @@
import pandas.io.parsers as parsers
import pandas.util.testing as tm

from pandas import DataFrame, MultiIndex, read_csv
from pandas import DataFrame, MultiIndex
from pandas.compat import StringIO, range


@@ -43,57 +43,30 @@ def test_non_string_na_values(self):
tm.assert_numpy_array_equal(df.values, expected)

def test_non_string_na_values(self):
# see gh-3611, na_values that are not a string are an issue
with tm.ensure_clean('__non_string_na_values__.csv') as path:
df = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]})
df.to_csv(path, sep=' ', index=False)
result1 = self.read_csv(path, sep=' ', header=0,
na_values=['-999.0', '-999'])
result2 = self.read_csv(path, sep=' ', header=0,
na_values=[-999, -999.0])
result3 = self.read_csv(path, sep=' ', header=0,
na_values=[-999.0, -999])
tm.assert_frame_equal(result1, result2)
tm.assert_frame_equal(result2, result3)

result4 = self.read_csv(
path, sep=' ', header=0, na_values=['-999.0'])
result5 = self.read_csv(
path, sep=' ', header=0, na_values=['-999'])
result6 = self.read_csv(
path, sep=' ', header=0, na_values=[-999.0])
result7 = self.read_csv(
path, sep=' ', header=0, na_values=[-999])
tm.assert_frame_equal(result4, result3)
tm.assert_frame_equal(result5, result3)
tm.assert_frame_equal(result6, result3)
tm.assert_frame_equal(result7, result3)

good_compare = result3

# with an odd float format, so we can't match the string 999.0
# exactly, but need float matching
# TODO: change these to self.read_csv when Python bug is squashed
df.to_csv(path, sep=' ', index=False, float_format='%.3f')
result1 = read_csv(path, sep=' ', header=0,
na_values=['-999.0', '-999'])
result2 = read_csv(path, sep=' ', header=0,
na_values=[-999.0, -999])
tm.assert_frame_equal(result1, good_compare)
tm.assert_frame_equal(result2, good_compare)

result3 = read_csv(path, sep=' ',
header=0, na_values=['-999.0'])
result4 = read_csv(path, sep=' ',
header=0, na_values=['-999'])
result5 = read_csv(path, sep=' ',
header=0, na_values=[-999.0])
result6 = read_csv(path, sep=' ',
header=0, na_values=[-999])
tm.assert_frame_equal(result3, good_compare)
tm.assert_frame_equal(result4, good_compare)
tm.assert_frame_equal(result5, good_compare)
tm.assert_frame_equal(result6, good_compare)
# see gh-3611: with an odd float format, we can't match
# the string '999.0' exactly but still need float matching
nice = """A,B
-999,1.2
2,-999
3,4.5
"""
ugly = """A,B
-999,1.200
2,-999.000
3,4.500
"""
na_values_param = [['-999.0', '-999'],
[-999, -999.0],
[-999.0, -999],
['-999.0'], ['-999'],
[-999.0], [-999]]
expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
[3.0, 4.5]], columns=['A', 'B'])

for data in (nice, ugly):
for na_values in na_values_param:
out = self.read_csv(StringIO(data), na_values=na_values)
tm.assert_frame_equal(out, expected)

def test_default_na_values(self):
_NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
8 changes: 7 additions & 1 deletion pandas/src/inference.pyx
@@ -596,7 +596,13 @@ def maybe_convert_numeric(object[:] values, set na_values,
else:
try:
status = floatify(val, &fval, &maybe_int)
floats[i] = fval

if fval in na_values:
floats[i] = complexes[i] = nan
seen_float = True
else:
floats[i] = fval

if not seen_float:
if maybe_int:
as_int = int(val)
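In plain Python, the check this hunk adds behaves roughly like the sketch below; `convert_token` is a hypothetical stand-in for the conversion loop, and `float()` stands in for the Cython `floatify` helper:

```python
import numpy as np

def convert_token(val, na_values):
    # Parse the token to a float (floatify() in the Cython code), then
    # apply the new post-conversion check: a parsed value that matches
    # a numeric sentinel is recorded as NaN instead of kept as a float.
    fval = float(val)
    if fval in na_values:
        return np.nan
    return fval

print(convert_token('-999.000', {-999.0}))  # nan: matched after parsing
print(convert_token('4.500', {-999.0}))     # 4.5
```
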
13 changes: 13 additions & 0 deletions pandas/tests/test_lib.py
@@ -188,6 +188,9 @@ def test_isinf_scalar(self):
self.assertFalse(lib.isneginf_scalar(1))
self.assertFalse(lib.isneginf_scalar('a'))


# tests related to functions imported from inference.pyx
class TestInference(tm.TestCase):
def test_maybe_convert_numeric_infinities(self):
# see gh-13274
infinities = ['inf', 'inF', 'iNf', 'Inf',
@@ -227,6 +230,16 @@ def test_maybe_convert_numeric_infinities(self):
np.array(['foo_' + infinity], dtype=object),
na_values, maybe_int)

def test_maybe_convert_numeric_post_floatify_nan(self):
# see gh-13314
data = np.array(['1.200', '-999.000', '4.500'], dtype=object)

Review comment (Contributor): Can you move the lib tests to a separate class (TestInference), rather than TestMisc?

Reply (Member, author): Yep. Done.

expected = np.array([1.2, np.nan, 4.5], dtype=np.float64)
nan_values = set([-999, -999.0])

for coerce_type in (True, False):
out = lib.maybe_convert_numeric(data, nan_values, coerce_type)
tm.assert_numpy_array_equal(out, expected)


class Testisscalar(tm.TestCase):
