From 07f05384b4c6bdcecc29a4757b429171ba6c0d38 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Sun, 29 May 2016 00:15:53 +0200
Subject: [PATCH] BUG: Check for NaN after data conversion to numeric

---
 doc/source/whatsnew/v0.18.2.txt     |  1 +
 pandas/io/tests/parser/na_values.py | 77 ++++++++++-------------------
 pandas/src/inference.pyx            |  8 ++-
 pandas/tests/test_lib.py            | 13 +++++
 4 files changed, 46 insertions(+), 53 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index dfb5ebc9379b1..262ad9773b71f 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -291,6 +291,7 @@ Bug Fixes
 
 
 
+- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`)
 - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)
 
 
diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py
index c34549835cb46..b03ae4ae9fc22 100644
--- a/pandas/io/tests/parser/na_values.py
+++ b/pandas/io/tests/parser/na_values.py
@@ -11,7 +11,7 @@
 import pandas.io.parsers as parsers
 import pandas.util.testing as tm
 
-from pandas import DataFrame, MultiIndex, read_csv
+from pandas import DataFrame, MultiIndex
 from pandas.compat import StringIO, range
 
 
@@ -43,57 +43,30 @@ def test_detect_string_na(self):
         tm.assert_numpy_array_equal(df.values, expected)
 
     def test_non_string_na_values(self):
-        # see gh-3611, na_values that are not a string are an issue
-        with tm.ensure_clean('__non_string_na_values__.csv') as path:
-            df = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]})
-            df.to_csv(path, sep=' ', index=False)
-            result1 = self.read_csv(path, sep=' ', header=0,
-                                    na_values=['-999.0', '-999'])
-            result2 = self.read_csv(path, sep=' ', header=0,
-                                    na_values=[-999, -999.0])
-            result3 = self.read_csv(path, sep=' ', header=0,
-                                    na_values=[-999.0, -999])
-            tm.assert_frame_equal(result1, result2)
-            tm.assert_frame_equal(result2, result3)
-
-            result4 = self.read_csv(
-                path, sep=' ', header=0, na_values=['-999.0'])
-            result5 = self.read_csv(
-                path, sep=' ', header=0, na_values=['-999'])
-            result6 = self.read_csv(
-                path, sep=' ', header=0, na_values=[-999.0])
-            result7 = self.read_csv(
-                path, sep=' ', header=0, na_values=[-999])
-            tm.assert_frame_equal(result4, result3)
-            tm.assert_frame_equal(result5, result3)
-            tm.assert_frame_equal(result6, result3)
-            tm.assert_frame_equal(result7, result3)
-
-            good_compare = result3
-
-            # with an odd float format, so we can't match the string 999.0
-            # exactly, but need float matching
-            # TODO: change these to self.read_csv when Python bug is squashed
-            df.to_csv(path, sep=' ', index=False, float_format='%.3f')
-            result1 = read_csv(path, sep=' ', header=0,
-                               na_values=['-999.0', '-999'])
-            result2 = read_csv(path, sep=' ', header=0,
-                               na_values=[-999.0, -999])
-            tm.assert_frame_equal(result1, good_compare)
-            tm.assert_frame_equal(result2, good_compare)
-
-            result3 = read_csv(path, sep=' ',
-                               header=0, na_values=['-999.0'])
-            result4 = read_csv(path, sep=' ',
-                               header=0, na_values=['-999'])
-            result5 = read_csv(path, sep=' ',
-                               header=0, na_values=[-999.0])
-            result6 = read_csv(path, sep=' ',
-                               header=0, na_values=[-999])
-            tm.assert_frame_equal(result3, good_compare)
-            tm.assert_frame_equal(result4, good_compare)
-            tm.assert_frame_equal(result5, good_compare)
-            tm.assert_frame_equal(result6, good_compare)
+        # see gh-3611: with an odd float format, we can't match
+        # the string '999.0' exactly but still need float matching
+        nice = """A,B
+-999,1.2
+2,-999
+3,4.5
+"""
+        ugly = """A,B
+-999,1.200
+2,-999.000
+3,4.500
+"""
+        na_values_param = [['-999.0', '-999'],
+                           [-999, -999.0],
+                           [-999.0, -999],
+                           ['-999.0'], ['-999'],
+                           [-999.0], [-999]]
+        expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
+                              [3.0, 4.5]], columns=['A', 'B'])
+
+        for data in (nice, ugly):
+            for na_values in na_values_param:
+                out = self.read_csv(StringIO(data), na_values=na_values)
+                tm.assert_frame_equal(out, expected)
 
     def test_default_na_values(self):
         _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
index 3ccc1c4f9336c..e2c59a34bdf21 100644
--- a/pandas/src/inference.pyx
+++ b/pandas/src/inference.pyx
@@ -596,7 +596,13 @@ def maybe_convert_numeric(object[:] values, set na_values,
         else:
             try:
                 status = floatify(val, &fval, &maybe_int)
-                floats[i] = fval
+
+                if fval in na_values:
+                    floats[i] = complexes[i] = nan
+                    seen_float = True
+                else:
+                    floats[i] = fval
+
                 if not seen_float:
                     if maybe_int:
                         as_int = int(val)
diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
index 2aa31063df446..c6a703673a4c4 100644
--- a/pandas/tests/test_lib.py
+++ b/pandas/tests/test_lib.py
@@ -188,6 +188,9 @@ def test_isinf_scalar(self):
         self.assertFalse(lib.isneginf_scalar(1))
         self.assertFalse(lib.isneginf_scalar('a'))
 
+
+# tests related to functions imported from inference.pyx
+class TestInference(tm.TestCase):
     def test_maybe_convert_numeric_infinities(self):
         # see gh-13274
         infinities = ['inf', 'inF', 'iNf', 'Inf',
@@ -227,6 +230,16 @@ def test_maybe_convert_numeric_infinities(self):
                             np.array(['foo_' + infinity], dtype=object),
                             na_values, maybe_int)
 
+    def test_maybe_convert_numeric_post_floatify_nan(self):
+        # see gh-13314
+        data = np.array(['1.200', '-999.000', '4.500'], dtype=object)
+        expected = np.array([1.2, np.nan, 4.5], dtype=np.float64)
+        nan_values = set([-999, -999.0])
+
+        for coerce_type in (True, False):
+            out = lib.maybe_convert_numeric(data, nan_values, coerce_type)
+            tm.assert_numpy_array_equal(out, expected)
+
 
 class Testisscalar(tm.TestCase):
 