BUG: Check for NaN after data conversion to numeric

gfyoung · jreback · commit 721be6297ec5 · 2016-05-30T09:26:47.000-04:00
Author: gfyoung <gfyoung17@gmail.com>

Closes pandas-dev#13314 from gfyoung/nan-check-post-numeric-conversion and squashes the following commits:

07f0538 [gfyoung] BUG: Check for NaN after data conversion to numeric
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -291,6 +291,7 @@ Bug Fixes
 
 
 
+- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`)
 - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)
 
 
diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py
@@ -11,7 +11,7 @@
 import pandas.io.parsers as parsers
 import pandas.util.testing as tm
 
-from pandas import DataFrame, MultiIndex, read_csv
+from pandas import DataFrame, MultiIndex
 from pandas.compat import StringIO, range
 
 
@@ -43,57 +43,30 @@ def test_detect_string_na(self):
         tm.assert_numpy_array_equal(df.values, expected)
 
     def test_non_string_na_values(self):
-        # see gh-3611, na_values that are not a string are an issue
-        with tm.ensure_clean('__non_string_na_values__.csv') as path:
-            df = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]})
-            df.to_csv(path, sep=' ', index=False)
-            result1 = self.read_csv(path, sep=' ', header=0,
-                                    na_values=['-999.0', '-999'])
-            result2 = self.read_csv(path, sep=' ', header=0,
-                                    na_values=[-999, -999.0])
-            result3 = self.read_csv(path, sep=' ', header=0,
-                                    na_values=[-999.0, -999])
-            tm.assert_frame_equal(result1, result2)
-            tm.assert_frame_equal(result2, result3)
-
-            result4 = self.read_csv(
-                path, sep=' ', header=0, na_values=['-999.0'])
-            result5 = self.read_csv(
-                path, sep=' ', header=0, na_values=['-999'])
-            result6 = self.read_csv(
-                path, sep=' ', header=0, na_values=[-999.0])
-            result7 = self.read_csv(
-                path, sep=' ', header=0, na_values=[-999])
-            tm.assert_frame_equal(result4, result3)
-            tm.assert_frame_equal(result5, result3)
-            tm.assert_frame_equal(result6, result3)
-            tm.assert_frame_equal(result7, result3)
-
-            good_compare = result3
-
-            # with an odd float format, so we can't match the string 999.0
-            # exactly, but need float matching
-            # TODO: change these to self.read_csv when Python bug is squashed
-            df.to_csv(path, sep=' ', index=False, float_format='%.3f')
-            result1 = read_csv(path, sep=' ', header=0,
-                               na_values=['-999.0', '-999'])
-            result2 = read_csv(path, sep=' ', header=0,
-                               na_values=[-999.0, -999])
-            tm.assert_frame_equal(result1, good_compare)
-            tm.assert_frame_equal(result2, good_compare)
-
-            result3 = read_csv(path, sep=' ',
-                               header=0, na_values=['-999.0'])
-            result4 = read_csv(path, sep=' ',
-                               header=0, na_values=['-999'])
-            result5 = read_csv(path, sep=' ',
-                               header=0, na_values=[-999.0])
-            result6 = read_csv(path, sep=' ',
-                               header=0, na_values=[-999])
-            tm.assert_frame_equal(result3, good_compare)
-            tm.assert_frame_equal(result4, good_compare)
-            tm.assert_frame_equal(result5, good_compare)
-            tm.assert_frame_equal(result6, good_compare)
+        # see gh-3611: with an odd float format, we can't match
+        # the string '999.0' exactly but still need float matching
+        nice = """A,B
+-999,1.2
+2,-999
+3,4.5
+"""
+        ugly = """A,B
+-999,1.200
+2,-999.000
+3,4.500
+"""
+        na_values_param = [['-999.0', '-999'],
+                           [-999, -999.0],
+                           [-999.0, -999],
+                           ['-999.0'], ['-999'],
+                           [-999.0], [-999]]
+        expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
+                              [3.0, 4.5]], columns=['A', 'B'])
+
+        for data in (nice, ugly):
+            for na_values in na_values_param:
+                out = self.read_csv(StringIO(data), na_values=na_values)
+                tm.assert_frame_equal(out, expected)
 
     def test_default_na_values(self):
         _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx
@@ -596,7 +596,13 @@ def maybe_convert_numeric(object[:] values, set na_values,
         else:
             try:
                 status = floatify(val, &fval, &maybe_int)
-                floats[i] = fval
+
+                if fval in na_values:
+                    floats[i] = complexes[i] = nan
+                    seen_float = True
+                else:
+                    floats[i] = fval
+
                 if not seen_float:
                     if maybe_int:
                         as_int = int(val)
diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
@@ -188,6 +188,9 @@ def test_isinf_scalar(self):
         self.assertFalse(lib.isneginf_scalar(1))
         self.assertFalse(lib.isneginf_scalar('a'))
 
+
+# tests related to functions imported from inference.pyx
+class TestInference(tm.TestCase):
     def test_maybe_convert_numeric_infinities(self):
         # see gh-13274
         infinities = ['inf', 'inF', 'iNf', 'Inf',
@@ -227,6 +230,16 @@ def test_maybe_convert_numeric_infinities(self):
                         np.array(['foo_' + infinity], dtype=object),
                         na_values, maybe_int)
 
+    def test_maybe_convert_numeric_post_floatify_nan(self):
+        # see gh-13314
+        data = np.array(['1.200', '-999.000', '4.500'], dtype=object)
+        expected = np.array([1.2, np.nan, 4.5], dtype=np.float64)
+        nan_values = set([-999, -999.0])
+
+        for coerce_type in (True, False):
+            out = lib.maybe_convert_numeric(data, nan_values, coerce_type)
+            tm.assert_numpy_array_equal(out, expected)
+
 
 class Testisscalar(tm.TestCase):
 

Original file line number	Diff line number	Diff line change
`@@ -291,6 +291,7 @@ Bug Fixes`
`291`	`291`
`292`	`292`
`293`	`293`
	`294`	+- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`)
`294`	`295`	- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)
`295`	`296`
`296`	`297`