ENH: raise an exception if NA values are found when an explicit integer dtype is passed to read_* functions. Closes #2631

wesm · wesm · commit 5da8df72ff4a · 2013-01-19T19:10:52.000-05:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -57,6 +57,8 @@ pandas 0.10.1
   - ``pivot_table`` aggfunc can be anything used in GroupBy.aggregate (GH2643_)
   - Implement DataFrame merges in case where set cardinalities might overflow
     64-bit integer (GH2690_)
+  - Raise exception in C file parser if an integer dtype is specified and the
+    column contains NA values (GH2631_)
 
 **Bug fixes**
 
@@ -103,6 +105,7 @@ pandas 0.10.1
 .. _GH2616: https://github.com/pydata/pandas/issues/2616
 .. _GH2625: https://github.com/pydata/pandas/issues/2625
 .. _GH2643: https://github.com/pydata/pandas/issues/2643
+.. _GH2631: https://github.com/pydata/pandas/issues/2631
 .. _GH2633: https://github.com/pydata/pandas/issues/2633
 .. _GH2637: https://github.com/pydata/pandas/issues/2637
 .. _GH2690: https://github.com/pydata/pandas/issues/2690
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2007,6 +2007,16 @@ def test_custom_lineterminator(self):
         result = self.assertRaises(ValueError, read_csv, StringIO(data2),
                                    lineterminator='~~')
 
+    def test_raise_on_passed_int_dtype_with_nas(self):
+        # #2631
+        data = """YEAR, DOY, a
+2001,106380451,10
+2001,,11
+2001,106380451,67"""
+        self.assertRaises(Exception, read_csv, StringIO(data), sep=",",
+                          skipinitialspace=True,
+                          dtype={'DOY': np.int64})
+
 
 class TestParseSQL(unittest.TestCase):
 
diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx
@@ -870,7 +870,7 @@ cdef class TextReader:
                         col_dtype = np.dtype(col_dtype).str
 
                 return self._convert_with_dtype(col_dtype, i, start, end,
-                                                na_filter, na_hashset)
+                                                na_filter, 1, na_hashset)
 
         if i in self.noconvert:
             return self._string_convert(i, start, end, na_filter, na_hashset)
@@ -879,10 +879,10 @@ cdef class TextReader:
             for dt in dtype_cast_order:
                 try:
                     col_res, na_count = self._convert_with_dtype(
-                        dt, i, start, end, na_filter, na_hashset)
+                        dt, i, start, end, na_filter, 0, na_hashset)
                 except OverflowError:
                     col_res, na_count = self._convert_with_dtype(
-                        '|O8', i, start, end, na_filter, na_hashset)
+                        '|O8', i, start, end, na_filter, 0, na_hashset)
 
                 if col_res is not None:
                     break
@@ -891,14 +891,16 @@ cdef class TextReader:
 
     cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
                              int start, int end,
-                             bint na_filter, kh_str_t *na_hashset):
+                             bint na_filter,
+                             bint user_dtype,
+                             kh_str_t *na_hashset):
         cdef kh_str_t *true_set, *false_set
 
         if dtype[1] == 'i' or dtype[1] == 'u':
             result, na_count = _try_int64(self.parser, i, start, end,
                                           na_filter, na_hashset)
-            # if na_count > 0:
-            #     raise Exception('Integer column has NA values')
+            if user_dtype and na_count > 0:
+                raise Exception('Integer column has NA values')
 
             if dtype[1:] != 'i8':
                 result = result.astype(dtype)