BUG: Warn when dtypes differ between chunks in csv parser

guyrt · guyrt · commit 3104b4318e31 · 2013-09-26T23:30:34.000-04:00
closes pandas-dev#3866. Silently fix the problem rather than warning if we can coerce to a numerical type.
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -480,6 +480,8 @@ Bug Fixes
   - Fixed wrong check for overlapping in ``DatetimeIndex.union`` (:issue:`4564`)
   - Fixed conflict between thousands separator and date parser in csv_parser (:issue:`4678`)
   - Fix appending when dtypes are not the same (error showing mixing float/np.datetime64) (:issue:`4993`)
+  - Fixed a bug where the low-memory C parser could create different types in different
+    chunks of the same file. Now coerces to a numerical type or issues a warning. (:issue:`3866`)
 
 pandas 0.12.0
 -------------
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -8,6 +8,7 @@
 import re
 import unittest
 import nose
+import warnings
 
 from numpy import nan
 import numpy as np
@@ -2301,7 +2302,6 @@ def test_usecols_dtypes(self):
         self.assertTrue((result.dtypes == [object, np.int, np.float]).all())
         self.assertTrue((result2.dtypes == [object, np.float]).all())
 
-
     def test_usecols_implicit_index_col(self):
         # #2654
         data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
@@ -2528,17 +2528,36 @@ def test_tokenize_CR_with_quoting(self):
 
     def test_raise_on_no_columns(self):
         # single newline
-        data = """
-"""
+        data = "\n"
         self.assertRaises(ValueError, self.read_csv, StringIO(data))
 
         # test with more than a single newline
-        data = """
-
-
-"""
+        data = "\n\n\n"
         self.assertRaises(ValueError, self.read_csv, StringIO(data))
 
+    def test_chunks_have_consistent_numerical_type(self):
+        # Issue #3866 If chunks are different types and *can*
+        # be coerced using numerical types, then do so.
+        integers = [str(i) for i in range(499999)]
+        data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)
+
+        with warnings.catch_warnings(record=True) as w:
+            df = self.read_csv(StringIO(data), low_memory=True)
+            if len(w) > 0:
+                self.fail("Unexpected warning raised.")
+        self.assertTrue(type(df.a[0]) is np.float64)  # Assert that types were coerced.
+        self.assertEqual(df.a.dtype, np.float)
+
+    def test_warn_if_chunks_have_mismatched_type(self):
+        # Issue #3866 If chunks are different types and can't
+        # be coerced using numerical types, then issue warning.
+        integers = [str(i) for i in range(499999)]
+        data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)
+
+        df = self.read_csv(StringIO(data), low_memory=True)
+        tm.assert_produces_warning()
+        self.assertEqual(df.a.dtype, np.object)
+
 
 class TestParseSQL(unittest.TestCase):
 
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -1739,6 +1739,15 @@ def _concatenate_chunks(list chunks):
     result = {}
     for name in names:
         arrs = [chunk.pop(name) for chunk in chunks]
+        # Check each arr for consistent types.
+        dtypes = set([a.dtype for a in arrs])
+        if len(dtypes) > 1:
+            common_type = np.find_common_type(dtypes, [])
+            if common_type == np.object:
+                warning_message = " ".join(["Column %s has mixed types." % name,
+                    "Specify dtype option on import or set low_memory=False."
+                  ])
+                print >> sys.stderr, warning_message
         result[name] = np.concatenate(arrs)
     return result