Merge branch 'issue-3866-column-conversion' of https://github.com/guyrt/pandas into guyrt-issue-3866-column-conversion

jreback · jreback · commit 15d8535185f3 · 2013-09-29T15:26:46.000-04:00
Conflicts:
	doc/source/release.rst
	pandas/io/tests/test_parsers.py
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -511,6 +511,8 @@ Bug Fixes
   - ``Timestamp`` objects can now appear in the left hand side of a comparison
     operation with a ``Series`` or ``DataFrame`` object (:issue:`4982`).
   - Fix a bug when indexing with ``np.nan`` via ``iloc/loc`` (:issue:`5016`)
+  - Fixed a bug where the low-memory C parser could create different types in different
+    chunks of the same file. It now coerces to a numerical type or issues a ``DtypeWarning``. (:issue:`3866`)
 
 pandas 0.12.0
 -------------
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -36,10 +36,15 @@ def urlopen(*args, **kwargs):
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard('')
 
+
 class PerformanceWarning(Warning):
     pass
 
 
+class DtypeWarning(Warning):
+    pass
+
+
 def _is_url(url):
     """Check to see if a URL has a valid protocol.
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -12,6 +12,7 @@
 
 from numpy import nan
 import numpy as np
+from pandas.io.common import DtypeWarning
 
 from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
 from pandas.compat import(
@@ -1889,6 +1890,24 @@ def test_usecols_index_col_conflict(self):
         df = pd.read_csv(StringIO(data), usecols=['Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2'])
         tm.assert_frame_equal(expected, df)
 
+    def test_chunks_have_consistent_numerical_type(self):
+        integers = [str(i) for i in range(499999)]
+        data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)
+
+        with tm.assert_produces_warning(False):
+            df = self.read_csv(StringIO(data))
+        self.assertTrue(type(df.a[0]) is np.float64)  # Assert that types were coerced.
+        self.assertEqual(df.a.dtype, np.float)
+
+    def test_warn_if_chunks_have_mismatched_type(self):
+        # See test in TestCParserLowMemory.
+        integers = [str(i) for i in range(499999)]
+        data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)
+
+        with tm.assert_produces_warning(False):
+            df = self.read_csv(StringIO(data))
+        self.assertEqual(df.a.dtype, np.object)
+
 
 class TestPythonParser(ParserTests, unittest.TestCase):
     def test_negative_skipfooter_raises(self):
@@ -2352,7 +2371,6 @@ def test_usecols_dtypes(self):
         self.assertTrue((result.dtypes == [object, np.int, np.float]).all())
         self.assertTrue((result2.dtypes == [object, np.float]).all())
 
-
     def test_usecols_implicit_index_col(self):
         # #2654
         data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
@@ -2579,16 +2597,22 @@ def test_tokenize_CR_with_quoting(self):
 
     def test_raise_on_no_columns(self):
         # single newline
-        data = """
-"""
+        data = "\n"
         self.assertRaises(ValueError, self.read_csv, StringIO(data))
 
         # test with more than a single newline
-        data = """
+        data = "\n\n\n"
+        self.assertRaises(ValueError, self.read_csv, StringIO(data))
 
+    def test_warn_if_chunks_have_mismatched_type(self):
+        # Issue #3866 If chunks are different types and can't
+        # be coerced using numerical types, then issue warning.
+        integers = [str(i) for i in range(499999)]
+        data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)
 
-"""
-        self.assertRaises(ValueError, self.read_csv, StringIO(data))
+        with tm.assert_produces_warning(DtypeWarning):
+            df = self.read_csv(StringIO(data))
+        self.assertEqual(df.a.dtype, np.object)
 
 
 class TestParseSQL(unittest.TestCase):
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -5,10 +5,12 @@ from libc.stdio cimport fopen, fclose
 from libc.stdlib cimport malloc, free
 from libc.string cimport strncpy, strlen, strcmp, strcasecmp
 cimport libc.stdio as stdio
+import warnings
 
 from cpython cimport (PyObject, PyBytes_FromString,
                       PyBytes_AsString, PyBytes_Check,
                       PyUnicode_Check, PyUnicode_AsUTF8String)
+from io.common import DtypeWarning
 
 
 cdef extern from "Python.h":
@@ -1735,11 +1737,28 @@ def _concatenate_chunks(list chunks):
     cdef:
         list names = list(chunks[0].keys())
         object name
+        list warning_columns
+        object warning_names
+        object common_type
 
     result = {}
+    warning_columns = list()
     for name in names:
         arrs = [chunk.pop(name) for chunk in chunks]
+        # Check each arr for consistent types.
+        dtypes = set([a.dtype for a in arrs])
+        if len(dtypes) > 1:
+            common_type = np.find_common_type(dtypes, [])
+            if common_type == np.object:
+                warning_columns.append(str(name))
         result[name] = np.concatenate(arrs)
+
+    if warning_columns:
+        warning_names = ','.join(warning_columns)
+        warning_message = " ".join(["Columns (%s) have mixed types." % warning_names,
+            "Specify dtype option on import or set low_memory=False."
+          ])
+        warnings.warn(warning_message, DtypeWarning)
     return result
 
 #----------------------------------------------------------------------