diff --git a/doc/source/release.rst b/doc/source/release.rst index ce08a1ca0a175..810889cbc4b26 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -480,6 +480,8 @@ Bug Fixes - Fixed wrong check for overlapping in ``DatetimeIndex.union`` (:issue:`4564`) - Fixed conflict between thousands separator and date parser in csv_parser (:issue:`4678`) - Fix appending when dtypes are not the same (error showing mixing float/np.datetime64) (:issue:`4993`) + - Fixed a bug where low memory c parser could create different types in different + chunks of the same file. Now coerces to numerical type or raises warning. (:issue:`3866`) pandas 0.12.0 ------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index 02242c5a91493..aa5fdb29f3b5b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -36,10 +36,15 @@ def urlopen(*args, **kwargs): _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard('') + class PerformanceWarning(Warning): pass +class DtypeWarning(Warning): + pass + + def _is_url(url): """Check to see if a URL has a valid protocol. diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 48c47238aec6f..24ec88cff727b 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -11,6 +11,7 @@ from numpy import nan import numpy as np +from pandas.io.common import DtypeWarning from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex from pandas.compat import( @@ -1865,6 +1866,24 @@ def test_parse_integers_above_fp_precision(self): self.assertTrue(np.array_equal(result['Numbers'], expected['Numbers'])) + def test_chunks_have_consistent_numerical_type(self): + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) + + with tm.assert_produces_warning(False): + df = self.read_csv(StringIO(data)) + self.assertTrue(type(df.a[0]) is np.float64) # Assert that types were coerced. 
+ self.assertEqual(df.a.dtype, np.float) + + def test_warn_if_chunks_have_mismatched_type(self): + # See test in TestCParserLowMemory. + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ['a', 'b'] + integers) + + with tm.assert_produces_warning(False): + df = self.read_csv(StringIO(data)) + self.assertEqual(df.a.dtype, np.object) + class TestPythonParser(ParserTests, unittest.TestCase): @@ -2301,7 +2320,6 @@ def test_usecols_dtypes(self): self.assertTrue((result.dtypes == [object, np.int, np.float]).all()) self.assertTrue((result2.dtypes == [object, np.float]).all()) - def test_usecols_implicit_index_col(self): # #2654 data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' @@ -2528,16 +2546,22 @@ def test_tokenize_CR_with_quoting(self): def test_raise_on_no_columns(self): # single newline - data = """ -""" + data = "\n" self.assertRaises(ValueError, self.read_csv, StringIO(data)) # test with more than a single newline - data = """ + data = "\n\n\n" + self.assertRaises(ValueError, self.read_csv, StringIO(data)) + def test_warn_if_chunks_have_mismatched_type(self): + # Issue #3866 If chunks are different types and can't + # be coerced using numerical types, then issue warning. 
+ integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ['a', 'b'] + integers) -""" - self.assertRaises(ValueError, self.read_csv, StringIO(data)) + with tm.assert_produces_warning(DtypeWarning): + df = self.read_csv(StringIO(data)) + self.assertEqual(df.a.dtype, np.object) class TestParseSQL(unittest.TestCase): diff --git a/pandas/parser.pyx b/pandas/parser.pyx index b97929023adb6..d08c020c9e9bc 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -5,10 +5,12 @@ from libc.stdio cimport fopen, fclose from libc.stdlib cimport malloc, free from libc.string cimport strncpy, strlen, strcmp, strcasecmp cimport libc.stdio as stdio +import warnings from cpython cimport (PyObject, PyBytes_FromString, PyBytes_AsString, PyBytes_Check, PyUnicode_Check, PyUnicode_AsUTF8String) +from pandas.io.common import DtypeWarning cdef extern from "Python.h": @@ -1735,11 +1737,28 @@ def _concatenate_chunks(list chunks): cdef: list names = list(chunks[0].keys()) object name + list warning_columns + object warning_names + object common_type result = {} + warning_columns = list() for name in names: arrs = [chunk.pop(name) for chunk in chunks] + # Check each arr for consistent types. + dtypes = set([a.dtype for a in arrs]) + if len(dtypes) > 1: + common_type = np.find_common_type(dtypes, []) + if common_type == np.object: + warning_columns.append(str(name)) result[name] = np.concatenate(arrs) + + if warning_columns: + warning_names = ','.join(warning_columns) + warning_message = " ".join(["Columns (%s) have mixed types." % warning_names, + "Specify dtype option on import or set low_memory=False." + ]) + warnings.warn(warning_message, DtypeWarning) return result #----------------------------------------------------------------------