BUG: Warn when dtypes differ between chunks in csv parser #4991


Merged: 1 commit, Sep 29, 2013
2 changes: 2 additions & 0 deletions doc/source/release.rst
@@ -480,6 +480,8 @@ Bug Fixes
- Fixed wrong check for overlapping in ``DatetimeIndex.union`` (:issue:`4564`)
- Fixed conflict between thousands separator and date parser in csv_parser (:issue:`4678`)
- Fix appending when dtypes are not the same (error showing mixing float/np.datetime64) (:issue:`4993`)
- Fixed a bug where the low-memory C parser could create different types in different
  chunks of the same file. It now coerces to a common numerical type or issues a
  ``DtypeWarning``. (:issue:`3866`)
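
A short sketch of the user-facing effect (hypothetical data; assumes a Python 3
session and the ``DtypeWarning`` import path added in this PR):

import warnings
from io import StringIO
import pandas as pd
from pandas.io.common import DtypeWarning

# Column "a" is numeric for the first chunk(s) and string-valued later on,
# so the low-memory chunked reader sees conflicting dtypes across chunks.
data = "a\n" + "\n".join(["1"] * 300000 + ["x", "y"] + ["2"] * 300000)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df = pd.read_csv(StringIO(data))  # low_memory=True is the default
# int and str have no common numerical type, so the column falls back to
# object dtype and a DtypeWarning is emitted.
assert df["a"].dtype == object
assert any(issubclass(w.category, DtypeWarning) for w in caught)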

pandas 0.12.0
-------------
5 changes: 5 additions & 0 deletions pandas/io/common.py
@@ -36,10 +36,15 @@ def urlopen(*args, **kwargs):
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard('')


class PerformanceWarning(Warning):
pass


class DtypeWarning(Warning):
pass


def _is_url(url):
"""Check to see if a URL has a valid protocol.

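Since ``DtypeWarning`` is an ordinary ``Warning`` subclass, callers can target
it without silencing anything else; a minimal sketch:

import warnings
from pandas.io.common import DtypeWarning

# Ignore only the mixed-dtype warning from the chunked CSV reader;
# all other warnings still surface normally.
warnings.filterwarnings("ignore", category=DtypeWarning)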
36 changes: 30 additions & 6 deletions pandas/io/tests/test_parsers.py
@@ -11,6 +11,7 @@

from numpy import nan
import numpy as np
from pandas.io.common import DtypeWarning

from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
from pandas.compat import (
@@ -1865,6 +1866,24 @@ def test_parse_integers_above_fp_precision(self):

        self.assertTrue(np.array_equal(result['Numbers'], expected['Numbers']))

    def test_chunks_have_consistent_numerical_type(self):
        integers = [str(i) for i in range(499999)]
        data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

        with tm.assert_produces_warning(False):
            df = self.read_csv(StringIO(data))
        self.assertTrue(type(df.a[0]) is np.float64)  # Assert that types were coerced.
        self.assertEqual(df.a.dtype, np.float)

    def test_warn_if_chunks_have_mismatched_type(self):
        # See test in TestCParserLowMemory.
        integers = [str(i) for i in range(499999)]
        data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)

        with tm.assert_produces_warning(False):
            df = self.read_csv(StringIO(data))
        self.assertEqual(df.a.dtype, np.object)
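
For readers unfamiliar with the helper: ``tm.assert_produces_warning(False)``
asserts that the enclosed block emits no warning at all. A rough approximation
of what the two tests above rely on (not pandas' actual implementation):

import warnings
from contextlib import contextmanager

@contextmanager
def assert_no_warning():
    # Record every warning raised in the block and fail if any appeared.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        yield
    assert not caught, "unexpected warnings: %r" % [str(w.message) for w in caught]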


class TestPythonParser(ParserTests, unittest.TestCase):

@@ -2301,7 +2320,6 @@ def test_usecols_dtypes(self):
        self.assertTrue((result.dtypes == [object, np.int, np.float]).all())
        self.assertTrue((result2.dtypes == [object, np.float]).all())

    def test_usecols_implicit_index_col(self):
        # #2654
        data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
@@ -2528,16 +2546,22 @@ def test_tokenize_CR_with_quoting(self):

    def test_raise_on_no_columns(self):
        # single newline
        data = "\n"
        self.assertRaises(ValueError, self.read_csv, StringIO(data))

        # test with more than a single newline
        data = "\n\n\n"
        self.assertRaises(ValueError, self.read_csv, StringIO(data))

    def test_warn_if_chunks_have_mismatched_type(self):
        # Issue #3866: if chunks hold different types and they can't be
        # coerced to a common numerical type, issue a DtypeWarning.
        integers = [str(i) for i in range(499999)]
        data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)

        with tm.assert_produces_warning(DtypeWarning):
            df = self.read_csv(StringIO(data))
        self.assertEqual(df.a.dtype, np.object)
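
The warning text points at the two ways to avoid the chunk-level guessing
altogether; a sketch of both remedies (same mixed column as in the test;
the exact dtype spelling is illustrative):

from io import StringIO
import pandas as pd

data = "a\n" + "\n".join([str(i) for i in range(499999)] + ['a', 'b']
                         + [str(i) for i in range(499999)])

# Remedy 1: declare the dtype up front, so no per-chunk inference happens.
df1 = pd.read_csv(StringIO(data), dtype={'a': object})

# Remedy 2: disable low-memory chunking, so dtype inference sees the whole
# column at once (at the cost of higher peak memory).
df2 = pd.read_csv(StringIO(data), low_memory=False)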


class TestParseSQL(unittest.TestCase):
19 changes: 19 additions & 0 deletions pandas/parser.pyx
@@ -5,10 +5,12 @@ from libc.stdio cimport fopen, fclose
from libc.stdlib cimport malloc, free
from libc.string cimport strncpy, strlen, strcmp, strcasecmp
cimport libc.stdio as stdio
import warnings

from cpython cimport (PyObject, PyBytes_FromString,
PyBytes_AsString, PyBytes_Check,
PyUnicode_Check, PyUnicode_AsUTF8String)
from pandas.io.common import DtypeWarning


cdef extern from "Python.h":
@@ -1735,11 +1737,28 @@ def _concatenate_chunks(list chunks):
    cdef:
        list names = list(chunks[0].keys())
        object name
        list warning_columns
        object warning_names
        object common_type

    result = {}
    warning_columns = list()
    for name in names:
        arrs = [chunk.pop(name) for chunk in chunks]
        # Check each arr for consistent types.
        dtypes = set([a.dtype for a in arrs])
        if len(dtypes) > 1:
            common_type = np.find_common_type(dtypes, [])
            if common_type == np.object:
Contributor:
don't convert to np.str, leave as np.object. But I think this might need to be a tad more restrictive. If all types are numeric (and none of np.datetime64 or np.timedelta64), then use the common type; else, with mixed types, use np.object. The user can then deal with it. You could do a UserWarning in the 2nd case (e.g. more than 1 type and it's going to be object). See if it triggers at all currently (and we need a test for triggering it); you can use tm.assert_produces_warning.

                warning_columns.append(str(name))
        result[name] = np.concatenate(arrs)

    if warning_columns:
        warning_names = ','.join(warning_columns)
        warning_message = " ".join(["Columns (%s) have mixed types." % warning_names,
                                    "Specify dtype option on import or set low_memory=False."])
        warnings.warn(warning_message, DtypeWarning)
    return result
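
The coercion rule hinges on np.find_common_type: numeric mixes resolve to a
genuine common dtype, while anything mixed with strings collapses to object,
which is exactly the case the code above warns about. A quick illustration
(NumPy-only, independent of the parser):

import numpy as np

# int64 + float64 chunks: a real common numerical type exists,
# so the column is silently upcast to float64.
assert np.find_common_type([np.dtype('int64'), np.dtype('float64')], []) == np.float64

# int64 + object (e.g. strings) chunks: the only common type is object,
# which _concatenate_chunks treats as "mixed" and reports via DtypeWarning.
assert np.find_common_type([np.dtype('int64'), np.dtype('O')], []) == np.object_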

#----------------------------------------------------------------------