Skip to content

Commit 3104b43

Browse files
committed
BUG: Warn when dtypes differ between chunks in csv parser
Closes pandas-dev#3866. Silently fix the problem rather than warning if we can coerce to a numerical type.
1 parent 2b5e525 commit 3104b43

File tree

3 files changed

+37
-7
lines changed

3 files changed

+37
-7
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,8 @@ Bug Fixes
480480
- Fixed wrong check for overlapping in ``DatetimeIndex.union`` (:issue:`4564`)
481481
- Fixed conflict between thousands separator and date parser in csv_parser (:issue:`4678`)
482482
- Fix appending when dtypes are not the same (error showing mixing float/np.datetime64) (:issue:`4993`)
483+
- Fixed a bug where the low-memory C parser could create different types in different
484+
chunks of the same file. Now coerces to a numerical type or raises a warning. (:issue:`3866`)
483485

484486
pandas 0.12.0
485487
-------------

pandas/io/tests/test_parsers.py

+26-7
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import re
99
import unittest
1010
import nose
11+
import warnings
1112

1213
from numpy import nan
1314
import numpy as np
@@ -2301,7 +2302,6 @@ def test_usecols_dtypes(self):
23012302
self.assertTrue((result.dtypes == [object, np.int, np.float]).all())
23022303
self.assertTrue((result2.dtypes == [object, np.float]).all())
23032304

2304-
23052305
def test_usecols_implicit_index_col(self):
23062306
# #2654
23072307
data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
@@ -2528,17 +2528,36 @@ def test_tokenize_CR_with_quoting(self):
25282528

25292529
def test_raise_on_no_columns(self):
25302530
# single newline
2531-
data = """
2532-
"""
2531+
data = "\n"
25332532
self.assertRaises(ValueError, self.read_csv, StringIO(data))
25342533

25352534
# test with more than a single newline
2536-
data = """
2537-
2538-
2539-
"""
2535+
data = "\n\n\n"
25402536
self.assertRaises(ValueError, self.read_csv, StringIO(data))
25412537

2538+
def test_chunks_have_consistent_numerical_type(self):
    """Mixed int/float chunks of one column are coerced without a warning.

    Issue #3866: if chunks come back with different dtypes that *can* be
    coerced using numerical types, then do so silently.
    """
    # Two float literals sit between two long runs of integers.
    # NOTE(review): this presumably puts the floats in a different chunk
    # from most of the integers; that depends on the parser's chunk size,
    # which is not visible here -- confirm.
    integers = [str(i) for i in range(499999)]
    data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

    with warnings.catch_warnings(record=True) as w:
        # Record every warning. Without this, a warning hidden by the
        # active filters (or already noted in a module's warning registry)
        # never reaches ``w`` and the check below passes vacuously.
        warnings.simplefilter("always")
        df = self.read_csv(StringIO(data), low_memory=True)

    if w:
        self.fail("Unexpected warning raised.")

    # Assert that types were coerced.
    self.assertTrue(type(df.a[0]) is np.float64)
    # Spell the dtype explicitly: the ``np.float`` alias is deprecated and
    # removed in newer NumPy releases, while ``np.float64`` works everywhere.
    self.assertEqual(df.a.dtype, np.float64)
2550+
2551+
def test_warn_if_chunks_have_mismatched_type(self):
    """Chunks that cannot be coerced numerically trigger the mixed-type message.

    Issue #3866: if chunks are different types and can't be coerced using
    numerical types, the parser reports it and the column ends up as
    ``object``.
    """
    # Local import: only part of the module's import block is visible here.
    import sys

    # Two non-numeric values sit between two long runs of integers.
    integers = [str(i) for i in range(499999)]
    data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)

    # The chunk-concatenation code in this change reports the mismatch by
    # printing to ``sys.stderr`` rather than through the ``warnings`` module,
    # so capture stderr for the duration of the read. The previous bare
    # ``tm.assert_produces_warning()`` call was never entered as a context
    # manager and therefore verified nothing.
    captured = StringIO()
    saved_stderr = sys.stderr
    sys.stderr = captured
    try:
        df = self.read_csv(StringIO(data), low_memory=True)
    finally:
        # Always restore stderr, even if the read raises.
        sys.stderr = saved_stderr

    self.assertTrue("has mixed types" in captured.getvalue())
    # The builtin ``object`` compares equal to the object dtype and avoids
    # the ``np.object`` alias, which newer NumPy releases removed.
    self.assertEqual(df.a.dtype, object)
2560+
25422561

25432562
class TestParseSQL(unittest.TestCase):
25442563

pandas/parser.pyx

+9
Original file line numberDiff line numberDiff line change
@@ -1739,6 +1739,15 @@ def _concatenate_chunks(list chunks):
17391739
result = {}
17401740
for name in names:
17411741
arrs = [chunk.pop(name) for chunk in chunks]
1742+
# Check each arr for consistent types.
1743+
dtypes = set([a.dtype for a in arrs])
1744+
if len(dtypes) > 1:
1745+
common_type = np.find_common_type(dtypes, [])
1746+
if common_type == np.object:
1747+
warning_message = " ".join(["Column %s has mixed types." % name,
1748+
"Specify dtype option on import or set low_memory=False."
1749+
])
1750+
print >> sys.stderr, warning_message
17421751
result[name] = np.concatenate(arrs)
17431752
return result
17441753

0 commit comments

Comments
 (0)