Skip to content

Commit c1836fa

Browse files
committed
BUG: Warn when dtypes differ between chunks in csv parser
Closes #3866. Silently fix the problem rather than warning if we can coerce to a numerical type.
1 parent 2b5e525 commit c1836fa

File tree

4 files changed

+56
-6
lines changed

4 files changed

+56
-6
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,8 @@ Bug Fixes
480480
- Fixed wrong check for overlapping in ``DatetimeIndex.union`` (:issue:`4564`)
481481
- Fixed conflict between thousands separator and date parser in csv_parser (:issue:`4678`)
482482
- Fix appending when dtypes are not the same (error showing mixing float/np.datetime64) (:issue:`4993`)
483+
- Fixed a bug where the low-memory C parser could create different types in different
484+
chunks of the same file. Now coerces to a numerical type or issues a warning. (:issue:`3866`)
483485

484486
pandas 0.12.0
485487
-------------

pandas/io/common.py

+5
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,15 @@ def urlopen(*args, **kwargs):
3636
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
3737
_VALID_URLS.discard('')
3838

39+
3940
class PerformanceWarning(Warning):
    """Warning category exposed by this module.

    NOTE(review): judging by the name this presumably flags
    performance-related concerns -- confirm against the call sites.
    """
4142

4243

44+
class DtypeWarning(Warning):
    """Warning category for columns parsed with mixed dtypes.

    The C parser issues it when the chunks of a column have different
    dtypes whose common type is ``object``.
    """
46+
47+
4348
def _is_url(url):
4449
"""Check to see if a URL has a valid protocol.
4550

pandas/io/tests/test_parsers.py

+30-6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from numpy import nan
1313
import numpy as np
14+
from pandas.io.common import DtypeWarning
1415

1516
from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
1617
from pandas.compat import(
@@ -1865,6 +1866,24 @@ def test_parse_integers_above_fp_precision(self):
18651866

18661867
self.assertTrue(np.array_equal(result['Numbers'], expected['Numbers']))
18671868

1869+
def test_chunks_have_consistent_numerical_type(self):
    """Check that a column mixing integers and floats is read as float64.

    The column holds 499999 integers, two floats, then the same integers
    again.  The read is wrapped in ``assert_produces_warning(False)``, so
    it is expected to complete without emitting a warning.
    """
    integers = [str(i) for i in range(499999)]
    data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)

    with tm.assert_produces_warning(False):
        df = self.read_csv(StringIO(data))
    # The first value was written as an integer, so a float64 scalar here
    # shows that the types were coerced.
    self.assertTrue(isinstance(df.a[0], np.float64))
    # Spell the dtype explicitly: the ``np.float`` alias no longer exists
    # in newer NumPy releases.
    self.assertEqual(df.a.dtype, np.float64)
1877+
1878+
def test_warn_if_chunks_have_mismatched_type(self):
    """Check that a column mixing integers and strings is read as object.

    This variant expects *no* warning (``assert_produces_warning(False)``).
    See the test of the same name in TestCParserLowMemory for the case
    where a warning is expected.
    """
    integers = [str(i) for i in range(499999)]
    data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)

    with tm.assert_produces_warning(False):
        df = self.read_csv(StringIO(data))
    # Builtin ``object`` instead of the ``np.object`` alias, which newer
    # NumPy releases removed; the comparison is equivalent.
    self.assertEqual(df.a.dtype, object)
1886+
18681887

18691888
class TestPythonParser(ParserTests, unittest.TestCase):
18701889

@@ -2301,7 +2320,6 @@ def test_usecols_dtypes(self):
23012320
self.assertTrue((result.dtypes == [object, np.int, np.float]).all())
23022321
self.assertTrue((result2.dtypes == [object, np.float]).all())
23032322

2304-
23052323
def test_usecols_implicit_index_col(self):
23062324
# #2654
23072325
data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
@@ -2528,16 +2546,22 @@ def test_tokenize_CR_with_quoting(self):
25282546

25292547
def test_raise_on_no_columns(self):
    """Check that input made only of newlines raises ValueError."""
    # A single newline first, then more than a single newline.
    for data in ("\n", "\n\n\n"):
        self.assertRaises(ValueError, self.read_csv, StringIO(data))
25372555

2556+
def test_warn_if_chunks_have_mismatched_type(self):
    """Check that mixed int/str chunks trigger a DtypeWarning.

    Issue #3866: if chunks have different types and cannot be coerced
    using numerical types, a warning is issued and the column is read
    as object.
    """
    integers = [str(i) for i in range(499999)]
    data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)

    with tm.assert_produces_warning(DtypeWarning):
        df = self.read_csv(StringIO(data))
    # Builtin ``object`` instead of the ``np.object`` alias, which newer
    # NumPy releases removed; the comparison is equivalent.
    self.assertEqual(df.a.dtype, object)
25412565

25422566

25432567
class TestParseSQL(unittest.TestCase):

pandas/parser.pyx

+19
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@ from libc.stdio cimport fopen, fclose
55
from libc.stdlib cimport malloc, free
66
from libc.string cimport strncpy, strlen, strcmp, strcasecmp
77
cimport libc.stdio as stdio
8+
import warnings
89

910
from cpython cimport (PyObject, PyBytes_FromString,
1011
PyBytes_AsString, PyBytes_Check,
1112
PyUnicode_Check, PyUnicode_AsUTF8String)
13+
from io.common import DtypeWarning
1214

1315

1416
cdef extern from "Python.h":
def _concatenate_chunks(list chunks):
    """Merge per-chunk column dicts into one dict of concatenated arrays.

    Parameters
    ----------
    chunks : list of dict
        One dict per parsed chunk, mapping a column name to that chunk's
        array.  The column names are taken from the first chunk, and each
        named column is popped from every chunk dict.

    Returns
    -------
    dict
        Maps each column name to the concatenation of its chunk arrays.

    Warns
    -----
    DtypeWarning
        If the chunk arrays of a column have different dtypes and the
        common type of those dtypes is ``object``.  When the common type
        is not ``object`` no warning is issued.
    """
    cdef:
        list names = list(chunks[0].keys())
        object name
        list warning_columns
        object warning_names
        object common_type

    result = {}
    warning_columns = []
    for name in names:
        arrs = [chunk.pop(name) for chunk in chunks]
        # Check each arr for consistent types.
        dtypes = {a.dtype for a in arrs}
        if len(dtypes) > 1:
            common_type = np.find_common_type(list(dtypes), [])
            # Compare against builtin ``object``; the ``np.object`` alias
            # was removed from newer NumPy releases.
            if common_type == object:
                warning_columns.append(str(name))
        result[name] = np.concatenate(arrs)

    if warning_columns:
        warning_names = ','.join(warning_columns)
        warning_message = " ".join(["Columns (%s) have mixed types." % warning_names,
                                    "Specify dtype option on import or set low_memory=False."
                                    ])
        warnings.warn(warning_message, DtypeWarning)
    return result
17441763

17451764
#----------------------------------------------------------------------

0 commit comments

Comments
 (0)