Skip to content

Commit 15d8535

Browse files
committed
Merge branch 'issue-3866-column-conversion' of https://github.com/guyrt/pandas into guyrt-issue-3866-column-conversion
Conflicts: doc/source/release.rst pandas/io/tests/test_parsers.py
2 parents b3fee7c + c1836fa commit 15d8535

File tree

4 files changed

+56
-6
lines changed

4 files changed

+56
-6
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,8 @@ Bug Fixes
511511
- ``Timestamp`` objects can now appear in the left hand side of a comparison
512512
operation with a ``Series`` or ``DataFrame`` object (:issue:`4982`).
513513
- Fix a bug when indexing with ``np.nan`` via ``iloc/loc`` (:issue:`5016`)
514+
- Fixed a bug where the low-memory C parser could create different types in different
515+
chunks of the same file. Now coerces to a common numerical type when possible, otherwise issues a ``DtypeWarning``. (:issue:`3866`)
514516

515517
pandas 0.12.0
516518
-------------

pandas/io/common.py

+5
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,15 @@ def urlopen(*args, **kwargs):
3636
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
3737
_VALID_URLS.discard('')
3838

39+
3940
class PerformanceWarning(Warning):
4041
pass
4142

4243

44+
class DtypeWarning(Warning):
45+
pass
46+
47+
4348
def _is_url(url):
4449
"""Check to see if a URL has a valid protocol.
4550

pandas/io/tests/test_parsers.py

+30-6
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from numpy import nan
1414
import numpy as np
15+
from pandas.io.common import DtypeWarning
1516

1617
from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
1718
from pandas.compat import(
@@ -1889,6 +1890,24 @@ def test_usecols_index_col_conflict(self):
18891890
df = pd.read_csv(StringIO(data), usecols=['Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2'])
18901891
tm.assert_frame_equal(expected, df)
18911892

1893+
def test_chunks_have_consistent_numerical_type(self):
1894+
integers = [str(i) for i in range(499999)]
1895+
data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)
1896+
1897+
with tm.assert_produces_warning(False):
1898+
df = self.read_csv(StringIO(data))
1899+
self.assertTrue(type(df.a[0]) is np.float64) # Assert that types were coerced.
1900+
self.assertEqual(df.a.dtype, np.float)
1901+
1902+
def test_warn_if_chunks_have_mismatched_type(self):
1903+
# No warning is expected here; see the test of the same name in TestCParserLowMemory for the warning case.
1904+
integers = [str(i) for i in range(499999)]
1905+
data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)
1906+
1907+
with tm.assert_produces_warning(False):
1908+
df = self.read_csv(StringIO(data))
1909+
self.assertEqual(df.a.dtype, np.object)
1910+
18921911

18931912
class TestPythonParser(ParserTests, unittest.TestCase):
18941913
def test_negative_skipfooter_raises(self):
@@ -2352,7 +2371,6 @@ def test_usecols_dtypes(self):
23522371
self.assertTrue((result.dtypes == [object, np.int, np.float]).all())
23532372
self.assertTrue((result2.dtypes == [object, np.float]).all())
23542373

2355-
23562374
def test_usecols_implicit_index_col(self):
23572375
# #2654
23582376
data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
@@ -2579,16 +2597,22 @@ def test_tokenize_CR_with_quoting(self):
25792597

25802598
def test_raise_on_no_columns(self):
25812599
# single newline
2582-
data = """
2583-
"""
2600+
data = "\n"
25842601
self.assertRaises(ValueError, self.read_csv, StringIO(data))
25852602

25862603
# test with more than a single newline
2587-
data = """
2604+
data = "\n\n\n"
2605+
self.assertRaises(ValueError, self.read_csv, StringIO(data))
25882606

2607+
def test_warn_if_chunks_have_mismatched_type(self):
2608+
# Issue #3866: if chunks have different types and can't
2609+
# be coerced to a common numerical type, then issue a DtypeWarning.
2610+
integers = [str(i) for i in range(499999)]
2611+
data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)
25892612

2590-
"""
2591-
self.assertRaises(ValueError, self.read_csv, StringIO(data))
2613+
with tm.assert_produces_warning(DtypeWarning):
2614+
df = self.read_csv(StringIO(data))
2615+
self.assertEqual(df.a.dtype, np.object)
25922616

25932617

25942618
class TestParseSQL(unittest.TestCase):

pandas/parser.pyx

+19
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@ from libc.stdio cimport fopen, fclose
55
from libc.stdlib cimport malloc, free
66
from libc.string cimport strncpy, strlen, strcmp, strcasecmp
77
cimport libc.stdio as stdio
8+
import warnings
89

910
from cpython cimport (PyObject, PyBytes_FromString,
1011
PyBytes_AsString, PyBytes_Check,
1112
PyUnicode_Check, PyUnicode_AsUTF8String)
13+
from io.common import DtypeWarning
1214

1315

1416
cdef extern from "Python.h":
@@ -1735,11 +1737,28 @@ def _concatenate_chunks(list chunks):
17351737
cdef:
17361738
list names = list(chunks[0].keys())
17371739
object name
1740+
list warning_columns
1741+
object warning_names
1742+
object common_type
17381743

17391744
result = {}
1745+
warning_columns = list()
17401746
for name in names:
17411747
arrs = [chunk.pop(name) for chunk in chunks]
1748+
# Check each arr for consistent types.
1749+
dtypes = set([a.dtype for a in arrs])
1750+
if len(dtypes) > 1:
1751+
common_type = np.find_common_type(dtypes, [])
1752+
if common_type == np.object:
1753+
warning_columns.append(str(name))
17421754
result[name] = np.concatenate(arrs)
1755+
1756+
if warning_columns:
1757+
warning_names = ','.join(warning_columns)
1758+
warning_message = " ".join(["Columns (%s) have mixed types." % warning_names,
1759+
"Specify dtype option on import or set low_memory=False."
1760+
])
1761+
warnings.warn(warning_message, DtypeWarning)
17431762
return result
17441763

17451764
#----------------------------------------------------------------------

0 commit comments

Comments
 (0)