diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 45cdd23140487..0262463af40d7 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -457,6 +457,7 @@ API changes - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) +- ``pd.read_csv()`` in the C engine will now issue a ``ParserWarning`` or raise a ``ValueError`` when ``sep`` encoded is more than one character long (:issue:`14065`) - ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`) - ``Series.unique()`` with datetime and timezone now returns return array of ``Timestamp`` with timezone (:issue:`13565`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 62f2ad1419d92..3bd8579d456d3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -5,6 +5,7 @@ from collections import defaultdict import re import csv +import sys import warnings import datetime @@ -782,6 +783,7 @@ def _clean_options(self, options, engine): " skipfooter" engine = 'python' + encoding = sys.getfilesystemencoding() or 'utf-8' if sep is None and not delim_whitespace: if engine == 'c': fallback_reason = "the 'c' engine does not support"\ @@ -798,6 +800,14 @@ def _clean_options(self, options, engine): " different from '\s+' are"\ " interpreted as regex)" engine = 'python' + + elif len(sep.encode(encoding)) > 1: + if engine not in ('python', 'python-fwf'): + fallback_reason = "the separator encoded in {encoding}"\ + " is > 1 char long, and the 'c' engine"\ + " does not support such separators".format( + encoding=encoding) + engine = 'python' elif delim_whitespace: if 'python' in engine: result['delimiter'] = '\s+' diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index ef8f7967193ff..e575843a7fc22 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -60,6 +60,8 @@ def test_c_engine(self): sep=None, delim_whitespace=False) with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', sep='\s') + with tm.assertRaisesRegexp(ValueError, msg): + read_table(StringIO(data), engine='c', sep='ยง') with tm.assertRaisesRegexp(ValueError, msg): read_table(StringIO(data), engine='c', skipfooter=1)