API: Warn or raise for > 1 char encoded sep

gfyoung · gfyoung · commit 152b6851120d · 2016-08-29T22:42:38.000-04:00
The system file encoding can cause a separator to be encoded as more than one character even though it may be provided as one character. Multi-char separators are not supported by the C engine, so we need to catch this case. Closes gh-14065.
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -457,6 +457,7 @@ API changes
 - ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`)
 - ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`)
 - ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`)
+- ``pd.read_csv()`` in the C engine will now issue a ``ParserWarning`` or raise a ``ValueError`` when the encoded ``sep`` is more than one character long (:issue:`14065`)
 - ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`)
 - ``Series.unique()`` with datetime and timezone now returns return array of ``Timestamp`` with timezone (:issue:`13565`)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -5,6 +5,7 @@
 from collections import defaultdict
 import re
 import csv
+import sys
 import warnings
 import datetime
 
@@ -782,6 +783,7 @@ def _clean_options(self, options, engine):
                                   " skipfooter"
                 engine = 'python'
 
+        encoding = sys.getfilesystemencoding() or 'utf-8'
         if sep is None and not delim_whitespace:
             if engine == 'c':
                 fallback_reason = "the 'c' engine does not support"\
@@ -798,6 +800,14 @@ def _clean_options(self, options, engine):
                                   " different from '\s+' are"\
                                   " interpreted as regex)"
                 engine = 'python'
+
+        elif len(sep.encode(encoding)) > 1:
+            if engine not in ('python', 'python-fwf'):
+                fallback_reason = "the separator encoded in {encoding}"\
+                                  " is > 1 char long, and the 'c' engine"\
+                                  " does not support such separators".format(
+                                      encoding=encoding)
+                engine = 'python'
         elif delim_whitespace:
             if 'python' in engine:
                 result['delimiter'] = '\s+'
diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py
@@ -60,6 +60,8 @@ def test_c_engine(self):
                        sep=None, delim_whitespace=False)
         with tm.assertRaisesRegexp(ValueError, msg):
             read_table(StringIO(data), engine='c', sep='\s')
+        with tm.assertRaisesRegexp(ValueError, msg):
+            read_table(StringIO(data), engine='c', sep='§')
         with tm.assertRaisesRegexp(ValueError, msg):
             read_table(StringIO(data), engine='c', skipfooter=1)