Skip to content

Commit db08da2

Browse files
gfyoung authored and jorisvandenbossche committed
ERR: Disallow multi-char quotechar for C engine (pandas-dev#15050)
Raise ValueError or issue ParserWarning when a multi-char quotechar is passed in, and the C engine is used. Closes pandas-dev gh-11592.
1 parent de09c98 commit db08da2

File tree

4 files changed

+17
-1
lines changed

4 files changed

+17
-1
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ Other API Changes
246246
- ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`)
247247

248248
- ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`)
249+
- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than one byte (:issue:`11592`)
249250

250251
.. _whatsnew_0200.deprecations:
251252

pandas/io/parsers.py

+11
Original file line numberDiff line numberDiff line change
@@ -841,6 +841,17 @@ def _clean_options(self, options, engine):
841841
encoding=encoding)
842842
engine = 'python'
843843

844+
quotechar = options['quotechar']
845+
if (quotechar is not None and
846+
isinstance(quotechar, (str, compat.text_type, bytes))):
847+
if (len(quotechar) == 1 and ord(quotechar) > 127 and
848+
engine not in ('python', 'python-fwf')):
849+
fallback_reason = ("ord(quotechar) > 127, meaning the "
850+
"quotechar is larger than one byte, "
851+
"and the 'c' engine does not support "
852+
"such quotechars")
853+
engine = 'python'
854+
844855
if fallback_reason and engine_specified:
845856
raise ValueError(fallback_reason)
846857

pandas/io/tests/parser/quoting.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -149,5 +149,5 @@ def test_quotechar_unicode(self):
149149

150150
# Compared to Python 3.x, Python 2.x does not handle unicode well.
151151
if PY3:
152-
result = self.read_csv(StringIO(data), quotechar=u('\u0394'))
152+
result = self.read_csv(StringIO(data), quotechar=u('\u0001'))
153153
tm.assert_frame_equal(result, expected)

pandas/io/tests/parser/test_unsupported.py

+4
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,16 @@ def test_c_engine(self):
5050
sep=None, delim_whitespace=False)
5151
with tm.assertRaisesRegexp(ValueError, msg):
5252
read_table(StringIO(data), engine='c', sep=r'\s')
53+
with tm.assertRaisesRegexp(ValueError, msg):
54+
read_table(StringIO(data), engine='c', quotechar=chr(128))
5355
with tm.assertRaisesRegexp(ValueError, msg):
5456
read_table(StringIO(data), engine='c', skipfooter=1)
5557

5658
# specify C-unsupported options without python-unsupported options
5759
with tm.assert_produces_warning(parsers.ParserWarning):
5860
read_table(StringIO(data), sep=None, delim_whitespace=False)
61+
with tm.assert_produces_warning(parsers.ParserWarning):
62+
read_table(StringIO(data), quotechar=chr(128))
5963
with tm.assert_produces_warning(parsers.ParserWarning):
6064
read_table(StringIO(data), sep=r'\s')
6165
with tm.assert_produces_warning(parsers.ParserWarning):

0 commit comments

Comments
 (0)