[Backport pandas-dev#14492] BUG: Accept unicode quotechars again in pd.read_csv

gfyoung · jorisvandenbossche · commit 64400670e60a · 2016-11-02T11:49:47.000+01:00
Restores support for unicode quotechar values in pd.read_csv, which had regressed. The bug affects Python 2.x only. Closes pandas-dev#14477. Author: gfyoung <gfyoung17@gmail.com>. Closes pandas-dev#14492 from gfyoung/quotechar-unicode-2.x and squashes the following commits: ec9f59a [gfyoung] BUG: Accept unicode quotechars again in pd.read_csv (cherry picked from commit 6130e77)
diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
@@ -36,6 +36,7 @@ Bug Fixes
 - Compat with Cython 0.25 for building (:issue:`14496`)
 
 
+- Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`)
 - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`)
 - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1759,6 +1759,9 @@ def __init__(self, f, **kwds):
         self.delimiter = kwds['delimiter']
 
         self.quotechar = kwds['quotechar']
+        if isinstance(self.quotechar, compat.text_type):
+            self.quotechar = str(self.quotechar)
+
         self.escapechar = kwds['escapechar']
         self.doublequote = kwds['doublequote']
         self.skipinitialspace = kwds['skipinitialspace']
diff --git a/pandas/io/tests/parser/quoting.py b/pandas/io/tests/parser/quoting.py
@@ -9,7 +9,7 @@
 import pandas.util.testing as tm
 
 from pandas import DataFrame
-from pandas.compat import StringIO
+from pandas.compat import PY3, StringIO, u
 
 
 class QuotingTests(object):
@@ -138,3 +138,16 @@ def test_double_quote(self):
         result = self.read_csv(StringIO(data), quotechar='"',
                                doublequote=False)
         tm.assert_frame_equal(result, expected)
+
+    def test_quotechar_unicode(self):
+        # See gh-14477
+        data = 'a\n1'
+        expected = DataFrame({'a': [1]})
+
+        result = self.read_csv(StringIO(data), quotechar=u('"'))
+        tm.assert_frame_equal(result, expected)
+
+        # Python 2.x cannot coerce non-ASCII unicode to str; test 3.x only.
+        if PY3:
+            result = self.read_csv(StringIO(data), quotechar=u('\u0394'))
+            tm.assert_frame_equal(result, expected)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -570,7 +570,8 @@ cdef class TextReader:
         if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
             raise TypeError('bad "quoting" value')
 
-        if not isinstance(quote_char, (str, bytes)) and quote_char is not None:
+        if not isinstance(quote_char, (str, compat.text_type,
+                                       bytes)) and quote_char is not None:
             dtype = type(quote_char).__name__
             raise TypeError('"quotechar" must be string, '
                             'not {dtype}'.format(dtype=dtype))