BUG: Fix inconsistent C engine quoting behaviour

gfyoung · gfyoung · commit 0e791a5a4b59 · 2016-06-17T11:05:36.000-04:00
1) Add significant testing to quoting in read_csv

2) Fix bug in C engine in which a NULL quotechar
would raise even though quoting=csv.QUOTE_NONE

3) Fix bug in C engine in which quoting=csv.QUOTE_
NONNUMERIC wouldn't cause non-quoted fields to be
cast to float.

4) Fixed minor doc error for quoting parameter, as
the default value is ZERO not None (this will raise
an error in fact).
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -287,11 +287,10 @@ lineterminator : str (length 1), default ``None``
 quotechar : str (length 1)
   The character used to denote the start and end of a quoted item. Quoted items
   can include the delimiter and it will be ignored.
-quoting : int or ``csv.QUOTE_*`` instance, default ``None``
+quoting : int or ``csv.QUOTE_*`` instance, default ``0``
   Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
   ``QUOTE_MINIMAL`` (0), ``QUOTE_ALL`` (1), ``QUOTE_NONNUMERIC`` (2) or
-  ``QUOTE_NONE`` (3). Default (``None``) results in ``QUOTE_MINIMAL``
-  behavior.
+  ``QUOTE_NONE`` (3).
 doublequote : boolean, default ``True``
    When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``,
    indicate whether or not to interpret two consecutive ``quotechar`` elements
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -492,6 +492,8 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`)
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`)
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`)
+- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
+- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)
 
 
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -202,10 +202,9 @@
 quotechar : str (length 1), optional
     The character used to denote the start and end of a quoted item. Quoted
     items can include the delimiter and it will be ignored.
-quoting : int or csv.QUOTE_* instance, default None
+quoting : int or csv.QUOTE_* instance, default 0
     Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
     QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
-    Default (None) results in QUOTE_MINIMAL behavior.
 doublequote : boolean, default ``True``
    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive quotechar elements INSIDE a
diff --git a/pandas/io/tests/parser/quoting.py b/pandas/io/tests/parser/quoting.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that quoting specifications are properly handled
+during parsing for all of the parsers defined in parsers.py
+"""
+
+import csv
+import pandas.util.testing as tm
+
+from pandas import DataFrame
+from pandas.compat import StringIO
+
+
+class QuotingTests(object):
+
+    def test_bad_quote_char(self):
+        data = '1,2,3'
+
+        # Python 2.x: "...must be an 1-character..."
+        # Python 3.x: "...must be a 1-character..."
+        msg = '"quotechar" must be a(n)? 1-character string'
+        tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
+                              StringIO(data), quotechar='foo')
+
+        msg = 'quotechar must be set if quoting enabled'
+        tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
+                              StringIO(data), quotechar=None,
+                              quoting=csv.QUOTE_MINIMAL)
+
+        msg = '"quotechar" must be string, not int'
+        tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
+                              StringIO(data), quotechar=2)
+
+    def test_bad_quoting(self):
+        data = '1,2,3'
+
+        msg = '"quoting" must be an integer'
+        tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
+                              StringIO(data), quoting='foo')
+
+        # quoting must in the range [0, 3]
+        msg = 'bad "quoting" value'
+        tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
+                              StringIO(data), quoting=5)
+
+    def test_quote_char_basic(self):
+        data = 'a,b,c\n1,2,"cat"'
+        expected = DataFrame([[1, 2, 'cat']],
+                             columns=['a', 'b', 'c'])
+        result = self.read_csv(StringIO(data), quotechar='"')
+        tm.assert_frame_equal(result, expected)
+
+    def test_quote_char_various(self):
+        data = 'a,b,c\n1,2,"cat"'
+        expected = DataFrame([[1, 2, 'cat']],
+                             columns=['a', 'b', 'c'])
+        quote_chars = ['~', '*', '%', '$', '@', 'P']
+
+        for quote_char in quote_chars:
+            new_data = data.replace('"', quote_char)
+            result = self.read_csv(StringIO(new_data), quotechar=quote_char)
+            tm.assert_frame_equal(result, expected)
+
+    def test_null_quote_char(self):
+        data = 'a,b,c\n1,2,3'
+
+        # sanity checks
+        msg = 'quotechar must be set if quoting enabled'
+
+        tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
+                              StringIO(data), quotechar=None,
+                              quoting=csv.QUOTE_MINIMAL)
+
+        tm.assertRaisesRegexp(TypeError, msg, self.read_csv,
+                              StringIO(data), quotechar='',
+                              quoting=csv.QUOTE_MINIMAL)
+
+        # no errors should be raised if quoting is None
+        expected = DataFrame([[1, 2, 3]],
+                             columns=['a', 'b', 'c'])
+
+        result = self.read_csv(StringIO(data), quotechar=None,
+                               quoting=csv.QUOTE_NONE)
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data), quotechar='',
+                               quoting=csv.QUOTE_NONE)
+        tm.assert_frame_equal(result, expected)
+
+    def test_quoting_various(self):
+        data = '1,2,"foo"'
+        cols = ['a', 'b', 'c']
+
+        # QUOTE_MINIMAL and QUOTE_ALL apply only to
+        # the CSV writer, so they should have no
+        # special effect for the CSV reader
+        expected = DataFrame([[1, 2, 'foo']], columns=cols)
+
+        # test default (afterwards, arguments are all explicit)
+        result = self.read_csv(StringIO(data), names=cols)
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               quoting=csv.QUOTE_MINIMAL, names=cols)
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               quoting=csv.QUOTE_ALL, names=cols)
+        tm.assert_frame_equal(result, expected)
+
+        # QUOTE_NONE tells the reader to do no special handling
+        # of quote characters and leave them alone
+        expected = DataFrame([[1, 2, '"foo"']], columns=cols)
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               quoting=csv.QUOTE_NONE, names=cols)
+        tm.assert_frame_equal(result, expected)
+
+        # QUOTE_NONNUMERIC tells the reader to cast
+        # all non-quoted fields to float
+        expected = DataFrame([[1.0, 2.0, 'foo']], columns=cols)
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               quoting=csv.QUOTE_NONNUMERIC,
+                               names=cols)
+        tm.assert_frame_equal(result, expected)
+
+    def test_double_quote(self):
+        data = 'a,b\n3,"4 "" 5"'
+
+        expected = DataFrame([[3, '4 " 5']],
+                             columns=['a', 'b'])
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               doublequote=True)
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame([[3, '4 " 5"']],
+                             columns=['a', 'b'])
+        result = self.read_csv(StringIO(data), quotechar='"',
+                               doublequote=False)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py
@@ -11,6 +11,7 @@
 from .common import ParserTests
 from .header import HeaderTests
 from .comment import CommentTests
+from .quoting import QuotingTests
 from .usecols import UsecolsTests
 from .skiprows import SkipRowsTests
 from .index_col import IndexColTests
@@ -28,7 +29,7 @@ class BaseParser(CommentTests, CompressionTests,
                  IndexColTests, MultithreadTests,
                  NAvaluesTests, ParseDatesTests,
                  ParserTests, SkipRowsTests,
-                 UsecolsTests):
+                 UsecolsTests, QuotingTests):
     def read_csv(self, *args, **kwargs):
         raise NotImplementedError
 
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -7,6 +7,7 @@ from libc.string cimport strncpy, strlen, strcmp, strcasecmp
 cimport libc.stdio as stdio
 import warnings
 
+from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE
 from cpython cimport (PyObject, PyBytes_FromString,
                       PyBytes_AsString, PyBytes_Check,
                       PyUnicode_Check, PyUnicode_AsUTF8String)
@@ -283,6 +284,7 @@ cdef class TextReader:
         object compression
         object mangle_dupe_cols
         object tupleize_cols
+        list dtype_cast_order
         set noconvert, usecols
 
     def __cinit__(self, source,
@@ -393,8 +395,13 @@ cdef class TextReader:
                 raise ValueError('Only length-1 escapes  supported')
             self.parser.escapechar = ord(escapechar)
 
-        self.parser.quotechar = ord(quotechar)
-        self.parser.quoting = quoting
+        self._set_quoting(quotechar, quoting)
+
+        # TODO: endianness just a placeholder?
+        if quoting == QUOTE_NONNUMERIC:
+            self.dtype_cast_order = ['<f8', '<i8', '|b1', '|O8']
+        else:
+            self.dtype_cast_order = ['<i8', '<f8', '|b1', '|O8']
 
         if comment is not None:
             if len(comment) > 1:
@@ -548,6 +555,29 @@ cdef class TextReader:
     def set_error_bad_lines(self, int status):
         self.parser.error_bad_lines = status
 
+    def _set_quoting(self, quote_char, quoting):
+        if not isinstance(quoting, int):
+            raise TypeError('"quoting" must be an integer')
+
+        if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
+            raise TypeError('bad "quoting" value')
+
+        if not isinstance(quote_char, (str, bytes)) and quote_char is not None:
+            dtype = type(quote_char).__name__
+            raise TypeError('"quotechar" must be string, '
+                            'not {dtype}'.format(dtype=dtype))
+
+        if quote_char is None or quote_char == '':
+            if quoting != QUOTE_NONE:
+                raise TypeError("quotechar must be set if quoting enabled")
+            self.parser.quoting = quoting
+            self.parser.quotechar = -1
+        elif len(quote_char) > 1:  # 0-len case handled earlier
+            raise TypeError('"quotechar" must be a 1-character string')
+        else:
+            self.parser.quoting = quoting
+            self.parser.quotechar = ord(quote_char)
+
     cdef _make_skiprow_set(self):
         if isinstance(self.skiprows, (int, np.integer)):
             parser_set_skipfirstnrows(self.parser, self.skiprows)
@@ -1066,7 +1096,7 @@ cdef class TextReader:
             return self._string_convert(i, start, end, na_filter, na_hashset)
         else:
             col_res = None
-            for dt in dtype_cast_order:
+            for dt in self.dtype_cast_order:
                 try:
                     col_res, na_count = self._convert_with_dtype(
                         dt, i, start, end, na_filter, 0, na_hashset, na_flist)
@@ -1847,12 +1877,6 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL:
     return table
 
 
-# if at first you don't succeed...
-
-# TODO: endianness just a placeholder?
-cdef list dtype_cast_order = ['<i8', '<f8', '|b1', '|O8']
-
-
 cdef raise_parser_error(object base, parser_t *parser):
     message = '%s. C error: ' % base
     if parser.error_msg != NULL: