BUG, DOC: Improve dialect handling in read_csv

gfyoung · gfyoung · commit 9b7965365978 · 2016-12-30T11:24:19.000-08:00
1) Update documentation about how the dialect parameter is handled. 2) Verify that the dialect parameter passed in is valid before accessing the dialect attributes. Closes gh-14898.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -325,8 +325,11 @@ encoding : str, default ``None``
   Python standard encodings
   <https://docs.python.org/3/library/codecs.html#standard-encodings>`_.
 dialect : str or :class:`python:csv.Dialect` instance, default ``None``
-  If ``None`` defaults to Excel dialect. Ignored if sep longer than 1 char. See
-  :class:`python:csv.Dialect` documentation for more details.
+  If provided, this parameter will override values (default or not) for the
+  following parameters: `delimiter`, `doublequote`, `escapechar`,
+  `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
+  override values, a ParserWarning will be issued. See :class:`python:csv.Dialect`
+  documentation for more details.
 tupleize_cols : boolean, default ``False``
   Leave a list of tuples on columns as is (default is to convert to a MultiIndex
   on the columns).
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -243,6 +243,8 @@ Other API Changes
 - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
 - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`)
 
+- ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`)
+
 .. _whatsnew_0200.deprecations:
 
 Deprecations
@@ -290,6 +292,8 @@ Bug Fixes
 - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
 - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
 - Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`)
+- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
+
 
 
 - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -244,8 +244,11 @@
     standard encodings
     <https://docs.python.org/3/library/codecs.html#standard-encodings>`_
 dialect : str or csv.Dialect instance, default None
-    If None defaults to Excel dialect. Ignored if sep longer than 1 char
-    See csv.Dialect documentation for more details
+    If provided, this parameter will override values (default or not) for the
+    following parameters: `delimiter`, `doublequote`, `escapechar`,
+    `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
+    override values, a ParserWarning will be issued. See csv.Dialect
+    documentation for more details.
 tupleize_cols : boolean, default False
     Leave a list of tuples on columns as is (default is to convert to
     a Multi Index on the columns)
@@ -698,12 +701,33 @@ def __init__(self, f, engine=None, **kwds):
             dialect = kwds['dialect']
             if dialect in csv.list_dialects():
                 dialect = csv.get_dialect(dialect)
-            kwds['delimiter'] = dialect.delimiter
-            kwds['doublequote'] = dialect.doublequote
-            kwds['escapechar'] = dialect.escapechar
-            kwds['skipinitialspace'] = dialect.skipinitialspace
-            kwds['quotechar'] = dialect.quotechar
-            kwds['quoting'] = dialect.quoting
+
+            # Any valid dialect should have these attributes.
+            # If any are missing, we will raise automatically.
+            for param in ('delimiter', 'doublequote', 'escapechar',
+                          'skipinitialspace', 'quotechar', 'quoting'):
+                try:
+                    dialect_val = getattr(dialect, param)
+                except AttributeError:
+                    raise ValueError("Invalid dialect '{dialect}' provided"
+                                     .format(dialect=kwds['dialect']))
+                provided = kwds.get(param, _parser_defaults[param])
+
+                # Messages for conflicting values between the dialect instance
+                # and the actual parameters provided.
+                conflict_msgs = []
+
+                if dialect_val != provided:
+                    conflict_msgs.append((
+                        "Conflicting values for '{param}': '{val}' was "
+                        "provided, but the dialect specifies '{diaval}'. "
+                        "Using the dialect-specified value.".format(
+                            param=param, val=provided, diaval=dialect_val)))
+
+                if conflict_msgs:
+                    warnings.warn('\n\n'.join(conflict_msgs), ParserWarning,
+                                  stacklevel=2)
+                kwds[param] = dialect_val
 
         if kwds.get('header', 'infer') == 'infer':
             kwds['header'] = 0 if kwds.get('names') is None else None
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -77,41 +77,6 @@ def test_read_csv(self):
             fname = prefix + compat.text_type(self.csv1)
             self.read_csv(fname, index_col=0, parse_dates=True)
 
-    def test_dialect(self):
-        data = """\
-label1,label2,label3
-index1,"a,c,e
-index2,b,d,f
-"""
-
-        dia = csv.excel()
-        dia.quoting = csv.QUOTE_NONE
-        df = self.read_csv(StringIO(data), dialect=dia)
-
-        data = '''\
-label1,label2,label3
-index1,a,c,e
-index2,b,d,f
-'''
-        exp = self.read_csv(StringIO(data))
-        exp.replace('a', '"a', inplace=True)
-        tm.assert_frame_equal(df, exp)
-
-    def test_dialect_str(self):
-        data = """\
-fruit:vegetable
-apple:brocolli
-pear:tomato
-"""
-        exp = DataFrame({
-            'fruit': ['apple', 'pear'],
-            'vegetable': ['brocolli', 'tomato']
-        })
-        dia = csv.register_dialect('mydialect', delimiter=':')  # noqa
-        df = self.read_csv(StringIO(data), dialect='mydialect')
-        tm.assert_frame_equal(df, exp)
-        csv.unregister_dialect('mydialect')
-
     def test_1000_sep(self):
         data = """A|B|C
 1|2,334|5
diff --git a/pandas/io/tests/parser/dialect.py b/pandas/io/tests/parser/dialect.py
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that dialects are properly handled during parsing
+for all of the parsers defined in parsers.py
+"""
+
+import csv
+
+from pandas import DataFrame
+from pandas.compat import StringIO
+from pandas.io.common import ParserWarning
+
+import pandas.util.testing as tm
+
+
+class DialectTests(object):
+
+    def test_dialect(self):
+        data = """\
+label1,label2,label3
+index1,"a,c,e
+index2,b,d,f
+"""
+
+        dia = csv.excel()
+        dia.quoting = csv.QUOTE_NONE
+        with tm.assert_produces_warning(ParserWarning):
+            df = self.read_csv(StringIO(data), dialect=dia)
+
+        data = '''\
+label1,label2,label3
+index1,a,c,e
+index2,b,d,f
+'''
+        exp = self.read_csv(StringIO(data))
+        exp.replace('a', '"a', inplace=True)
+        tm.assert_frame_equal(df, exp)
+
+    def test_dialect_str(self):
+        data = """\
+fruit:vegetable
+apple:brocolli
+pear:tomato
+"""
+        exp = DataFrame({
+            'fruit': ['apple', 'pear'],
+            'vegetable': ['brocolli', 'tomato']
+        })
+        csv.register_dialect('mydialect', delimiter=':')
+        with tm.assert_produces_warning(ParserWarning):
+            df = self.read_csv(StringIO(data), dialect='mydialect')
+
+        tm.assert_frame_equal(df, exp)
+        csv.unregister_dialect('mydialect')
+
+    def test_invalid_dialect(self):
+        class InvalidDialect(object):
+            pass
+
+        data = 'a\n1'
+        msg = 'Invalid dialect'
+
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv(StringIO(data), dialect=InvalidDialect)
+
+    def test_dialect_conflict(self):
+        data = 'a,b\n1,2'
+        dialect = 'excel'
+        exp = DataFrame({'a': [1], 'b': [2]})
+
+        with tm.assert_produces_warning(None):
+            df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect)
+            tm.assert_frame_equal(df, exp)
+
+        with tm.assert_produces_warning(ParserWarning):
+            df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect)
+            tm.assert_frame_equal(df, exp)
diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py
@@ -11,6 +11,7 @@
 from .common import ParserTests
 from .header import HeaderTests
 from .comment import CommentTests
+from .dialect import DialectTests
 from .quoting import QuotingTests
 from .usecols import UsecolsTests
 from .skiprows import SkipRowsTests
@@ -26,12 +27,12 @@
 
 
 class BaseParser(CommentTests, CompressionTests,
-                 ConverterTests, HeaderTests,
-                 IndexColTests, MultithreadTests,
-                 NAvaluesTests, ParseDatesTests,
-                 ParserTests, SkipRowsTests,
-                 UsecolsTests, QuotingTests,
-                 DtypeTests):
+                 ConverterTests, DialectTests,
+                 HeaderTests, IndexColTests,
+                 MultithreadTests, NAvaluesTests,
+                 ParseDatesTests, ParserTests,
+                 SkipRowsTests, UsecolsTests,
+                 QuotingTests, DtypeTests):
     def read_csv(self, *args, **kwargs):
         raise NotImplementedError