From 16e48bcb00d4422e9ef8548519b93080f2d83860 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 18 Dec 2016 15:31:24 -0500 Subject: [PATCH] BUG, DOC: Improve dialect handling in read_csv 1) Update documentation about how the dialect parameter is handled. 2) Verify that the dialect parameter passed in is valid before accessing the dialect attributes. Closes gh-14898. --- doc/source/io.rst | 7 ++- doc/source/whatsnew/v0.20.0.txt | 4 ++ pandas/io/parsers.py | 40 ++++++++++--- pandas/io/tests/parser/common.py | 35 ------------ pandas/io/tests/parser/dialect.py | 78 ++++++++++++++++++++++++++ pandas/io/tests/parser/test_parsers.py | 13 +++-- 6 files changed, 126 insertions(+), 51 deletions(-) create mode 100644 pandas/io/tests/parser/dialect.py diff --git a/doc/source/io.rst b/doc/source/io.rst index 8ddf4186eba25..af05a89a54a62 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -325,8 +325,11 @@ encoding : str, default ``None`` Python standard encodings `_. dialect : str or :class:`python:csv.Dialect` instance, default ``None`` - If ``None`` defaults to Excel dialect. Ignored if sep longer than 1 char. See - :class:`python:csv.Dialect` documentation for more details. + If provided, this parameter will override values (default or not) for the + following parameters: `delimiter`, `doublequote`, `escapechar`, + `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + override values, a ParserWarning will be issued. See :class:`python:csv.Dialect` + documentation for more details. tupleize_cols : boolean, default ``False`` Leave a list of tuples on columns as is (default is to convert to a MultiIndex on the columns). diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 10684021d5599..095a7a5c13ba3 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -243,6 +243,8 @@ Other API Changes - ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) - ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`) +- ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`) + .. _whatsnew_0200.deprecations: Deprecations @@ -291,6 +293,8 @@ Bug Fixes - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) - Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`) - Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`) +- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) + - Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2332a9ade93ff..040ec3d803303 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -244,8 +244,11 @@ standard encodings `_ dialect : str or csv.Dialect instance, default None - If None defaults to Excel dialect. Ignored if sep longer than 1 char - See csv.Dialect documentation for more details + If provided, this parameter will override values (default or not) for the + following parameters: `delimiter`, `doublequote`, `escapechar`, + `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + override values, a ParserWarning will be issued. See csv.Dialect + documentation for more details. tupleize_cols : boolean, default False Leave a list of tuples on columns as is (default is to convert to a Multi Index on the columns) @@ -698,12 +701,33 @@ def __init__(self, f, engine=None, **kwds): dialect = kwds['dialect'] if dialect in csv.list_dialects(): dialect = csv.get_dialect(dialect) - kwds['delimiter'] = dialect.delimiter - kwds['doublequote'] = dialect.doublequote - kwds['escapechar'] = dialect.escapechar - kwds['skipinitialspace'] = dialect.skipinitialspace - kwds['quotechar'] = dialect.quotechar - kwds['quoting'] = dialect.quoting + + # Any valid dialect should have these attributes. + # If any are missing, we will raise automatically. + for param in ('delimiter', 'doublequote', 'escapechar', + 'skipinitialspace', 'quotechar', 'quoting'): + try: + dialect_val = getattr(dialect, param) + except AttributeError: + raise ValueError("Invalid dialect '{dialect}' provided" + .format(dialect=kwds['dialect'])) + provided = kwds.get(param, _parser_defaults[param]) + + # Messages for conflicting values between the dialect instance + # and the actual parameters provided. + conflict_msgs = [] + + if dialect_val != provided: + conflict_msgs.append(( + "Conflicting values for '{param}': '{val}' was " + "provided, but the dialect specifies '{diaval}'. " + "Using the dialect-specified value.".format( + param=param, val=provided, diaval=dialect_val))) + + if conflict_msgs: + warnings.warn('\n\n'.join(conflict_msgs), ParserWarning, + stacklevel=2) + kwds[param] = dialect_val if kwds.get('header', 'infer') == 'infer': kwds['header'] = 0 if kwds.get('names') is None else None diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index c6c2a9e954f55..e694e529212aa 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -77,41 +77,6 @@ def test_read_csv(self): fname = prefix + compat.text_type(self.csv1) self.read_csv(fname, index_col=0, parse_dates=True) - def test_dialect(self): - data = """\ -label1,label2,label3 -index1,"a,c,e -index2,b,d,f -""" - - dia = csv.excel() - dia.quoting = csv.QUOTE_NONE - df = self.read_csv(StringIO(data), dialect=dia) - - data = '''\ -label1,label2,label3 -index1,a,c,e -index2,b,d,f -''' - exp = self.read_csv(StringIO(data)) - exp.replace('a', '"a', inplace=True) - tm.assert_frame_equal(df, exp) - - def test_dialect_str(self): - data = """\ -fruit:vegetable -apple:brocolli -pear:tomato -""" - exp = DataFrame({ - 'fruit': ['apple', 'pear'], - 'vegetable': ['brocolli', 'tomato'] - }) - dia = csv.register_dialect('mydialect', delimiter=':') # noqa - df = self.read_csv(StringIO(data), dialect='mydialect') - tm.assert_frame_equal(df, exp) - csv.unregister_dialect('mydialect') - def test_1000_sep(self): data = """A|B|C 1|2,334|5 diff --git a/pandas/io/tests/parser/dialect.py b/pandas/io/tests/parser/dialect.py new file mode 100644 index 0000000000000..ee50cf812f72e --- /dev/null +++ b/pandas/io/tests/parser/dialect.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +""" +Tests that dialects are properly handled during parsing +for all of the parsers defined in parsers.py +""" + +import csv + +from pandas import DataFrame +from pandas.compat import StringIO +from pandas.io.common import ParserWarning + +import pandas.util.testing as tm + + +class DialectTests(object): + + def test_dialect(self): + data = """\ +label1,label2,label3 +index1,"a,c,e +index2,b,d,f +""" + + dia = csv.excel() + dia.quoting = csv.QUOTE_NONE + with tm.assert_produces_warning(ParserWarning): + df = self.read_csv(StringIO(data), dialect=dia) + + data = '''\ +label1,label2,label3 +index1,a,c,e +index2,b,d,f +''' + exp = self.read_csv(StringIO(data)) + exp.replace('a', '"a', inplace=True) + tm.assert_frame_equal(df, exp) + + def test_dialect_str(self): + data = """\ +fruit:vegetable +apple:brocolli +pear:tomato +""" + exp = DataFrame({ + 'fruit': ['apple', 'pear'], + 'vegetable': ['brocolli', 'tomato'] + }) + csv.register_dialect('mydialect', delimiter=':') + with tm.assert_produces_warning(ParserWarning): + df = self.read_csv(StringIO(data), dialect='mydialect') + + tm.assert_frame_equal(df, exp) + csv.unregister_dialect('mydialect') + + def test_invalid_dialect(self): + class InvalidDialect(object): + pass + + data = 'a\n1' + msg = 'Invalid dialect' + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(data), dialect=InvalidDialect) + + def test_dialect_conflict(self): + data = 'a,b\n1,2' + dialect = 'excel' + exp = DataFrame({'a': [1], 'b': [2]}) + + with tm.assert_produces_warning(None): + df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect) + tm.assert_frame_equal(df, exp) + + with tm.assert_produces_warning(ParserWarning): + df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect) + tm.assert_frame_equal(df, exp) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index 6cca2e35e1135..a90f546d37fc8 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -11,6 +11,7 @@ from .common import ParserTests from .header import HeaderTests from .comment import CommentTests +from .dialect import DialectTests from .quoting import QuotingTests from .usecols import UsecolsTests from .skiprows import SkipRowsTests @@ -26,12 +27,12 @@ class BaseParser(CommentTests, CompressionTests, - ConverterTests, HeaderTests, - IndexColTests, MultithreadTests, - NAvaluesTests, ParseDatesTests, - ParserTests, SkipRowsTests, - UsecolsTests, QuotingTests, - DtypeTests): + ConverterTests, DialectTests, + HeaderTests, IndexColTests, + MultithreadTests, NAvaluesTests, + ParseDatesTests, ParserTests, + SkipRowsTests, UsecolsTests, + QuotingTests, DtypeTests): def read_csv(self, *args, **kwargs): raise NotImplementedError