Skip to content

Commit 16e48bc

Browse files
committed
BUG, DOC: Improve dialect handling in read_csv
1) Update the documentation describing how the dialect parameter is handled. 2) Verify that the dialect parameter passed in is valid before accessing the dialect's attributes. Closes pandas-dev/pandas#14898 (gh-14898).
1 parent caab85b commit 16e48bc

File tree

6 files changed

+126
-51
lines changed

6 files changed

+126
-51
lines changed

doc/source/io.rst

+5-2
Original file line numberDiff line numberDiff line change
@@ -325,8 +325,11 @@ encoding : str, default ``None``
325325
Python standard encodings
326326
<https://docs.python.org/3/library/codecs.html#standard-encodings>`_.
327327
dialect : str or :class:`python:csv.Dialect` instance, default ``None``
328-
If ``None`` defaults to Excel dialect. Ignored if sep longer than 1 char. See
329-
:class:`python:csv.Dialect` documentation for more details.
328+
If provided, this parameter will override values (default or not) for the
329+
following parameters: `delimiter`, `doublequote`, `escapechar`,
330+
`skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
331+
override values, a ParserWarning will be issued. See :class:`python:csv.Dialect`
332+
documentation for more details.
330333
tupleize_cols : boolean, default ``False``
331334
Leave a list of tuples on columns as is (default is to convert to a MultiIndex
332335
on the columns).

doc/source/whatsnew/v0.20.0.txt

+4
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,8 @@ Other API Changes
243243
- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
244244
- ``DataFrame.applymap()`` with an empty ``DataFrame`` will return a copy of the empty ``DataFrame`` instead of a ``Series`` (:issue:`8222`)
245245

246+
- ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`)
247+
246248
.. _whatsnew_0200.deprecations:
247249

248250
Deprecations
@@ -291,6 +293,8 @@ Bug Fixes
291293
- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
292294
- Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`)
293295
- Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`)
296+
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
297+
294298

295299

296300
- Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`)

pandas/io/parsers.py

+32-8
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,11 @@
244244
standard encodings
245245
<https://docs.python.org/3/library/codecs.html#standard-encodings>`_
246246
dialect : str or csv.Dialect instance, default None
247-
If None defaults to Excel dialect. Ignored if sep longer than 1 char
248-
See csv.Dialect documentation for more details
247+
If provided, this parameter will override values (default or not) for the
248+
following parameters: `delimiter`, `doublequote`, `escapechar`,
249+
`skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
250+
override values, a ParserWarning will be issued. See csv.Dialect
251+
documentation for more details.
249252
tupleize_cols : boolean, default False
250253
Leave a list of tuples on columns as is (default is to convert to
251254
a Multi Index on the columns)
@@ -698,12 +701,33 @@ def __init__(self, f, engine=None, **kwds):
698701
dialect = kwds['dialect']
699702
if dialect in csv.list_dialects():
700703
dialect = csv.get_dialect(dialect)
701-
kwds['delimiter'] = dialect.delimiter
702-
kwds['doublequote'] = dialect.doublequote
703-
kwds['escapechar'] = dialect.escapechar
704-
kwds['skipinitialspace'] = dialect.skipinitialspace
705-
kwds['quotechar'] = dialect.quotechar
706-
kwds['quoting'] = dialect.quoting
704+
705+
# Any valid dialect should have these attributes.
706+
# If any are missing, a ValueError is raised below.
707+
for param in ('delimiter', 'doublequote', 'escapechar',
708+
'skipinitialspace', 'quotechar', 'quoting'):
709+
try:
710+
dialect_val = getattr(dialect, param)
711+
except AttributeError:
712+
raise ValueError("Invalid dialect '{dialect}' provided"
713+
.format(dialect=kwds['dialect']))
714+
provided = kwds.get(param, _parser_defaults[param])
715+
716+
# Messages for conflicting values between the dialect instance
717+
# and the actual parameters provided.
718+
conflict_msgs = []
719+
720+
if dialect_val != provided:
721+
conflict_msgs.append((
722+
"Conflicting values for '{param}': '{val}' was "
723+
"provided, but the dialect specifies '{diaval}'. "
724+
"Using the dialect-specified value.".format(
725+
param=param, val=provided, diaval=dialect_val)))
726+
727+
if conflict_msgs:
728+
warnings.warn('\n\n'.join(conflict_msgs), ParserWarning,
729+
stacklevel=2)
730+
kwds[param] = dialect_val
707731

708732
if kwds.get('header', 'infer') == 'infer':
709733
kwds['header'] = 0 if kwds.get('names') is None else None

pandas/io/tests/parser/common.py

-35
Original file line numberDiff line numberDiff line change
@@ -77,41 +77,6 @@ def test_read_csv(self):
7777
fname = prefix + compat.text_type(self.csv1)
7878
self.read_csv(fname, index_col=0, parse_dates=True)
7979

80-
def test_dialect(self):
81-
data = """\
82-
label1,label2,label3
83-
index1,"a,c,e
84-
index2,b,d,f
85-
"""
86-
87-
dia = csv.excel()
88-
dia.quoting = csv.QUOTE_NONE
89-
df = self.read_csv(StringIO(data), dialect=dia)
90-
91-
data = '''\
92-
label1,label2,label3
93-
index1,a,c,e
94-
index2,b,d,f
95-
'''
96-
exp = self.read_csv(StringIO(data))
97-
exp.replace('a', '"a', inplace=True)
98-
tm.assert_frame_equal(df, exp)
99-
100-
def test_dialect_str(self):
101-
data = """\
102-
fruit:vegetable
103-
apple:brocolli
104-
pear:tomato
105-
"""
106-
exp = DataFrame({
107-
'fruit': ['apple', 'pear'],
108-
'vegetable': ['brocolli', 'tomato']
109-
})
110-
dia = csv.register_dialect('mydialect', delimiter=':') # noqa
111-
df = self.read_csv(StringIO(data), dialect='mydialect')
112-
tm.assert_frame_equal(df, exp)
113-
csv.unregister_dialect('mydialect')
114-
11580
def test_1000_sep(self):
11681
data = """A|B|C
11782
1|2,334|5

pandas/io/tests/parser/dialect.py

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# -*- coding: utf-8 -*-
2+
3+
"""
4+
Tests that dialects are properly handled during parsing
5+
for all of the parsers defined in parsers.py
6+
"""
7+
8+
import csv
9+
10+
from pandas import DataFrame
11+
from pandas.compat import StringIO
12+
from pandas.io.common import ParserWarning
13+
14+
import pandas.util.testing as tm
15+
16+
17+
class DialectTests(object):
18+
19+
def test_dialect(self):
20+
data = """\
21+
label1,label2,label3
22+
index1,"a,c,e
23+
index2,b,d,f
24+
"""
25+
26+
dia = csv.excel()
27+
dia.quoting = csv.QUOTE_NONE
28+
with tm.assert_produces_warning(ParserWarning):
29+
df = self.read_csv(StringIO(data), dialect=dia)
30+
31+
data = '''\
32+
label1,label2,label3
33+
index1,a,c,e
34+
index2,b,d,f
35+
'''
36+
exp = self.read_csv(StringIO(data))
37+
exp.replace('a', '"a', inplace=True)
38+
tm.assert_frame_equal(df, exp)
39+
40+
def test_dialect_str(self):
41+
data = """\
42+
fruit:vegetable
43+
apple:brocolli
44+
pear:tomato
45+
"""
46+
exp = DataFrame({
47+
'fruit': ['apple', 'pear'],
48+
'vegetable': ['brocolli', 'tomato']
49+
})
50+
csv.register_dialect('mydialect', delimiter=':')
51+
with tm.assert_produces_warning(ParserWarning):
52+
df = self.read_csv(StringIO(data), dialect='mydialect')
53+
54+
tm.assert_frame_equal(df, exp)
55+
csv.unregister_dialect('mydialect')
56+
57+
def test_invalid_dialect(self):
58+
class InvalidDialect(object):
59+
pass
60+
61+
data = 'a\n1'
62+
msg = 'Invalid dialect'
63+
64+
with tm.assertRaisesRegexp(ValueError, msg):
65+
self.read_csv(StringIO(data), dialect=InvalidDialect)
66+
67+
def test_dialect_conflict(self):
68+
data = 'a,b\n1,2'
69+
dialect = 'excel'
70+
exp = DataFrame({'a': [1], 'b': [2]})
71+
72+
with tm.assert_produces_warning(None):
73+
df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect)
74+
tm.assert_frame_equal(df, exp)
75+
76+
with tm.assert_produces_warning(ParserWarning):
77+
df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect)
78+
tm.assert_frame_equal(df, exp)

pandas/io/tests/parser/test_parsers.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from .common import ParserTests
1212
from .header import HeaderTests
1313
from .comment import CommentTests
14+
from .dialect import DialectTests
1415
from .quoting import QuotingTests
1516
from .usecols import UsecolsTests
1617
from .skiprows import SkipRowsTests
@@ -26,12 +27,12 @@
2627

2728

2829
class BaseParser(CommentTests, CompressionTests,
29-
ConverterTests, HeaderTests,
30-
IndexColTests, MultithreadTests,
31-
NAvaluesTests, ParseDatesTests,
32-
ParserTests, SkipRowsTests,
33-
UsecolsTests, QuotingTests,
34-
DtypeTests):
30+
ConverterTests, DialectTests,
31+
HeaderTests, IndexColTests,
32+
MultithreadTests, NAvaluesTests,
33+
ParseDatesTests, ParserTests,
34+
SkipRowsTests, UsecolsTests,
35+
QuotingTests, DtypeTests):
3536
def read_csv(self, *args, **kwargs):
3637
raise NotImplementedError
3738

0 commit comments

Comments
 (0)