Skip to content

Commit a3f3a88

Browse files
committed
BUG, DOC: Improve dialect handling in read_csv
1) Update documentation about how the dialect parameter is handled. 2) Verify that the dialect parameter passed in is valid before accessing the dialect attributes. Closes gh-14898.
1 parent e503d40 commit a3f3a88

File tree

5 files changed

+106
-49
lines changed

5 files changed

+106
-49
lines changed

doc/source/whatsnew/v0.20.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ Map on Index types now return other Index types
193193
Other API Changes
194194
^^^^^^^^^^^^^^^^^
195195

196+
- ``pd.read_csv()`` will now issue a ``UserWarning`` whenever a value specified by the ``dialect`` parameter conflicts with the value passed for the corresponding parameter; the dialect-specified value is used (:issue:`14898`)
196197

197198
.. _whatsnew_0200.deprecations:
198199

@@ -236,6 +237,7 @@ Bug Fixes
236237

237238
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
238239
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. ``astype()`` now raises an error in this case for Series and DataFrames (:issue:`14265`)
240+
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
239241

240242

241243

pandas/io/parsers.py

+30-8
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,9 @@
244244
standard encodings
245245
<https://docs.python.org/3/library/codecs.html#standard-encodings>`_
246246
dialect : str or csv.Dialect instance, default None
247-
If None defaults to Excel dialect. Ignored if sep longer than 1 char
248-
See csv.Dialect documentation for more details
247+
If provided, this parameter will override values for the following
248+
parameters: `delimiter`, `doublequote`, `escapechar`, `skipinitialspace`,
249+
`quotechar`, and `quoting`. See csv.Dialect documentation for more details.
249250
tupleize_cols : boolean, default False
250251
Leave a list of tuples on columns as is (default is to convert to
251252
a Multi Index on the columns)
@@ -692,12 +693,33 @@ def __init__(self, f, engine=None, **kwds):
692693
dialect = kwds['dialect']
693694
if dialect in csv.list_dialects():
694695
dialect = csv.get_dialect(dialect)
695-
kwds['delimiter'] = dialect.delimiter
696-
kwds['doublequote'] = dialect.doublequote
697-
kwds['escapechar'] = dialect.escapechar
698-
kwds['skipinitialspace'] = dialect.skipinitialspace
699-
kwds['quotechar'] = dialect.quotechar
700-
kwds['quoting'] = dialect.quoting
696+
697+
# Any valid dialect should have these attributes.
698+
# If any are missing, we will raise automatically.
699+
for param in ('delimiter', 'doublequote', 'escapechar',
700+
'skipinitialspace', 'quotechar', 'quoting'):
701+
try:
702+
dialect_val = getattr(dialect, param)
703+
except AttributeError:
704+
raise ValueError("Invalid dialect '{dialect}' provided"
705+
.format(dialect=kwds['dialect']))
706+
provided = kwds.get(param, _parser_defaults[param])
707+
708+
# Messages for conflicting values between the dialect instance
709+
# and the actual parameters provided.
710+
conflict_msgs = []
711+
712+
if dialect_val != provided:
713+
conflict_msgs.append((
714+
"Conflicting values for '{param}': '{val}' was "
715+
"provided, but the dialect specifies '{diaval}'. "
716+
"Using the dialect-specified value.".format(
717+
param=param, val=provided, diaval=dialect_val)))
718+
719+
if conflict_msgs:
720+
warnings.warn('\n\n'.join(conflict_msgs), UserWarning,
721+
stacklevel=2)
722+
kwds[param] = dialect_val
701723

702724
if kwds.get('header', 'infer') == 'infer':
703725
kwds['header'] = 0 if kwds.get('names') is None else None

pandas/io/tests/parser/common.py

-35
Original file line numberDiff line numberDiff line change
@@ -77,41 +77,6 @@ def test_read_csv(self):
7777
fname = prefix + compat.text_type(self.csv1)
7878
self.read_csv(fname, index_col=0, parse_dates=True)
7979

80-
def test_dialect(self):
81-
data = """\
82-
label1,label2,label3
83-
index1,"a,c,e
84-
index2,b,d,f
85-
"""
86-
87-
dia = csv.excel()
88-
dia.quoting = csv.QUOTE_NONE
89-
df = self.read_csv(StringIO(data), dialect=dia)
90-
91-
data = '''\
92-
label1,label2,label3
93-
index1,a,c,e
94-
index2,b,d,f
95-
'''
96-
exp = self.read_csv(StringIO(data))
97-
exp.replace('a', '"a', inplace=True)
98-
tm.assert_frame_equal(df, exp)
99-
100-
def test_dialect_str(self):
101-
data = """\
102-
fruit:vegetable
103-
apple:brocolli
104-
pear:tomato
105-
"""
106-
exp = DataFrame({
107-
'fruit': ['apple', 'pear'],
108-
'vegetable': ['brocolli', 'tomato']
109-
})
110-
dia = csv.register_dialect('mydialect', delimiter=':') # noqa
111-
df = self.read_csv(StringIO(data), dialect='mydialect')
112-
tm.assert_frame_equal(df, exp)
113-
csv.unregister_dialect('mydialect')
114-
11580
def test_1000_sep(self):
11681
data = """A|B|C
11782
1|2,334|5

pandas/io/tests/parser/dialect.py

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# -*- coding: utf-8 -*-
2+
3+
"""
4+
Tests that dialects are properly handled during parsing
5+
for all of the parsers defined in parsers.py
6+
"""
7+
8+
import csv
9+
10+
from pandas import DataFrame
11+
from pandas.compat import StringIO
12+
13+
import pandas.util.testing as tm
14+
15+
16+
class DialectTests(object):
17+
18+
def test_dialect(self):
19+
data = """\
20+
label1,label2,label3
21+
index1,"a,c,e
22+
index2,b,d,f
23+
"""
24+
25+
dia = csv.excel()
26+
dia.quoting = csv.QUOTE_NONE
27+
with tm.assert_produces_warning(UserWarning):
28+
df = self.read_csv(StringIO(data), dialect=dia)
29+
30+
data = '''\
31+
label1,label2,label3
32+
index1,a,c,e
33+
index2,b,d,f
34+
'''
35+
exp = self.read_csv(StringIO(data))
36+
exp.replace('a', '"a', inplace=True)
37+
tm.assert_frame_equal(df, exp)
38+
39+
def test_dialect_str(self):
40+
data = """\
41+
fruit:vegetable
42+
apple:brocolli
43+
pear:tomato
44+
"""
45+
exp = DataFrame({
46+
'fruit': ['apple', 'pear'],
47+
'vegetable': ['brocolli', 'tomato']
48+
})
49+
csv.register_dialect('mydialect', delimiter=':')
50+
with tm.assert_produces_warning(UserWarning):
51+
df = self.read_csv(StringIO(data), dialect='mydialect')
52+
53+
tm.assert_frame_equal(df, exp)
54+
csv.unregister_dialect('mydialect')
55+
56+
def test_dialect_conflict(self):
57+
data = 'a,b\n1,2'
58+
dialect = 'excel'
59+
exp = DataFrame({'a': [1], 'b': [2]})
60+
61+
with tm.assert_produces_warning(None):
62+
df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect)
63+
tm.assert_frame_equal(df, exp)
64+
65+
with tm.assert_produces_warning(UserWarning):
66+
df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect)
67+
tm.assert_frame_equal(df, exp)

pandas/io/tests/parser/test_parsers.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from .common import ParserTests
1212
from .header import HeaderTests
1313
from .comment import CommentTests
14+
from .dialect import DialectTests
1415
from .quoting import QuotingTests
1516
from .usecols import UsecolsTests
1617
from .skiprows import SkipRowsTests
@@ -26,12 +27,12 @@
2627

2728

2829
class BaseParser(CommentTests, CompressionTests,
29-
ConverterTests, HeaderTests,
30-
IndexColTests, MultithreadTests,
31-
NAvaluesTests, ParseDatesTests,
32-
ParserTests, SkipRowsTests,
33-
UsecolsTests, QuotingTests,
34-
DtypeTests):
30+
ConverterTests, DialectTests,
31+
HeaderTests, IndexColTests,
32+
MultithreadTests, NAvaluesTests,
33+
ParseDatesTests, ParserTests,
34+
SkipRowsTests, UsecolsTests,
35+
QuotingTests, DtypeTests):
3536
def read_csv(self, *args, **kwargs):
3637
raise NotImplementedError
3738

0 commit comments

Comments
 (0)