Skip to content

Commit ba1bb73

Browse files
committed
BUG, DOC: Improve dialect handling in read_csv
1) Update documentation about how the dialect parameter is handled. 2) Verify that the dialect parameter passed in is valid before accessing the dialect attributes. Closes gh-14898.
1 parent d7e8f31 commit ba1bb73

File tree

5 files changed

+111
-49
lines changed

5 files changed

+111
-49
lines changed

doc/source/whatsnew/v0.20.0.txt

+4
Original file line number | Diff line number | Diff line change
@@ -238,6 +238,8 @@ Other API Changes
238238
- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
239239
- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
240240

241+
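- ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever a value specified by the ``dialect`` parameter conflicts with the value of the corresponding parameter, whether that value was passed explicitly by the user or left at its default (:issue:`14898`)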
242+
241243
.. _whatsnew_0200.deprecations:
242244

243245
Deprecations
@@ -284,6 +286,8 @@ Bug Fixes
284286
- Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
285287
- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
286288
- Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`)
289+
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
290+
287291

288292

289293

pandas/io/parsers.py

+32-8
Original file line number | Diff line number | Diff line change
@@ -244,8 +244,11 @@
244244
standard encodings
245245
<https://docs.python.org/3/library/codecs.html#standard-encodings>`_
246246
dialect : str or csv.Dialect instance, default None
247-
If None defaults to Excel dialect. Ignored if sep longer than 1 char
248-
See csv.Dialect documentation for more details
247+
If provided, this parameter will override values (default or not) for the
248+
following parameters: `delimiter`, `doublequote`, `escapechar`,
249+
`skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
250+
override values, a ParserWarning will be issued. See csv.Dialect
251+
documentation for more details.
249252
tupleize_cols : boolean, default False
250253
Leave a list of tuples on columns as is (default is to convert to
251254
a Multi Index on the columns)
@@ -692,12 +695,33 @@ def __init__(self, f, engine=None, **kwds):
692695
dialect = kwds['dialect']
693696
if dialect in csv.list_dialects():
694697
dialect = csv.get_dialect(dialect)
695-
kwds['delimiter'] = dialect.delimiter
696-
kwds['doublequote'] = dialect.doublequote
697-
kwds['escapechar'] = dialect.escapechar
698-
kwds['skipinitialspace'] = dialect.skipinitialspace
699-
kwds['quotechar'] = dialect.quotechar
700-
kwds['quoting'] = dialect.quoting
698+
699+
# Any valid dialect should have these attributes.
700+
# If any are missing, a ValueError is raised below.
701+
for param in ('delimiter', 'doublequote', 'escapechar',
702+
'skipinitialspace', 'quotechar', 'quoting'):
703+
try:
704+
dialect_val = getattr(dialect, param)
705+
except AttributeError:
706+
raise ValueError("Invalid dialect '{dialect}' provided"
707+
.format(dialect=kwds['dialect']))
708+
provided = kwds.get(param, _parser_defaults[param])
709+
710+
# Messages for conflicting values between the dialect instance
711+
# and the actual parameters provided.
712+
conflict_msgs = []
713+
714+
if dialect_val != provided:
715+
conflict_msgs.append((
716+
"Conflicting values for '{param}': '{val}' was "
717+
"provided, but the dialect specifies '{diaval}'. "
718+
"Using the dialect-specified value.".format(
719+
param=param, val=provided, diaval=dialect_val)))
720+
721+
if conflict_msgs:
722+
warnings.warn('\n\n'.join(conflict_msgs), ParserWarning,
723+
stacklevel=2)
724+
kwds[param] = dialect_val
701725

702726
if kwds.get('header', 'infer') == 'infer':
703727
kwds['header'] = 0 if kwds.get('names') is None else None

pandas/io/tests/parser/common.py

-35
Original file line number | Diff line number | Diff line change
@@ -77,41 +77,6 @@ def test_read_csv(self):
7777
fname = prefix + compat.text_type(self.csv1)
7878
self.read_csv(fname, index_col=0, parse_dates=True)
7979

80-
def test_dialect(self):
81-
data = """\
82-
label1,label2,label3
83-
index1,"a,c,e
84-
index2,b,d,f
85-
"""
86-
87-
dia = csv.excel()
88-
dia.quoting = csv.QUOTE_NONE
89-
df = self.read_csv(StringIO(data), dialect=dia)
90-
91-
data = '''\
92-
label1,label2,label3
93-
index1,a,c,e
94-
index2,b,d,f
95-
'''
96-
exp = self.read_csv(StringIO(data))
97-
exp.replace('a', '"a', inplace=True)
98-
tm.assert_frame_equal(df, exp)
99-
100-
def test_dialect_str(self):
101-
data = """\
102-
fruit:vegetable
103-
apple:brocolli
104-
pear:tomato
105-
"""
106-
exp = DataFrame({
107-
'fruit': ['apple', 'pear'],
108-
'vegetable': ['brocolli', 'tomato']
109-
})
110-
dia = csv.register_dialect('mydialect', delimiter=':') # noqa
111-
df = self.read_csv(StringIO(data), dialect='mydialect')
112-
tm.assert_frame_equal(df, exp)
113-
csv.unregister_dialect('mydialect')
114-
11580
def test_1000_sep(self):
11681
data = """A|B|C
11782
1|2,334|5

pandas/io/tests/parser/dialect.py

+68
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,68 @@
1+
# -*- coding: utf-8 -*-
2+
3+
"""
4+
Tests that dialects are properly handled during parsing
5+
for all of the parsers defined in parsers.py
6+
"""
7+
8+
import csv
9+
10+
from pandas import DataFrame
11+
from pandas.compat import StringIO
12+
from pandas.io.common import ParserWarning
13+
14+
import pandas.util.testing as tm
15+
16+
17+
class DialectTests(object):
18+
19+
def test_dialect(self):
20+
data = """\
21+
label1,label2,label3
22+
index1,"a,c,e
23+
index2,b,d,f
24+
"""
25+
26+
dia = csv.excel()
27+
dia.quoting = csv.QUOTE_NONE
28+
with tm.assert_produces_warning(ParserWarning):
29+
df = self.read_csv(StringIO(data), dialect=dia)
30+
31+
data = '''\
32+
label1,label2,label3
33+
index1,a,c,e
34+
index2,b,d,f
35+
'''
36+
exp = self.read_csv(StringIO(data))
37+
exp.replace('a', '"a', inplace=True)
38+
tm.assert_frame_equal(df, exp)
39+
40+
def test_dialect_str(self):
41+
data = """\
42+
fruit:vegetable
43+
apple:brocolli
44+
pear:tomato
45+
"""
46+
exp = DataFrame({
47+
'fruit': ['apple', 'pear'],
48+
'vegetable': ['brocolli', 'tomato']
49+
})
50+
csv.register_dialect('mydialect', delimiter=':')
51+
with tm.assert_produces_warning(ParserWarning):
52+
df = self.read_csv(StringIO(data), dialect='mydialect')
53+
54+
tm.assert_frame_equal(df, exp)
55+
csv.unregister_dialect('mydialect')
56+
57+
def test_dialect_conflict(self):
58+
data = 'a,b\n1,2'
59+
dialect = 'excel'
60+
exp = DataFrame({'a': [1], 'b': [2]})
61+
62+
with tm.assert_produces_warning(None):
63+
df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect)
64+
tm.assert_frame_equal(df, exp)
65+
66+
with tm.assert_produces_warning(ParserWarning):
67+
df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect)
68+
tm.assert_frame_equal(df, exp)

pandas/io/tests/parser/test_parsers.py

+7-6
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
from .common import ParserTests
1212
from .header import HeaderTests
1313
from .comment import CommentTests
14+
from .dialect import DialectTests
1415
from .quoting import QuotingTests
1516
from .usecols import UsecolsTests
1617
from .skiprows import SkipRowsTests
@@ -26,12 +27,12 @@
2627

2728

2829
class BaseParser(CommentTests, CompressionTests,
29-
ConverterTests, HeaderTests,
30-
IndexColTests, MultithreadTests,
31-
NAvaluesTests, ParseDatesTests,
32-
ParserTests, SkipRowsTests,
33-
UsecolsTests, QuotingTests,
34-
DtypeTests):
30+
ConverterTests, DialectTests,
31+
HeaderTests, IndexColTests,
32+
MultithreadTests, NAvaluesTests,
33+
ParseDatesTests, ParserTests,
34+
SkipRowsTests, UsecolsTests,
35+
QuotingTests, DtypeTests):
3536
def read_csv(self, *args, **kwargs):
3637
raise NotImplementedError
3738

0 commit comments

Comments
 (0)