-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG, DOC: Improve dialect handling in read_csv #14911
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -244,8 +244,11 @@ | |
standard encodings | ||
<https://docs.python.org/3/library/codecs.html#standard-encodings>`_ | ||
dialect : str or csv.Dialect instance, default None | ||
If None defaults to Excel dialect. Ignored if sep longer than 1 char | ||
See csv.Dialect documentation for more details | ||
If provided, this parameter will override values (default or not) for the | ||
following parameters: `delimiter`, `doublequote`, `escapechar`, | ||
`skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to | ||
override values, a ParserWarning will be issued. See csv.Dialect | ||
documentation for more details. | ||
tupleize_cols : boolean, default False | ||
Leave a list of tuples on columns as is (default is to convert to | ||
a Multi Index on the columns) | ||
|
@@ -698,12 +701,33 @@ def __init__(self, f, engine=None, **kwds): | |
dialect = kwds['dialect'] | ||
if dialect in csv.list_dialects(): | ||
dialect = csv.get_dialect(dialect) | ||
kwds['delimiter'] = dialect.delimiter | ||
kwds['doublequote'] = dialect.doublequote | ||
kwds['escapechar'] = dialect.escapechar | ||
kwds['skipinitialspace'] = dialect.skipinitialspace | ||
kwds['quotechar'] = dialect.quotechar | ||
kwds['quoting'] = dialect.quoting | ||
|
||
# Any valid dialect should have these attributes. | ||
# If any are missing, we will raise automatically. | ||
for param in ('delimiter', 'doublequote', 'escapechar', | ||
'skipinitialspace', 'quotechar', 'quoting'): | ||
try: | ||
dialect_val = getattr(dialect, param) | ||
except AttributeError: | ||
raise ValueError("Invalid dialect '{dialect}' provided" | ||
.format(dialect=kwds['dialect'])) | ||
provided = kwds.get(param, _parser_defaults[param]) | ||
|
||
# Messages for conflicting values between the dialect instance | ||
# and the actual parameters provided. | ||
conflict_msgs = [] | ||
|
||
if dialect_val != provided: | ||
conflict_msgs.append(( | ||
"Conflicting values for '{param}': '{val}' was " | ||
"provided, but the dialect specifies '{diaval}'. " | ||
"Using the dialect-specified value.".format( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See, this is contrary to your previous statement. Is the dialect or the override used? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure what is unclear about "dialect-specified value." For example, if you pass in explicitly |
||
param=param, val=provided, diaval=dialect_val))) | ||
|
||
if conflict_msgs: | ||
warnings.warn('\n\n'.join(conflict_msgs), ParserWarning, | ||
stacklevel=2) | ||
kwds[param] = dialect_val | ||
|
||
if kwds.get('header', 'infer') == 'infer': | ||
kwds['header'] = 0 if kwds.get('names') is None else None | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
Tests that dialects are properly handled during parsing | ||
for all of the parsers defined in parsers.py | ||
""" | ||
|
||
import csv | ||
|
||
from pandas import DataFrame | ||
from pandas.compat import StringIO | ||
from pandas.io.common import ParserWarning | ||
|
||
import pandas.util.testing as tm | ||
|
||
|
||
class DialectTests(object):
    """Tests that csv dialects are handled properly during parsing.

    Every test calls ``self.read_csv``, which this class does not define;
    presumably it is supplied by the concrete parser test class that mixes
    this one in (TODO confirm against the test harness).
    """

    def test_dialect(self):
        """A ``csv.Dialect`` instance with ``QUOTE_NONE`` is honoured."""
        data = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""

        dia = csv.excel()
        dia.quoting = csv.QUOTE_NONE
        # Reading with this modified dialect is expected to emit a
        # ParserWarning.
        with tm.assert_produces_warning(ParserWarning):
            df = self.read_csv(StringIO(data), dialect=dia)

        # Build the expected frame from unquoted data, then put the literal
        # quote character back into the cell that carried it.
        data = '''\
label1,label2,label3
index1,a,c,e
index2,b,d,f
'''
        exp = self.read_csv(StringIO(data))
        exp.replace('a', '"a', inplace=True)
        tm.assert_frame_equal(df, exp)

    def test_dialect_str(self):
        """A dialect may be passed by its registered name."""
        data = """\
fruit:vegetable
apple:brocolli
pear:tomato
"""
        exp = DataFrame({
            'fruit': ['apple', 'pear'],
            'vegetable': ['brocolli', 'tomato']
        })
        csv.register_dialect('mydialect', delimiter=':')
        try:
            with tm.assert_produces_warning(ParserWarning):
                df = self.read_csv(StringIO(data), dialect='mydialect')

            tm.assert_frame_equal(df, exp)
        finally:
            # The csv dialect registry is process-global: always remove the
            # entry, even when an assertion above fails, so that it cannot
            # leak into other tests.
            csv.unregister_dialect('mydialect')

    def test_invalid_dialect(self):
        """An object lacking the dialect attributes raises ValueError."""
        class InvalidDialect(object):
            pass

        data = 'a\n1'
        msg = 'Invalid dialect'

        with tm.assertRaisesRegexp(ValueError, msg):
            self.read_csv(StringIO(data), dialect=InvalidDialect)

    def test_dialect_conflict(self):
        """Warn only when an explicit value disagrees with the dialect."""
        data = 'a,b\n1,2'
        dialect = 'excel'
        exp = DataFrame({'a': [1], 'b': [2]})

        # The explicit delimiter matches the one passed alongside the
        # 'excel' dialect: no warning is expected.
        with tm.assert_produces_warning(None):
            df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect)
            tm.assert_frame_equal(df, exp)

        # A different explicit delimiter: a ParserWarning is expected, and
        # the parsed result must still equal ``exp``.
        with tm.assert_produces_warning(ParserWarning):
            df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect)
            tm.assert_frame_equal(df, exp)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems kind of odd to check for. In other words, are you validating the dialect itself?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In other words, has this ever happened in tests?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I am validating the dialect itself (because we don't check otherwise). I can add a test for this to make sure.