Skip to content

Commit 2946745

Browse files
gfyoung authored and jreback committed
BUG: Don't warn if default conflicts with dialect (#23775)
xref gh-23761.
1 parent 3d6d873 commit 2946745

File tree

4 files changed

+133
-30
lines changed

4 files changed

+133
-30
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1385,6 +1385,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
13851385
- Bug in :func:`read_csv()` in which memory leaks occurred in the C engine when parsing ``NaN`` values due to insufficient cleanup on completion or error (:issue:`21353`)
13861386
- Bug in :func:`read_csv()` in which incorrect error messages were being raised when ``skipfooter`` was passed in along with ``nrows``, ``iterator``, or ``chunksize`` (:issue:`23711`)
13871387
- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled in the cases when they were not provided (:issue:`23484`)
1388+
- Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`)
13881389
- Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
13891390
- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`)
13901391
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`)

pandas/io/parsers.py

+37-10
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,24 @@ def parser_f(filepath_or_buffer,
632632
if sep is False:
633633
sep = default_sep
634634

635+
# gh-23761
636+
#
637+
# When a dialect is passed, it overrides any of the overlapping
638+
# parameters passed in directly. We don't want to warn if the
639+
# default parameters were passed in (since it probably means
640+
# that the user didn't pass them in explicitly in the first place).
641+
#
642+
# "delimiter" is the annoying corner case because we alias it to
643+
# "sep" before doing comparison to the dialect values later on.
644+
# Thus, we need a flag to indicate that we need to "override"
645+
# the comparison to dialect values by checking if default values
646+
# for BOTH "delimiter" and "sep" were provided.
647+
if dialect is not None:
648+
sep_override = delimiter is None and sep == default_sep
649+
kwds = dict(sep_override=sep_override)
650+
else:
651+
kwds = dict()
652+
635653
# Alias sep -> delimiter.
636654
if delimiter is None:
637655
delimiter = sep
@@ -647,7 +665,7 @@ def parser_f(filepath_or_buffer,
647665
engine = 'c'
648666
engine_specified = False
649667

650-
kwds = dict(delimiter=delimiter,
668+
kwds.update(delimiter=delimiter,
651669
engine=engine,
652670
dialect=dialect,
653671
compression=compression,
@@ -769,18 +787,27 @@ def __init__(self, f, engine=None, **kwds):
769787
except AttributeError:
770788
raise ValueError("Invalid dialect '{dialect}' provided"
771789
.format(dialect=kwds['dialect']))
772-
provided = kwds.get(param, _parser_defaults[param])
790+
parser_default = _parser_defaults[param]
791+
provided = kwds.get(param, parser_default)
773792

774-
# Messages for conflicting values between the dialect instance
775-
# and the actual parameters provided.
793+
# Messages for conflicting values between the dialect
794+
# instance and the actual parameters provided.
776795
conflict_msgs = []
777796

778-
if dialect_val != provided:
779-
conflict_msgs.append((
780-
"Conflicting values for '{param}': '{val}' was "
781-
"provided, but the dialect specifies '{diaval}'. "
782-
"Using the dialect-specified value.".format(
783-
param=param, val=provided, diaval=dialect_val)))
797+
# Don't warn if the default parameter was passed in,
798+
# even if it conflicts with the dialect (gh-23761).
799+
if provided != parser_default and provided != dialect_val:
800+
msg = ("Conflicting values for '{param}': '{val}' was "
801+
"provided, but the dialect specifies '{diaval}'. "
802+
"Using the dialect-specified value.".format(
803+
param=param, val=provided, diaval=dialect_val))
804+
805+
# Annoying corner case for not warning about
806+
# conflicts between dialect and delimiter parameter.
807+
# Refer to the outer "_read_" function for more info.
808+
if not (param == "delimiter" and
809+
kwds.pop("sep_override", False)):
810+
conflict_msgs.append(msg)
784811

785812
if conflict_msgs:
786813
warnings.warn('\n\n'.join(conflict_msgs), ParserWarning,

pandas/tests/io/parser/test_dialect.py

+64-20
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@
1616
import pandas.util.testing as tm
1717

1818

19+
@pytest.fixture
def custom_dialect():
    """
    Provide the name and parameters of a non-standard CSV dialect.

    Returns
    -------
    tuple
        ``(name, params)`` where ``name`` is the dialect name and ``params``
        is a dict of keyword arguments describing the dialect.
    """
    name = "weird"
    params = {
        "doublequote": False,
        "escapechar": "~",
        "delimiter": ":",
        "skipinitialspace": False,
        "quotechar": "~",
        "quoting": 3,  # Same value as csv.QUOTE_NONE.
    }
    return name, params
25+
26+
1927
def test_dialect(all_parsers):
2028
parser = all_parsers
2129
data = """\
@@ -26,10 +34,7 @@ def test_dialect(all_parsers):
2634

2735
dia = csv.excel()
2836
dia.quoting = csv.QUOTE_NONE
29-
30-
# Conflicting dialect quoting.
31-
with tm.assert_produces_warning(ParserWarning):
32-
df = parser.read_csv(StringIO(data), dialect=dia)
37+
df = parser.read_csv(StringIO(data), dialect=dia)
3338

3439
data = """\
3540
label1,label2,label3
@@ -53,14 +58,10 @@ def test_dialect_str(all_parsers):
5358
"fruit": ["apple", "pear"],
5459
"vegetable": ["broccoli", "tomato"]
5560
})
56-
csv.register_dialect(dialect_name, delimiter=":")
5761

58-
# Conflicting dialect delimiter.
59-
with tm.assert_produces_warning(ParserWarning):
62+
with tm.with_csv_dialect(dialect_name, delimiter=":"):
6063
df = parser.read_csv(StringIO(data), dialect=dialect_name)
61-
62-
tm.assert_frame_equal(df, exp)
63-
csv.unregister_dialect(dialect_name)
64+
tm.assert_frame_equal(df, exp)
6465

6566

6667
def test_invalid_dialect(all_parsers):
@@ -75,17 +76,60 @@ class InvalidDialect(object):
7576
parser.read_csv(StringIO(data), dialect=InvalidDialect)
7677

7778

78-
@pytest.mark.parametrize("delimiter", [",", "."])
79-
def test_dialect_conflict(all_parsers, delimiter):
80-
data = "a,b\n1,2"
81-
dialect = "excel"
79+
@pytest.mark.parametrize("arg", [None, "doublequote", "escapechar",
                                 "skipinitialspace", "quotechar", "quoting"])
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect,
                                           arg, value):
    """
    Check which dialect / keyword-argument combinations emit a warning.

    For every dialect attribute other than the delimiter, ``value`` selects
    what is passed alongside the dialect:

    * ``"dialect"``: the same value the dialect specifies (no warning).
    * ``"default"``: the parser default (no warning).
    * ``"other"``: a non-default, conflicting value (``ParserWarning``).

    ``arg=None`` covers passing the dialect with no other arguments.
    """
    # see gh-23761.
    dialect_name, dialect_kwargs = custom_dialect
    parser = all_parsers

    expected = DataFrame({"a": [1], "b": [2]})
    data = "a:b\n1:2"

    warning_klass = None
    kwds = dict()

    # arg=None tests when we pass in the dialect without any other arguments.
    if arg is not None:
        # Compare the parametrized ``value`` variable, not the string literal
        # "value"; with the literal, both conditions are always False and
        # only the final branch would ever run.
        if value == "dialect":  # No conflict --> no warning.
            kwds[arg] = dialect_kwargs[arg]
        elif value == "default":  # Default --> no warning.
            from pandas.io.parsers import _parser_defaults
            kwds[arg] = _parser_defaults[arg]
        else:  # Non-default + conflict with dialect --> warning.
            warning_klass = ParserWarning
            kwds[arg] = "blah"

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        with tm.assert_produces_warning(warning_klass):
            result = parser.read_csv(StringIO(data),
                                     dialect=dialect_name, **kwds)
            tm.assert_frame_equal(result, expected)
110+
111+
112+
@pytest.mark.parametrize("kwargs,warning_klass", [
    # sep is default --> sep_override=True
    pytest.param({"sep": ","}, None, id="sep-override-true"),
    # sep isn't default --> sep_override=False
    pytest.param({"sep": "."}, ParserWarning, id="sep-override-false"),
    # No conflict
    pytest.param({"delimiter": ":"}, None, id="delimiter-no-conflict"),
    # Default arguments --> sep_override=True
    pytest.param({"delimiter": None}, None, id="delimiter-default-arg"),
    # Conflict
    pytest.param({"delimiter": ","}, ParserWarning, id="delimiter-conflict"),
    # Conflict
    pytest.param({"delimiter": "."}, ParserWarning,
                 id="delimiter-conflict2"),
])
def test_dialect_conflict_delimiter(all_parsers, custom_dialect,
                                    kwargs, warning_klass):
    """
    Check warnings for ``sep`` / ``delimiter`` passed together with a dialect.

    Each case passes ``kwargs`` alongside the custom dialect and expects
    ``warning_klass`` to be emitted (``None`` meaning no warning), while the
    parsed frame always follows the dialect's delimiter.
    """
    # see gh-23761.
    name, params = custom_dialect

    csv_text = "a:b\n1:2"
    wanted = DataFrame({"a": [1], "b": [2]})

    with tm.with_csv_dialect(name, **params):
        with tm.assert_produces_warning(warning_klass):
            buf = StringIO(csv_text)
            result = all_parsers.read_csv(buf, dialect=name, **kwargs)
            tm.assert_frame_equal(result, wanted)

pandas/util/testing.py

+31
Original file line numberDiff line numberDiff line change
@@ -2835,6 +2835,37 @@ def __exit__(self, exc_type, exc_value, traceback):
28352835
np.random.set_state(self.start_state)
28362836

28372837

2838+
@contextmanager
def with_csv_dialect(name, **kwargs):
    """
    Context manager to temporarily register a CSV dialect for parsing CSV.

    The dialect is registered on entry and unregistered on exit, including
    when the body of the ``with`` block raises.

    Parameters
    ----------
    name : str
        The name of the dialect.
    kwargs : mapping
        The parameters for the dialect.

    Raises
    ------
    ValueError
        If the name of the dialect conflicts with a builtin one.

    See Also
    --------
    csv : Python's CSV library.
    """
    import csv
    _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"}

    if name in _BUILTIN_DIALECTS:
        raise ValueError("Cannot override builtin dialect.")

    csv.register_dialect(name, **kwargs)
    try:
        yield
    finally:
        # Unregister even if the managed block raised; otherwise the dialect
        # would stay in the global csv registry and leak into later callers.
        csv.unregister_dialect(name)
2867+
2868+
28382869
@contextmanager
28392870
def use_numexpr(use, min_elements=None):
28402871
from pandas.core.computation import expressions as expr

0 commit comments

Comments
 (0)