Skip to content

REF/CLN: pandas/io/parsers.py #36852

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Oct 14, 2020
178 changes: 107 additions & 71 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,7 @@ def read_csv(
del kwds["filepath_or_buffer"]
del kwds["sep"]

kwds_defaults = _check_defaults_read(
kwds_defaults = _refine_defaults_read(
dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": ","}
)
kwds.update(kwds_defaults)
Expand Down Expand Up @@ -684,7 +684,7 @@ def read_table(
del kwds["filepath_or_buffer"]
del kwds["sep"]

kwds_defaults = _check_defaults_read(
kwds_defaults = _refine_defaults_read(
dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": "\t"}
)
kwds.update(kwds_defaults)
Expand Down Expand Up @@ -789,71 +789,23 @@ def __init__(self, f, engine=None, **kwds):
else:
engine = "python"
engine_specified = False

self.engine = engine
self._engine_specified = kwds.get("engine_specified", engine_specified)

_validate_skipfooter(kwds)

if kwds.get("dialect") is not None:
dialect = kwds["dialect"]
if dialect in csv.list_dialects():
dialect = csv.get_dialect(dialect)

# Any valid dialect should have these attributes.
# If any are missing, we will raise automatically.
for param in (
"delimiter",
"doublequote",
"escapechar",
"skipinitialspace",
"quotechar",
"quoting",
):
try:
dialect_val = getattr(dialect, param)
except AttributeError as err:
raise ValueError(
f"Invalid dialect {kwds['dialect']} provided"
) from err
parser_default = _parser_defaults[param]
provided = kwds.get(param, parser_default)

# Messages for conflicting values between the dialect
# instance and the actual parameters provided.
conflict_msgs = []

# Don't warn if the default parameter was passed in,
# even if it conflicts with the dialect (gh-23761).
if provided != parser_default and provided != dialect_val:
msg = (
f"Conflicting values for '{param}': '{provided}' was "
f"provided, but the dialect specifies '{dialect_val}'. "
"Using the dialect-specified value."
)

# Annoying corner case for not warning about
# conflicts between dialect and delimiter parameter.
# Refer to the outer "_read_" function for more info.
if not (param == "delimiter" and kwds.pop("sep_override", False)):
conflict_msgs.append(msg)

if conflict_msgs:
warnings.warn(
"\n\n".join(conflict_msgs), ParserWarning, stacklevel=2
)
kwds[param] = dialect_val

if kwds.get("skipfooter"):
if kwds.get("iterator") or kwds.get("chunksize"):
raise ValueError("'skipfooter' not supported for 'iteration'")
if kwds.get("nrows"):
raise ValueError("'skipfooter' not supported with 'nrows'")
kwds = _merge_with_dialect_properties(dialect, kwds)

if kwds.get("header", "infer") == "infer":
kwds["header"] = 0 if kwds.get("names") is None else None

self.orig_options = kwds

# miscellanea
self.engine = engine
self._currow = 0

options = self._get_options_with_defaults(engine)
Expand Down Expand Up @@ -928,7 +880,6 @@ def _check_file_or_buffer(self, f, engine):
def _clean_options(self, options, engine):
result = options.copy()

engine_specified = self._engine_specified
fallback_reason = None

# C engine not supported yet
Expand Down Expand Up @@ -992,7 +943,7 @@ def _clean_options(self, options, engine):
)
engine = "python"

if fallback_reason and engine_specified:
if fallback_reason and self._engine_specified:
raise ValueError(fallback_reason)

if engine == "c":
Expand Down Expand Up @@ -1028,25 +979,18 @@ def _clean_options(self, options, engine):

validate_header_arg(options["header"])

depr_warning = ""

for arg in _deprecated_args:
parser_default = _c_parser_defaults[arg]
depr_default = _deprecated_defaults[arg]

msg = (
f"The {repr(arg)} argument has been deprecated and will be "
"removed in a future version."
)

if result.get(arg, depr_default) != depr_default:
depr_warning += msg + "\n\n"
msg = (
f"The {arg} argument has been deprecated and will be "
"removed in a future version.\n\n"
)
warnings.warn(msg, FutureWarning, stacklevel=2)
else:
result[arg] = parser_default

if depr_warning != "":
warnings.warn(depr_warning, FutureWarning, stacklevel=2)

if index_col is True:
raise ValueError("The value of index_col couldn't be 'True'")
if _is_index_col(index_col):
Expand Down Expand Up @@ -3706,15 +3650,15 @@ def _make_reader(self, f):
)


def _check_defaults_read(
def _refine_defaults_read(
dialect: Union[str, csv.Dialect],
delimiter: Union[str, object],
delim_whitespace: bool,
engine: str,
sep: Union[str, object],
defaults: Dict[str, Any],
):
"""Check default values of input parameters of read_csv, read_table.
"""Validate/refine default values of input parameters of read_csv, read_table.

Parameters
----------
Expand Down Expand Up @@ -3766,7 +3710,7 @@ def _check_defaults_read(
# the comparison to dialect values by checking if default values
# for BOTH "delimiter" and "sep" were provided.
if dialect is not None:
kwds["sep_override"] = (delimiter is None) and (
kwds["sep_override"] = delimiter is None and (
sep is lib.no_default or sep == delim_default
)

Expand All @@ -3793,3 +3737,95 @@ def _check_defaults_read(
kwds["engine_specified"] = False

return kwds


def _merge_with_dialect_properties(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reverse these (always put kwds things last).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rename kwds -> defaults

so similar to the _check_defaults_read

also not averse to renaming _check_defaults_read to something better.

Copy link
Member Author

@ivanovmg ivanovmg Oct 11, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have done it.
Also, I found the issue with the ValueError message, when dialect is incorrect.
Originally, kwds['dialect'], but I decided to use dialect.__name__ for better readability, which is not accepted by mypy (throws error "Dialect" has no attribute "__name__" [attr-defined]).
So, now I use string representation of dialect instance, however I can still pass one extra parameter (dialect_name) to _merge_with_dialect_properties and use it in the error message.

dialect: csv.Dialect,
defaults: Dict[str, Any],
) -> Dict[str, Any]:
"""
Merge default kwargs in TextFileReader with dialect parameters.

Parameters
----------
dialect : csv.Dialect
Concrete csv dialect. See csv.Dialect documentation for more details.
defaults : dict
Keyword arguments passed to TextFileReader.

Returns
-------
kwds : dict
Updated keyword arguments, merged with dialect parameters.

Raises
------
ValueError
If incorrect dialect is provided.
"""
kwds = defaults.copy()

# Any valid dialect should have these attributes.
# If any are missing, we will raise automatically.
mandatory_dialect_attrs = (
"delimiter",
"doublequote",
"escapechar",
"skipinitialspace",
"quotechar",
"quoting",
)

for param in mandatory_dialect_attrs:
try:
dialect_val = getattr(dialect, param)
except AttributeError as err:
raise ValueError(f"Invalid dialect {dialect} provided") from err

parser_default = _parser_defaults[param]
provided = kwds.get(param, parser_default)

# Messages for conflicting values between the dialect
# instance and the actual parameters provided.
conflict_msgs = []

# Don't warn if the default parameter was passed in,
# even if it conflicts with the dialect (gh-23761).
if provided != parser_default and provided != dialect_val:
msg = (
f"Conflicting values for '{param}': '{provided}' was "
f"provided, but the dialect specifies '{dialect_val}'. "
"Using the dialect-specified value."
)

# Annoying corner case for not warning about
# conflicts between dialect and delimiter parameter.
# Refer to the outer "_read_" function for more info.
if not (param == "delimiter" and kwds.pop("sep_override", False)):
conflict_msgs.append(msg)

if conflict_msgs:
warnings.warn("\n\n".join(conflict_msgs), ParserWarning, stacklevel=2)
kwds[param] = dialect_val
return kwds


def _validate_skipfooter(kwds: Dict[str, Any]) -> None:
"""
Check whether skipfooter is compatible with other kwargs in TextFileReader.

Parameters
----------
kwds : dict
Keyword arguments passed to TextFileReader.

Raises
------
ValueError
If skipfooter is not compatible with other parameters.
"""
if kwds.get("skipfooter"):
if kwds.get("iterator") or kwds.get("chunksize"):
raise ValueError("'skipfooter' not supported for 'iteration'")
if kwds.get("nrows"):
raise ValueError("'skipfooter' not supported with 'nrows'")