Skip to content

REF/CLN: pandas/io/parsers.py #36852

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Oct 14, 2020
166 changes: 100 additions & 66 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -787,71 +787,23 @@ def __init__(self, f, engine=None, **kwds):
else:
engine = "python"
engine_specified = False

self.engine = engine
self._engine_specified = kwds.get("engine_specified", engine_specified)

_validate_skipfooter(kwds)

if kwds.get("dialect") is not None:
dialect = kwds["dialect"]
if dialect in csv.list_dialects():
dialect = csv.get_dialect(dialect)

# Any valid dialect should have these attributes.
# If any are missing, we will raise automatically.
for param in (
"delimiter",
"doublequote",
"escapechar",
"skipinitialspace",
"quotechar",
"quoting",
):
try:
dialect_val = getattr(dialect, param)
except AttributeError as err:
raise ValueError(
f"Invalid dialect {kwds['dialect']} provided"
) from err
parser_default = _parser_defaults[param]
provided = kwds.get(param, parser_default)

# Messages for conflicting values between the dialect
# instance and the actual parameters provided.
conflict_msgs = []

# Don't warn if the default parameter was passed in,
# even if it conflicts with the dialect (gh-23761).
if provided != parser_default and provided != dialect_val:
msg = (
f"Conflicting values for '{param}': '{provided}' was "
f"provided, but the dialect specifies '{dialect_val}'. "
"Using the dialect-specified value."
)

# Annoying corner case for not warning about
# conflicts between dialect and delimiter parameter.
# Refer to the outer "_read_" function for more info.
if not (param == "delimiter" and kwds.pop("sep_override", False)):
conflict_msgs.append(msg)

if conflict_msgs:
warnings.warn(
"\n\n".join(conflict_msgs), ParserWarning, stacklevel=2
)
kwds[param] = dialect_val

if kwds.get("skipfooter"):
if kwds.get("iterator") or kwds.get("chunksize"):
raise ValueError("'skipfooter' not supported for 'iteration'")
if kwds.get("nrows"):
raise ValueError("'skipfooter' not supported with 'nrows'")
kwds = _merge_with_dialect_properties(kwds, dialect)

if kwds.get("header", "infer") == "infer":
kwds["header"] = 0 if kwds.get("names") is None else None

self.orig_options = kwds

# miscellanea
self.engine = engine
self._currow = 0

options = self._get_options_with_defaults(engine)
Expand Down Expand Up @@ -926,7 +878,6 @@ def _check_file_or_buffer(self, f, engine):
def _clean_options(self, options, engine):
result = options.copy()

engine_specified = self._engine_specified
fallback_reason = None

# C engine not supported yet
Expand Down Expand Up @@ -990,7 +941,7 @@ def _clean_options(self, options, engine):
)
engine = "python"

if fallback_reason and engine_specified:
if fallback_reason and self._engine_specified:
raise ValueError(fallback_reason)

if engine == "c":
Expand Down Expand Up @@ -1026,25 +977,18 @@ def _clean_options(self, options, engine):

validate_header_arg(options["header"])

depr_warning = ""

for arg in _deprecated_args:
parser_default = _c_parser_defaults[arg]
depr_default = _deprecated_defaults[arg]

msg = (
f"The {repr(arg)} argument has been deprecated and will be "
"removed in a future version."
)

if result.get(arg, depr_default) != depr_default:
depr_warning += msg + "\n\n"
msg = (
f"The {arg} argument has been deprecated and will be "
"removed in a future version.\n\n"
)
warnings.warn(msg, FutureWarning, stacklevel=2)
else:
result[arg] = parser_default

if depr_warning != "":
warnings.warn(depr_warning, FutureWarning, stacklevel=2)

if index_col is True:
raise ValueError("The value of index_col couldn't be 'True'")
if _is_index_col(index_col):
Expand Down Expand Up @@ -3791,3 +3735,93 @@ def _check_defaults_read(
kwds["engine_specified"] = False

return kwds


def _merge_with_dialect_properties(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reverse these (always put kwds things last).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rename kwds -> defaults

so similar to the _check_defaults_read

also not averse to renaming _check_defaults_read to something better.

Copy link
Member Author

@ivanovmg ivanovmg Oct 11, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have done it.
Also, I found the issue with the ValueError message, when dialect is incorrect.
Originally, kwds['dialect'], but I decided to use dialect.__name__ for better readability, which is not accepted by mypy (throws error "Dialect" has no attribute "__name__" [attr-defined]).
So, now I use string representation of dialect instance, however I can still pass one extra parameter (dialect_name) to _merge_with_dialect_properties and use it in the error message.

kwds: Dict[str, Any],
dialect: csv.Dialect,
) -> Dict[str, Any]:
"""
Merge kwargs in TextFileReader with dialect parameters.

Parameters
----------
kwds : dict
Keyword arguments passed to TextFileReader.
dialect : csv.Dialect
Concrete csv dialect. See csv.Dialect documentation for more details.

Returns
-------
kwds : dict
Updated keyword arguments, merged with dialect parameters.

Raises
------
ValueError
If incorrect dialect is provided.
"""
# Any valid dialect should have these attributes.
# If any are missing, we will raise automatically.
mandatory_dialect_attrs = (
"delimiter",
"doublequote",
"escapechar",
"skipinitialspace",
"quotechar",
"quoting",
)

for param in mandatory_dialect_attrs:
try:
dialect_val = getattr(dialect, param)
except AttributeError as err:
raise ValueError(f"Invalid dialect {dialect.__name__} provided") from err

parser_default = _parser_defaults[param]
provided = kwds.get(param, parser_default)

# Messages for conflicting values between the dialect
# instance and the actual parameters provided.
conflict_msgs = []

# Don't warn if the default parameter was passed in,
# even if it conflicts with the dialect (gh-23761).
if provided != parser_default and provided != dialect_val:
msg = (
f"Conflicting values for '{param}': '{provided}' was "
f"provided, but the dialect specifies '{dialect_val}'. "
"Using the dialect-specified value."
)

# Annoying corner case for not warning about
# conflicts between dialect and delimiter parameter.
# Refer to the outer "_read_" function for more info.
if not (param == "delimiter" and kwds.pop("sep_override", False)):
conflict_msgs.append(msg)

if conflict_msgs:
warnings.warn("\n\n".join(conflict_msgs), ParserWarning, stacklevel=2)
kwds[param] = dialect_val
return kwds


def _validate_skipfooter(kwds: Dict[str, Any]) -> None:
"""
Check whether skipfooter is compatible with other kwargs in TextFileReader.

Parameters
----------
kwds : dict
Keyword arguments passed to TextFileReader.

Raises
------
ValueError
If skipfooter is not compatible with other parameters.
"""
if kwds.get("skipfooter"):
if kwds.get("iterator") or kwds.get("chunksize"):
raise ValueError("'skipfooter' not supported for 'iteration'")
if kwds.get("nrows"):
raise ValueError("'skipfooter' not supported with 'nrows'")