Skip to content

Fix delim_whitespace behavior in read_table, read_csv #36709

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ I/O
- Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`)
- Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`)
- Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`)
- Bug in :func:`read_table` and :func:`read_csv` when ``delim_whitespace=True`` and ``sep=default`` (:issue:`36583`)
- Bug in :meth:`read_parquet` with fixed offset timezones. String representation of timezones was not recognized (:issue:`35997`, :issue:`36004`)

Plotting
Expand Down
205 changes: 107 additions & 98 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
)
def read_csv(
filepath_or_buffer: FilePathOrBuffer,
sep=",",
sep=lib.no_default,
delimiter=None,
# Column and Index Locations and Names
header="infer",
Expand Down Expand Up @@ -600,93 +600,14 @@ def read_csv(
float_precision=None,
storage_options: StorageOptions = None,
):
# gh-23761
#
# When a dialect is passed, it overrides any of the overlapping
# parameters passed in directly. We don't want to warn if the
# default parameters were passed in (since it probably means
# that the user didn't pass them in explicitly in the first place).
#
# "delimiter" is the annoying corner case because we alias it to
# "sep" before doing comparison to the dialect values later on.
# Thus, we need a flag to indicate that we need to "override"
# the comparison to dialect values by checking if default values
# for BOTH "delimiter" and "sep" were provided.
default_sep = ","

if dialect is not None:
sep_override = delimiter is None and sep == default_sep
kwds = dict(sep_override=sep_override)
else:
kwds = dict()

# Alias sep -> delimiter.
if delimiter is None:
delimiter = sep
kwds = locals()
del kwds["filepath_or_buffer"]
del kwds["sep"]

if delim_whitespace and delimiter != default_sep:
raise ValueError(
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)

if engine is not None:
engine_specified = True
else:
engine = "c"
engine_specified = False

kwds.update(
delimiter=delimiter,
engine=engine,
dialect=dialect,
compression=compression,
engine_specified=engine_specified,
doublequote=doublequote,
escapechar=escapechar,
quotechar=quotechar,
quoting=quoting,
skipinitialspace=skipinitialspace,
lineterminator=lineterminator,
header=header,
index_col=index_col,
names=names,
prefix=prefix,
skiprows=skiprows,
skipfooter=skipfooter,
na_values=na_values,
true_values=true_values,
false_values=false_values,
keep_default_na=keep_default_na,
thousands=thousands,
comment=comment,
decimal=decimal,
parse_dates=parse_dates,
keep_date_col=keep_date_col,
dayfirst=dayfirst,
date_parser=date_parser,
cache_dates=cache_dates,
nrows=nrows,
iterator=iterator,
chunksize=chunksize,
converters=converters,
dtype=dtype,
usecols=usecols,
verbose=verbose,
encoding=encoding,
squeeze=squeeze,
memory_map=memory_map,
float_precision=float_precision,
na_filter=na_filter,
delim_whitespace=delim_whitespace,
warn_bad_lines=warn_bad_lines,
error_bad_lines=error_bad_lines,
low_memory=low_memory,
mangle_dupe_cols=mangle_dupe_cols,
infer_datetime_format=infer_datetime_format,
skip_blank_lines=skip_blank_lines,
storage_options=storage_options,
kwds_defaults = _check_defaults_read(
dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": ","}
)
kwds.update(kwds_defaults)

return _read(filepath_or_buffer, kwds)

Expand All @@ -700,7 +621,7 @@ def read_csv(
)
def read_table(
filepath_or_buffer: FilePathOrBuffer,
sep="\t",
sep=lib.no_default,
delimiter=None,
# Column and Index Locations and Names
header="infer",
Expand Down Expand Up @@ -757,17 +678,16 @@ def read_table(
memory_map=False,
float_precision=None,
):
# TODO: validation duplicated in read_csv
if delim_whitespace and (delimiter is not None or sep != "\t"):
raise ValueError(
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
if delim_whitespace:
# In this case sep is not used so we set it to the read_csv
# default to avoid a ValueError
sep = ","
return read_csv(**locals())
kwds = locals()
del kwds["filepath_or_buffer"]
del kwds["sep"]

kwds_defaults = _check_defaults_read(
dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": "\t"}
)
kwds.update(kwds_defaults)

return _read(filepath_or_buffer, kwds)


def read_fwf(
Expand Down Expand Up @@ -3782,3 +3702,92 @@ def _make_reader(self, f):
self.skiprows,
self.infer_nrows,
)


def _check_defaults_read(
dialect: Union[str, csv.Dialect],
delimiter: Union[str, object],
delim_whitespace: bool,
engine: str,
sep: Union[str, object],
defaults: Dict[str, Any],
):
"""Check default values of input parameters of read_csv, read_table.

Parameters
----------
dialect : str or csv.Dialect
If provided, this parameter will override values (default or not) for the
following parameters: `delimiter`, `doublequote`, `escapechar`,
`skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
override values, a ParserWarning will be issued. See csv.Dialect
documentation for more details.
delimiter : str or object
Alias for sep.
delim_whitespace : bool
Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
is set to True, nothing should be passed in for the ``delimiter``
parameter.
engine : {{'c', 'python'}}
Parser engine to use. The C engine is faster while the python engine is
currently more feature-complete.
sep : str or object
A delimiter provided by the user (str) or a sentinel value, i.e.
pandas._libs.lib.no_default.
defaults: dict
Default values of input parameters.

Returns
-------
kwds : dict
Input parameters with correct values.

Raises
------
ValueError : If a delimiter was specified with ``sep`` (or ``delimiter``) and
``delim_whitespace=True``.
"""
# fix types for sep, delimiter to Union(str, Any)
delim_default = defaults["delimiter"]
kwds: Dict[str, Any] = {}
# gh-23761
#
# When a dialect is passed, it overrides any of the overlapping
# parameters passed in directly. We don't want to warn if the
# default parameters were passed in (since it probably means
# that the user didn't pass them in explicitly in the first place).
#
# "delimiter" is the annoying corner case because we alias it to
# "sep" before doing comparison to the dialect values later on.
# Thus, we need a flag to indicate that we need to "override"
# the comparison to dialect values by checking if default values
# for BOTH "delimiter" and "sep" were provided.
if dialect is not None:
kwds["sep_override"] = (delimiter is None) and (
sep is lib.no_default or sep == delim_default
)

# Alias sep -> delimiter.
if delimiter is None:
delimiter = sep

if delim_whitespace and (delimiter is not lib.no_default):
raise ValueError(
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)

if delimiter is lib.no_default:
# assign default separator value
kwds["delimiter"] = delim_default
else:
kwds["delimiter"] = delimiter

if engine is not None:
kwds["engine_specified"] = True
else:
kwds["engine"] = "c"
kwds["engine_specified"] = False

return kwds
24 changes: 22 additions & 2 deletions pandas/tests/io/parser/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2211,7 +2211,8 @@ def test_read_table_delim_whitespace_default_sep(all_parsers):
tm.assert_frame_equal(result, expected)


def test_read_table_delim_whitespace_non_default_sep(all_parsers):
@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
# GH: 35958
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers
Expand All @@ -2220,4 +2221,23 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers):
"delim_whitespace=True; you can only specify one."
)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=",")
parser.read_csv(f, delim_whitespace=True, sep=delimiter)

with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)


@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
# GH: 35958
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers
msg = (
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=delimiter)

with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)