Skip to content

Fix delim_whitespace behavior in read_table, read_csv #36709

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 8, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ I/O
- Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`)
- Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`)
- Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`)
- BUG in :func:`read_table` and :func:`read_csv` when ``delim_whitespace=True`` and ``sep=default`` (:issue:`36583`)

Plotting
^^^^^^^^
Expand Down
162 changes: 60 additions & 102 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
)
def read_csv(
filepath_or_buffer: FilePathOrBuffer,
sep=",",
sep=lib.no_default,
delimiter=None,
# Column and Index Locations and Names
header="infer",
Expand Down Expand Up @@ -600,95 +600,7 @@ def read_csv(
float_precision=None,
storage_options: StorageOptions = None,
):
# gh-23761
#
# When a dialect is passed, it overrides any of the overlapping
# parameters passed in directly. We don't want to warn if the
# default parameters were passed in (since it probably means
# that the user didn't pass them in explicitly in the first place).
#
# "delimiter" is the annoying corner case because we alias it to
# "sep" before doing comparison to the dialect values later on.
# Thus, we need a flag to indicate that we need to "override"
# the comparison to dialect values by checking if default values
# for BOTH "delimiter" and "sep" were provided.
default_sep = ","

if dialect is not None:
sep_override = delimiter is None and sep == default_sep
kwds = dict(sep_override=sep_override)
else:
kwds = dict()

# Alias sep -> delimiter.
if delimiter is None:
delimiter = sep

if delim_whitespace and delimiter != default_sep:
raise ValueError(
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)

if engine is not None:
engine_specified = True
else:
engine = "c"
engine_specified = False

kwds.update(
delimiter=delimiter,
engine=engine,
dialect=dialect,
compression=compression,
engine_specified=engine_specified,
doublequote=doublequote,
escapechar=escapechar,
quotechar=quotechar,
quoting=quoting,
skipinitialspace=skipinitialspace,
lineterminator=lineterminator,
header=header,
index_col=index_col,
names=names,
prefix=prefix,
skiprows=skiprows,
skipfooter=skipfooter,
na_values=na_values,
true_values=true_values,
false_values=false_values,
keep_default_na=keep_default_na,
thousands=thousands,
comment=comment,
decimal=decimal,
parse_dates=parse_dates,
keep_date_col=keep_date_col,
dayfirst=dayfirst,
date_parser=date_parser,
cache_dates=cache_dates,
nrows=nrows,
iterator=iterator,
chunksize=chunksize,
converters=converters,
dtype=dtype,
usecols=usecols,
verbose=verbose,
encoding=encoding,
squeeze=squeeze,
memory_map=memory_map,
float_precision=float_precision,
na_filter=na_filter,
delim_whitespace=delim_whitespace,
warn_bad_lines=warn_bad_lines,
error_bad_lines=error_bad_lines,
low_memory=low_memory,
mangle_dupe_cols=mangle_dupe_cols,
infer_datetime_format=infer_datetime_format,
skip_blank_lines=skip_blank_lines,
storage_options=storage_options,
)

return _read(filepath_or_buffer, kwds)
return _check_defaults_read(locals(), {"delimiter": ","})


@Appender(
Expand All @@ -700,7 +612,7 @@ def read_csv(
)
def read_table(
filepath_or_buffer: FilePathOrBuffer,
sep="\t",
sep=lib.no_default,
delimiter=None,
# Column and Index Locations and Names
header="infer",
Expand Down Expand Up @@ -757,17 +669,7 @@ def read_table(
memory_map=False,
float_precision=None,
):
# TODO: validation duplicated in read_csv
if delim_whitespace and (delimiter is not None or sep != "\t"):
raise ValueError(
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
if delim_whitespace:
# In this case sep is not used so we set it to the read_csv
# default to avoid a ValueError
sep = ","
return read_csv(**locals())
return _check_defaults_read(locals(), {"delimiter": "\t"})


def read_fwf(
Expand Down Expand Up @@ -3782,3 +3684,59 @@ def _make_reader(self, f):
self.skiprows,
self.infer_nrows,
)


def _check_defaults_read(read_kwds: dict, defaults: dict):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we instead pass by keyword args (you can still pass the defaults as a dict i think), type is using Dict[str, Any] though

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok

"""Check default values of input parameters, read a line file."""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should return a new dict of the paramaters, it should actually do the reading; leave that to the caller.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok

engine = read_kwds["engine"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls add a full-on doc-string here with the Parameters, Returns, and Raises section.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok

dialect = read_kwds["dialect"]
delimiter = read_kwds["delimiter"]
delim_whitespace = read_kwds["delim_whitespace"]
filepath_or_buffer = read_kwds["filepath_or_buffer"]
sep = read_kwds["sep"]

del read_kwds["filepath_or_buffer"]
del read_kwds["sep"]

delimiter_default = defaults["delimiter"]

# gh-23761
#
# When a dialect is passed, it overrides any of the overlapping
# parameters passed in directly. We don't want to warn if the
# default parameters were passed in (since it probably means
# that the user didn't pass them in explicitly in the first place).
#
# "delimiter" is the annoying corner case because we alias it to
# "sep" before doing comparison to the dialect values later on.
# Thus, we need a flag to indicate that we need to "override"
# the comparison to dialect values by checking if default values
# for BOTH "delimiter" and "sep" were provided.
if dialect is not None:
read_kwds["sep_override"] = (delimiter is None) and (
sep is lib.no_default or sep == delimiter_default
)

# Alias sep -> delimiter.
if delimiter is None:
delimiter = sep

if delim_whitespace and (delimiter is not lib.no_default):
raise ValueError(
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)

if delimiter is lib.no_default:
# assign default separator value
read_kwds["delimiter"] = delimiter_default
else:
read_kwds["delimiter"] = delimiter

if engine is not None:
read_kwds["engine_specified"] = True
else:
read_kwds["engine"] = "c"
read_kwds["engine_specified"] = False

return _read(filepath_or_buffer, read_kwds)
24 changes: 22 additions & 2 deletions pandas/tests/io/parser/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2211,7 +2211,8 @@ def test_read_table_delim_whitespace_default_sep(all_parsers):
tm.assert_frame_equal(result, expected)


def test_read_table_delim_whitespace_non_default_sep(all_parsers):
@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
# GH: 35958
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers
Expand All @@ -2220,4 +2221,23 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers):
"delim_whitespace=True; you can only specify one."
)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=",")
parser.read_csv(f, delim_whitespace=True, sep=delimiter)

with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)


@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
# GH: 35958
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers
msg = (
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=delimiter)

with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)