Skip to content

Commit eb0c443

Browse files
authored
Fix delim_whitespace behavior in read_table, read_csv (#36709)
1 parent 350d8cb commit eb0c443

File tree

3 files changed

+130
-100
lines changed

3 files changed

+130
-100
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,7 @@ I/O
416416
- Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`)
417417
- Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`)
418418
- Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`)
419+
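- Bug in :func:`read_table` and :func:`read_csv` where passing ``delim_whitespace=True`` together with an explicit ``sep`` or ``delimiter`` did not consistently raise a ``ValueError`` when the passed value was equal to the function's default separator (:issue:`36583`)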
419420
- Bug in :meth:`read_parquet` with fixed offset timezones. String representation of timezones was not recognized (:issue:`35997`, :issue:`36004`)
420421

421422
Plotting

pandas/io/parsers.py

+107-98
Original file line numberDiff line numberDiff line change
@@ -542,7 +542,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
542542
)
543543
def read_csv(
544544
filepath_or_buffer: FilePathOrBuffer,
545-
sep=",",
545+
sep=lib.no_default,
546546
delimiter=None,
547547
# Column and Index Locations and Names
548548
header="infer",
@@ -600,93 +600,14 @@ def read_csv(
600600
float_precision=None,
601601
storage_options: StorageOptions = None,
602602
):
603-
# gh-23761
604-
#
605-
# When a dialect is passed, it overrides any of the overlapping
606-
# parameters passed in directly. We don't want to warn if the
607-
# default parameters were passed in (since it probably means
608-
# that the user didn't pass them in explicitly in the first place).
609-
#
610-
# "delimiter" is the annoying corner case because we alias it to
611-
# "sep" before doing comparison to the dialect values later on.
612-
# Thus, we need a flag to indicate that we need to "override"
613-
# the comparison to dialect values by checking if default values
614-
# for BOTH "delimiter" and "sep" were provided.
615-
default_sep = ","
616-
617-
if dialect is not None:
618-
sep_override = delimiter is None and sep == default_sep
619-
kwds = dict(sep_override=sep_override)
620-
else:
621-
kwds = dict()
622-
623-
# Alias sep -> delimiter.
624-
if delimiter is None:
625-
delimiter = sep
603+
kwds = locals()
604+
del kwds["filepath_or_buffer"]
605+
del kwds["sep"]
626606

627-
if delim_whitespace and delimiter != default_sep:
628-
raise ValueError(
629-
"Specified a delimiter with both sep and "
630-
"delim_whitespace=True; you can only specify one."
631-
)
632-
633-
if engine is not None:
634-
engine_specified = True
635-
else:
636-
engine = "c"
637-
engine_specified = False
638-
639-
kwds.update(
640-
delimiter=delimiter,
641-
engine=engine,
642-
dialect=dialect,
643-
compression=compression,
644-
engine_specified=engine_specified,
645-
doublequote=doublequote,
646-
escapechar=escapechar,
647-
quotechar=quotechar,
648-
quoting=quoting,
649-
skipinitialspace=skipinitialspace,
650-
lineterminator=lineterminator,
651-
header=header,
652-
index_col=index_col,
653-
names=names,
654-
prefix=prefix,
655-
skiprows=skiprows,
656-
skipfooter=skipfooter,
657-
na_values=na_values,
658-
true_values=true_values,
659-
false_values=false_values,
660-
keep_default_na=keep_default_na,
661-
thousands=thousands,
662-
comment=comment,
663-
decimal=decimal,
664-
parse_dates=parse_dates,
665-
keep_date_col=keep_date_col,
666-
dayfirst=dayfirst,
667-
date_parser=date_parser,
668-
cache_dates=cache_dates,
669-
nrows=nrows,
670-
iterator=iterator,
671-
chunksize=chunksize,
672-
converters=converters,
673-
dtype=dtype,
674-
usecols=usecols,
675-
verbose=verbose,
676-
encoding=encoding,
677-
squeeze=squeeze,
678-
memory_map=memory_map,
679-
float_precision=float_precision,
680-
na_filter=na_filter,
681-
delim_whitespace=delim_whitespace,
682-
warn_bad_lines=warn_bad_lines,
683-
error_bad_lines=error_bad_lines,
684-
low_memory=low_memory,
685-
mangle_dupe_cols=mangle_dupe_cols,
686-
infer_datetime_format=infer_datetime_format,
687-
skip_blank_lines=skip_blank_lines,
688-
storage_options=storage_options,
607+
kwds_defaults = _check_defaults_read(
608+
dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": ","}
689609
)
610+
kwds.update(kwds_defaults)
690611

691612
return _read(filepath_or_buffer, kwds)
692613

@@ -700,7 +621,7 @@ def read_csv(
700621
)
701622
def read_table(
702623
filepath_or_buffer: FilePathOrBuffer,
703-
sep="\t",
624+
sep=lib.no_default,
704625
delimiter=None,
705626
# Column and Index Locations and Names
706627
header="infer",
@@ -757,17 +678,16 @@ def read_table(
757678
memory_map=False,
758679
float_precision=None,
759680
):
760-
# TODO: validation duplicated in read_csv
761-
if delim_whitespace and (delimiter is not None or sep != "\t"):
762-
raise ValueError(
763-
"Specified a delimiter with both sep and "
764-
"delim_whitespace=True; you can only specify one."
765-
)
766-
if delim_whitespace:
767-
# In this case sep is not used so we set it to the read_csv
768-
# default to avoid a ValueError
769-
sep = ","
770-
return read_csv(**locals())
681+
kwds = locals()
682+
del kwds["filepath_or_buffer"]
683+
del kwds["sep"]
684+
685+
kwds_defaults = _check_defaults_read(
686+
dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": "\t"}
687+
)
688+
kwds.update(kwds_defaults)
689+
690+
return _read(filepath_or_buffer, kwds)
771691

772692

773693
def read_fwf(
@@ -3782,3 +3702,92 @@ def _make_reader(self, f):
37823702
self.skiprows,
37833703
self.infer_nrows,
37843704
)
3705+
3706+
3707+
def _check_defaults_read(
3708+
dialect: Union[str, csv.Dialect],
3709+
delimiter: Union[str, object],
3710+
delim_whitespace: bool,
3711+
engine: str,
3712+
sep: Union[str, object],
3713+
defaults: Dict[str, Any],
3714+
):
3715+
"""Check default values of input parameters of read_csv, read_table.
3716+
3717+
Parameters
3718+
----------
3719+
dialect : str or csv.Dialect
3720+
If provided, this parameter will override values (default or not) for the
3721+
following parameters: `delimiter`, `doublequote`, `escapechar`,
3722+
`skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
3723+
override values, a ParserWarning will be issued. See csv.Dialect
3724+
documentation for more details.
3725+
delimiter : str or object
3726+
Alias for sep.
3727+
delim_whitespace : bool
3728+
Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be
3729+
used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
3730+
is set to True, nothing should be passed in for the ``delimiter``
3731+
parameter.
3732+
engine : {'c', 'python'}
3733+
Parser engine to use. The C engine is faster while the python engine is
3734+
currently more feature-complete.
3735+
sep : str or object
3736+
A delimiter provided by the user (str) or a sentinel value, i.e.
3737+
pandas._libs.lib.no_default.
3738+
defaults : dict
3739+
Default values of input parameters.
3740+
3741+
Returns
3742+
-------
3743+
kwds : dict
3744+
Input parameters with correct values.
3745+
3746+
Raises
3747+
------
3748+
ValueError : If a delimiter was specified with ``sep`` (or ``delimiter``) and
3749+
``delim_whitespace=True``.
3750+
"""
3751+
# TODO: fix the type annotations of sep and delimiter to Union[str, Any]
3752+
delim_default = defaults["delimiter"]
3753+
kwds: Dict[str, Any] = {}
3754+
# gh-23761
3755+
#
3756+
# When a dialect is passed, it overrides any of the overlapping
3757+
# parameters passed in directly. We don't want to warn if the
3758+
# default parameters were passed in (since it probably means
3759+
# that the user didn't pass them in explicitly in the first place).
3760+
#
3761+
# "delimiter" is the annoying corner case because we alias it to
3762+
# "sep" before doing comparison to the dialect values later on.
3763+
# Thus, we need a flag to indicate that we need to "override"
3764+
# the comparison to dialect values by checking if default values
3765+
# for BOTH "delimiter" and "sep" were provided.
3766+
if dialect is not None:
3767+
kwds["sep_override"] = (delimiter is None) and (
3768+
sep is lib.no_default or sep == delim_default
3769+
)
3770+
3771+
# Alias sep -> delimiter.
3772+
if delimiter is None:
3773+
delimiter = sep
3774+
3775+
if delim_whitespace and (delimiter is not lib.no_default):
3776+
raise ValueError(
3777+
"Specified a delimiter with both sep and "
3778+
"delim_whitespace=True; you can only specify one."
3779+
)
3780+
3781+
if delimiter is lib.no_default:
3782+
# assign default separator value
3783+
kwds["delimiter"] = delim_default
3784+
else:
3785+
kwds["delimiter"] = delimiter
3786+
3787+
if engine is not None:
3788+
kwds["engine_specified"] = True
3789+
else:
3790+
kwds["engine"] = "c"
3791+
kwds["engine_specified"] = False
3792+
3793+
return kwds

pandas/tests/io/parser/test_common.py

+22-2
Original file line numberDiff line numberDiff line change
@@ -2211,7 +2211,8 @@ def test_read_table_delim_whitespace_default_sep(all_parsers):
22112211
tm.assert_frame_equal(result, expected)
22122212

22132213

2214-
def test_read_table_delim_whitespace_non_default_sep(all_parsers):
2214+
@pytest.mark.parametrize("delimiter", [",", "\t"])
2215+
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
22152216
# GH: 35958
22162217
f = StringIO("a b c\n1 -2 -3\n4 5 6")
22172218
parser = all_parsers
@@ -2220,4 +2221,23 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers):
22202221
"delim_whitespace=True; you can only specify one."
22212222
)
22222223
with pytest.raises(ValueError, match=msg):
2223-
parser.read_table(f, delim_whitespace=True, sep=",")
2224+
parser.read_csv(f, delim_whitespace=True, sep=delimiter)
2225+
2226+
with pytest.raises(ValueError, match=msg):
2227+
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
2228+
2229+
2230+
@pytest.mark.parametrize("delimiter", [",", "\t"])
2231+
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
2232+
# GH: 35958
2233+
f = StringIO("a b c\n1 -2 -3\n4 5 6")
2234+
parser = all_parsers
2235+
msg = (
2236+
"Specified a delimiter with both sep and "
2237+
"delim_whitespace=True; you can only specify one."
2238+
)
2239+
with pytest.raises(ValueError, match=msg):
2240+
parser.read_table(f, delim_whitespace=True, sep=delimiter)
2241+
2242+
with pytest.raises(ValueError, match=msg):
2243+
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)

0 commit comments

Comments
 (0)