From dcd2cbf765418705cd29e4b722d33935e3b88bb9 Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Mon, 28 Sep 2020 18:35:05 +0200 Subject: [PATCH 1/7] Fix delim_whitespace behavior in read_table, read_csv --- pandas/io/parsers.py | 32 ++++++++++-------- pandas/tests/io/parser/test_common.py | 48 +++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 14 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dd3588faedf7a..1a74d5e9f32e7 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -542,7 +542,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): ) def read_csv( filepath_or_buffer: FilePathOrBuffer, - sep=",", + sep=lib.no_default, delimiter=None, # Column and Index Locations and Names header="infer", @@ -612,10 +612,8 @@ def read_csv( # Thus, we need a flag to indicate that we need to "override" # the comparison to dialect values by checking if default values # for BOTH "delimiter" and "sep" were provided. - default_sep = "," - if dialect is not None: - sep_override = delimiter is None and sep == default_sep + sep_override = (delimiter is None) and (sep is lib.no_default) kwds = dict(sep_override=sep_override) else: kwds = dict() @@ -624,12 +622,16 @@ def read_csv( if delimiter is None: delimiter = sep - if delim_whitespace and delimiter != default_sep: + if delim_whitespace and (delimiter is not lib.no_default): raise ValueError( "Specified a delimiter with both sep and " "delim_whitespace=True; you can only specify one." ) + if delimiter is lib.no_default: + # assign default separator value + delimiter = "," + if engine is not None: engine_specified = True else: @@ -700,7 +702,7 @@ def read_csv( ) def read_table( filepath_or_buffer: FilePathOrBuffer, - sep="\t", + sep=lib.no_default, delimiter=None, # Column and Index Locations and Names header="infer", @@ -758,15 +760,17 @@ def read_table( float_precision=None, ): # TODO: validation duplicated in read_csv - if delim_whitespace and (delimiter is not None or sep != "\t"): - raise ValueError( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) if delim_whitespace: - # In this case sep is not used so we set it to the read_csv - # default to avoid a ValueError - sep = "," + if (delimiter is not None) or (sep is not lib.no_default): + raise ValueError( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) + else: + if sep is lib.no_default: + # assign default delimeter value + sep = "\t" + return read_csv(**locals()) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 78c2f2bce5a02..5f45ac2cfa015 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2211,6 +2211,21 @@ def test_read_table_delim_whitespace_default_sep(all_parsers): tm.assert_frame_equal(result, expected) +def test_read_csv_delim_whitespace_non_default_sep(all_parsers): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, sep="\t") + + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, delimiter="\t") + + def test_read_table_delim_whitespace_non_default_sep(all_parsers): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") @@ -2221,3 +2236,36 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers): ) with pytest.raises(ValueError, match=msg): parser.read_table(f, delim_whitespace=True, sep=",") + + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, delimiter=",") + + +def test_read_csv_delim_whitespace_explicit_default_sep(all_parsers): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, sep=",") + + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, delimiter=",") + + +def test_read_table_delim_whitespace_explicit_default_sep(all_parsers): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, sep="\t") + + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, delimiter="\t") From 9019fd01263b3661e7dbe24ec8a827d52b1850f4 Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Mon, 28 Sep 2020 22:42:52 +0200 Subject: [PATCH 2/7] Fix CI errors. --- pandas/io/parsers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1a74d5e9f32e7..ee6bc09194df0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -613,7 +613,8 @@ def read_csv( # the comparison to dialect values by checking if default values # for BOTH "delimiter" and "sep" were provided. if dialect is not None: - sep_override = (delimiter is None) and (sep is lib.no_default) + sep_override = (delimiter is None) and (sep is lib.no_default + or sep == ",") kwds = dict(sep_override=sep_override) else: kwds = dict() From b75d940f42aa30917c4f60cb636ad7217af062d5 Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Tue, 29 Sep 2020 22:51:14 +0200 Subject: [PATCH 3/7] Code review fixes. --- pandas/io/parsers.py | 3 +- pandas/tests/io/parser/test_common.py | 44 +++++---------------------- 2 files changed, 9 insertions(+), 38 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index ee6bc09194df0..805fec34ea2d1 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -613,8 +613,7 @@ def read_csv( # the comparison to dialect values by checking if default values # for BOTH "delimiter" and "sep" were provided. if dialect is not None: - sep_override = (delimiter is None) and (sep is lib.no_default - or sep == ",") + sep_override = (delimiter is None) and (sep is lib.no_default or sep == ",") kwds = dict(sep_override=sep_override) else: kwds = dict() diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 5f45ac2cfa015..a6a9e5c5610f2 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2211,7 +2211,8 @@ def test_read_table_delim_whitespace_default_sep(all_parsers): tm.assert_frame_equal(result, expected) -def test_read_csv_delim_whitespace_non_default_sep(all_parsers): +@pytest.mark.parametrize("delimiter", [",", "\t"]) +def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") parser = all_parsers @@ -2220,13 +2221,14 @@ def test_read_csv_delim_whitespace_non_default_sep(all_parsers): "delim_whitespace=True; you can only specify one." ) with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, sep="\t") + parser.read_csv(f, delim_whitespace=True, sep=delimiter) with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, delimiter="\t") + parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) -def test_read_table_delim_whitespace_non_default_sep(all_parsers): +@pytest.mark.parametrize("delimiter", [",", "\t"]) +def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): # GH: 35958 f = StringIO("a b c\n1 -2 -3\n4 5 6") parser = all_parsers @@ -2235,37 +2237,7 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers): "delim_whitespace=True; you can only specify one." ) with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, sep=",") + parser.read_table(f, delim_whitespace=True, sep=delimiter) with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, delimiter=",") - - -def test_read_csv_delim_whitespace_explicit_default_sep(all_parsers): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, sep=",") - - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, delimiter=",") - - -def test_read_table_delim_whitespace_explicit_default_sep(all_parsers): - # GH: 35958 - f = StringIO("a b c\n1 -2 -3\n4 5 6") - parser = all_parsers - msg = ( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, sep="\t") - - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, delimiter="\t") + parser.read_table(f, delim_whitespace=True, delimiter=delimiter) From 960e2802931de3f75b4f31725f37878fbb7c771d Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Fri, 2 Oct 2020 14:12:04 +0200 Subject: [PATCH 4/7] Add bugfix to whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 016e8d90e7d21..66ea5a3980167 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -382,6 +382,7 @@ I/O - Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`) - Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`) - Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`) +- BUG in :func:`read_table` and :func:`read_csv` when ``delim_whitespace=True`` and ``sep=default`` (:issue:`36583`) Plotting ^^^^^^^^ From d2dd8cdaf429fbcd965f2e0b24c8972779930f77 Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Fri, 2 Oct 2020 14:12:40 +0200 Subject: [PATCH 5/7] Move parameter validation to a separate function. --- pandas/io/parsers.py | 162 ++++++++++++++++--------------------------- 1 file changed, 58 insertions(+), 104 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 805fec34ea2d1..31ac1b6dc4bc8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -600,97 +600,7 @@ def read_csv( float_precision=None, storage_options: StorageOptions = None, ): - # gh-23761 - # - # When a dialect is passed, it overrides any of the overlapping - # parameters passed in directly. We don't want to warn if the - # default parameters were passed in (since it probably means - # that the user didn't pass them in explicitly in the first place). - # - # "delimiter" is the annoying corner case because we alias it to - # "sep" before doing comparison to the dialect values later on. - # Thus, we need a flag to indicate that we need to "override" - # the comparison to dialect values by checking if default values - # for BOTH "delimiter" and "sep" were provided. - if dialect is not None: - sep_override = (delimiter is None) and (sep is lib.no_default or sep == ",") - kwds = dict(sep_override=sep_override) - else: - kwds = dict() - - # Alias sep -> delimiter. - if delimiter is None: - delimiter = sep - - if delim_whitespace and (delimiter is not lib.no_default): - raise ValueError( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - - if delimiter is lib.no_default: - # assign default separator value - delimiter = "," - - if engine is not None: - engine_specified = True - else: - engine = "c" - engine_specified = False - - kwds.update( - delimiter=delimiter, - engine=engine, - dialect=dialect, - compression=compression, - engine_specified=engine_specified, - doublequote=doublequote, - escapechar=escapechar, - quotechar=quotechar, - quoting=quoting, - skipinitialspace=skipinitialspace, - lineterminator=lineterminator, - header=header, - index_col=index_col, - names=names, - prefix=prefix, - skiprows=skiprows, - skipfooter=skipfooter, - na_values=na_values, - true_values=true_values, - false_values=false_values, - keep_default_na=keep_default_na, - thousands=thousands, - comment=comment, - decimal=decimal, - parse_dates=parse_dates, - keep_date_col=keep_date_col, - dayfirst=dayfirst, - date_parser=date_parser, - cache_dates=cache_dates, - nrows=nrows, - iterator=iterator, - chunksize=chunksize, - converters=converters, - dtype=dtype, - usecols=usecols, - verbose=verbose, - encoding=encoding, - squeeze=squeeze, - memory_map=memory_map, - float_precision=float_precision, - na_filter=na_filter, - delim_whitespace=delim_whitespace, - warn_bad_lines=warn_bad_lines, - error_bad_lines=error_bad_lines, - low_memory=low_memory, - mangle_dupe_cols=mangle_dupe_cols, - infer_datetime_format=infer_datetime_format, - skip_blank_lines=skip_blank_lines, - storage_options=storage_options, - ) - - return _read(filepath_or_buffer, kwds) + return _check_defaults_read(locals(), {"delimiter": ","}) @Appender( @@ -759,19 +669,7 @@ def read_table( memory_map=False, float_precision=None, ): - # TODO: validation duplicated in read_csv - if delim_whitespace: - if (delimiter is not None) or (sep is not lib.no_default): - raise ValueError( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." - ) - else: - if sep is lib.no_default: - # assign default delimeter value - sep = "\t" - - return read_csv(**locals()) + return _check_defaults_read(locals(), {"delimiter": "\t"}) def read_fwf( @@ -3786,3 +3684,59 @@ def _make_reader(self, f): self.skiprows, self.infer_nrows, ) + + +def _check_defaults_read(read_kwds: dict, defaults: dict): + """Check default values of input parameters, read a line file.""" + engine = read_kwds["engine"] + dialect = read_kwds["dialect"] + delimiter = read_kwds["delimiter"] + delim_whitespace = read_kwds["delim_whitespace"] + filepath_or_buffer = read_kwds["filepath_or_buffer"] + sep = read_kwds["sep"] + + del read_kwds["filepath_or_buffer"] + del read_kwds["sep"] + + delimiter_default = defaults["delimiter"] + + # gh-23761 + # + # When a dialect is passed, it overrides any of the overlapping + # parameters passed in directly. We don't want to warn if the + # default parameters were passed in (since it probably means + # that the user didn't pass them in explicitly in the first place). + # + # "delimiter" is the annoying corner case because we alias it to + # "sep" before doing comparison to the dialect values later on. + # Thus, we need a flag to indicate that we need to "override" + # the comparison to dialect values by checking if default values + # for BOTH "delimiter" and "sep" were provided. + if dialect is not None: + read_kwds["sep_override"] = (delimiter is None) and ( + sep is lib.no_default or sep == delimiter_default + ) + + # Alias sep -> delimiter. + if delimiter is None: + delimiter = sep + + if delim_whitespace and (delimiter is not lib.no_default): + raise ValueError( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) + + if delimiter is lib.no_default: + # assign default separator value + read_kwds["delimiter"] = delimiter_default + else: + read_kwds["delimiter"] = delimiter + + if engine is not None: + read_kwds["engine_specified"] = True + else: + read_kwds["engine"] = "c" + read_kwds["engine_specified"] = False + + return _read(filepath_or_buffer, read_kwds) From 2912dc4ee8a5a4589185d1cbac782257fa40d1d4 Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Sat, 3 Oct 2020 12:58:40 +0200 Subject: [PATCH 6/7] Code review changes to _check_defaults_read. --- pandas/io/parsers.py | 91 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 70 insertions(+), 21 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 31ac1b6dc4bc8..595b27111e3dc 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -600,7 +600,16 @@ def read_csv( float_precision=None, storage_options: StorageOptions = None, ): - return _check_defaults_read(locals(), {"delimiter": ","}) + kwds = locals() + del kwds["filepath_or_buffer"] + del kwds["sep"] + + kwds_defaults = _check_defaults_read( + dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": ","} + ) + kwds.update(kwds_defaults) + + return _read(filepath_or_buffer, kwds) @Appender( @@ -669,7 +678,16 @@ def read_table( memory_map=False, float_precision=None, ): - return _check_defaults_read(locals(), {"delimiter": "\t"}) + kwds = locals() + del kwds["filepath_or_buffer"] + del kwds["sep"] + + kwds_defaults = _check_defaults_read( + dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": "\t"} + ) + kwds.update(kwds_defaults) + + return _read(filepath_or_buffer, kwds) def read_fwf( @@ -3686,20 +3704,51 @@ def _make_reader(self, f): ) -def _check_defaults_read(read_kwds: dict, defaults: dict): - """Check default values of input parameters, read a line file.""" - engine = read_kwds["engine"] - dialect = read_kwds["dialect"] - delimiter = read_kwds["delimiter"] - delim_whitespace = read_kwds["delim_whitespace"] - filepath_or_buffer = read_kwds["filepath_or_buffer"] - sep = read_kwds["sep"] +def _check_defaults_read( + dialect: Union[str, csv.Dialect], + delimiter: str, + delim_whitespace: bool, + engine: str, + sep: str, + defaults: Dict[str, Any], +): + """Check default values of input parameters of read_csv, read_table. - del read_kwds["filepath_or_buffer"] - del read_kwds["sep"] + Parameters + ---------- + dialect : str or csv.Dialect + If provided, this parameter will override values (default or not) for the + following parameters: `delimiter`, `doublequote`, `escapechar`, + `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + override values, a ParserWarning will be issued. See csv.Dialect + documentation for more details. + delimiter : str + Alias for sep. + delim_whitespace : bool + Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be + used as the sep. Equivalent to setting ``sep='\\s+'``. If this option + is set to True, nothing should be passed in for the ``delimiter`` + parameter. + engine : {{'c', 'python'}} + Parser engine to use. The C engine is faster while the python engine is + currently more feature-complete. + sep : str + Delimiter to use. + defaults: Dict[str, Any] + Default values of input parameters. - delimiter_default = defaults["delimiter"] + Returns + ------- + kwds : dict + Input parameters with correct values. + Raises + ------ + ValueError : If a delimiter was specified with ``sep`` (or ``delimiter``) and + ``delim_whitespace=True``. + """ + delim_default = defaults["delimiter"] + kwds = {} # gh-23761 # # When a dialect is passed, it overrides any of the overlapping @@ -3713,8 +3762,8 @@ def _check_defaults_read(read_kwds: dict, defaults: dict): # the comparison to dialect values by checking if default values # for BOTH "delimiter" and "sep" were provided. if dialect is not None: - read_kwds["sep_override"] = (delimiter is None) and ( - sep is lib.no_default or sep == delimiter_default + kwds["sep_override"] = (delimiter is None) and ( + sep is lib.no_default or sep == delim_default ) # Alias sep -> delimiter. @@ -3729,14 +3778,14 @@ def _check_defaults_read(read_kwds: dict, defaults: dict): if delimiter is lib.no_default: # assign default separator value - read_kwds["delimiter"] = delimiter_default + kwds["delimiter"] = delim_default else: - read_kwds["delimiter"] = delimiter + kwds["delimiter"] = delimiter if engine is not None: - read_kwds["engine_specified"] = True + kwds["engine_specified"] = True else: - read_kwds["engine"] = "c" - read_kwds["engine_specified"] = False + kwds["engine"] = "c" + kwds["engine_specified"] = False - return _read(filepath_or_buffer, read_kwds) + return kwds From e98b26d6eec34f78e6f7ed77a7ad5335cdf8d318 Mon Sep 17 00:00:00 2001 From: Oleh Kozynets Date: Sat, 3 Oct 2020 18:54:25 +0200 Subject: [PATCH 7/7] Fix typing errors. --- pandas/io/parsers.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 595b27111e3dc..ede4fdc5e1d8b 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3706,10 +3706,10 @@ def _make_reader(self, f): def _check_defaults_read( dialect: Union[str, csv.Dialect], - delimiter: str, + delimiter: Union[str, object], delim_whitespace: bool, engine: str, - sep: str, + sep: Union[str, object], defaults: Dict[str, Any], ): """Check default values of input parameters of read_csv, read_table. @@ -3722,7 +3722,7 @@ def _check_defaults_read( `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to override values, a ParserWarning will be issued. See csv.Dialect documentation for more details. - delimiter : str + delimiter : str or object Alias for sep. delim_whitespace : bool Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be @@ -3732,9 +3732,10 @@ def _check_defaults_read( engine : {{'c', 'python'}} Parser engine to use. The C engine is faster while the python engine is currently more feature-complete. - sep : str - Delimiter to use. - defaults: Dict[str, Any] + sep : str or object + A delimiter provided by the user (str) or a sentinel value, i.e. + pandas._libs.lib.no_default. + defaults: dict Default values of input parameters. Returns @@ -3747,8 +3748,9 @@ def _check_defaults_read( ValueError : If a delimiter was specified with ``sep`` (or ``delimiter``) and ``delim_whitespace=True``. """ + # fix types for sep, delimiter to Union(str, Any) delim_default = defaults["delimiter"] - kwds = {} + kwds: Dict[str, Any] = {} # gh-23761 # # When a dialect is passed, it overrides any of the overlapping