From 2fbbecaf880b7975ce99f612ca37bba066c159da Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 24 Apr 2021 23:57:49 +0200 Subject: [PATCH 1/2] BUG: raise ValueError when sep and delimiter are defined in read_csv --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/io/parsers/readers.py | 3 +++ pandas/tests/io/parser/common/test_common_basic.py | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 842b50ce53b21..b1c52afd0a478 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -797,6 +797,7 @@ I/O - Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`) - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) - Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) +- Bug in :func:`read_csv` silently ignoring ``sep`` if ``delimiter`` and ``sep`` are defined, now raising ``ValueError`` (:issue:`39823`) Period ^^^^^^ diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 796d44dc7877a..f4867bd1af5f1 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1230,6 +1230,9 @@ def _refine_defaults_read( sep is lib.no_default or sep == delim_default ) + if delimiter and (sep is not lib.no_default): + raise ValueError("Specified a sep and a delimiter; you can only specify one.") + # Alias sep -> delimiter. 
if delimiter is None: delimiter = sep diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 572bc09c96886..6fccf79238588 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -709,6 +709,15 @@ def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) +def test_read_csv_delimiter_and_sep_no_default(all_parsers): + # GH#39823 + f = StringIO("a,b\n1,2") + parser = all_parsers + msg = "Specified a sep and a delimiter; you can only specify one." + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, sep=" ", delimiter=".") + + @pytest.mark.parametrize("delimiter", [",", "\t"]) def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): # GH: 35958 From f2c1af7ba55198a0a77a1678c009779ae8a55944 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 13 May 2021 00:48:31 +0200 Subject: [PATCH 2/2] BUG: Raise ValueError if names and prefix are both defined --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/io/parsers/readers.py | 41 ++++++++++++++++--- .../io/parser/common/test_common_basic.py | 12 ++++++ 3 files changed, 48 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index c02cb911536e0..2d4ee171c1834 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -840,6 +840,7 @@ I/O - Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`) - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) - Bug in :func:`read_orc` always 
raising ``AttributeError`` (:issue:`40918`) +- Bug in :func:`read_csv` and :func:`read_table` silently ignoring ``prefix`` if ``names`` and ``prefix`` are defined, now raising ``ValueError`` (:issue:`39123`) - Bug in :func:`read_csv` and :func:`read_excel` not respecting dtype for duplicated column name when ``mangle_dupe_cols`` is set to ``True`` (:issue:`35211`) - Bug in :func:`read_csv` silently ignoring ``sep`` if ``delimiter`` and ``sep`` are defined, now raising ``ValueError`` (:issue:`39823`) - Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6874e1f00b744..159142569108c 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -20,6 +20,7 @@ import pandas._libs.lib as lib from pandas._libs.parsers import STR_NA_VALUES from pandas._typing import ( + ArrayLike, DtypeArg, FilePathOrBuffer, StorageOptions, @@ -485,11 +486,11 @@ def read_csv( delimiter=None, # Column and Index Locations and Names header="infer", - names=None, + names=lib.no_default, index_col=None, usecols=None, squeeze=False, - prefix=None, + prefix=lib.no_default, mangle_dupe_cols=True, # General Parsing Configuration dtype: Optional[DtypeArg] = None, @@ -546,7 +547,14 @@ def read_csv( del kwds["sep"] kwds_defaults = _refine_defaults_read( - dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": ","} + dialect, + delimiter, + delim_whitespace, + engine, + sep, + names, + prefix, + defaults={"delimiter": ","}, ) kwds.update(kwds_defaults) @@ -567,11 +575,11 @@ def read_table( delimiter=None, # Column and Index Locations and Names header="infer", - names=None, + names=lib.no_default, index_col=None, usecols=None, squeeze=False, - prefix=None, + prefix=lib.no_default, mangle_dupe_cols=True, # General Parsing Configuration dtype: Optional[DtypeArg] = None, @@ -627,7 +635,14 @@ def 
read_table( del kwds["sep"] kwds_defaults = _refine_defaults_read( - dialect, delimiter, delim_whitespace, engine, sep, defaults={"delimiter": "\t"} + dialect, + delimiter, + delim_whitespace, + engine, + sep, + names, + prefix, + defaults={"delimiter": "\t"}, ) kwds.update(kwds_defaults) @@ -1174,6 +1189,8 @@ def _refine_defaults_read( delim_whitespace: bool, engine: str, sep: Union[str, object], + names: Union[Optional[ArrayLike], object], + prefix: Union[Optional[str], object], defaults: Dict[str, Any], ): """Validate/refine default values of input parameters of read_csv, read_table. @@ -1199,6 +1216,12 @@ def _refine_defaults_read( sep : str or object A delimiter provided by the user (str) or a sentinel value, i.e. pandas._libs.lib.no_default. + names : array-like, optional + List of column names to use. If the file contains a header row, + then you should explicitly pass ``header=0`` to override the column names. + Duplicates in this list are not allowed. + prefix : str, optional + Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... defaults: dict Default values of input parameters. @@ -1235,6 +1258,12 @@ def _refine_defaults_read( if delimiter and (sep is not lib.no_default): raise ValueError("Specified a sep and a delimiter; you can only specify one.") + if names is not lib.no_default and prefix is not lib.no_default: + raise ValueError("Specified names and prefix; you can only specify one.") + + kwds["names"] = None if names is lib.no_default else names + kwds["prefix"] = None if prefix is lib.no_default else prefix + # Alias sep -> delimiter. 
if delimiter is None: delimiter = sep diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index d0bf3838cbef8..adafbf38439d5 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -749,6 +749,18 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): parser.read_table(f, delim_whitespace=True, delimiter=delimiter) +@pytest.mark.parametrize("func", ["read_csv", "read_table"]) +@pytest.mark.parametrize("prefix", [None, "x"]) +@pytest.mark.parametrize("names", [None, ["a"]]) +def test_names_and_prefix_not_lib_no_default(all_parsers, names, prefix, func): + # GH#39123 + f = StringIO("a,b\n1,2") + parser = all_parsers + msg = "Specified names and prefix; you can only specify one." + with pytest.raises(ValueError, match=msg): + getattr(parser, func)(f, names=names, prefix=prefix) + + def test_dict_keys_as_names(all_parsers): # GH: 36928 data = "1,2"