From 30d08f1c6a3810bc9340ce91da026095e729f38e Mon Sep 17 00:00:00 2001 From: Mohammad Jafar Mashhadi Date: Tue, 23 Jun 2020 10:24:21 -0600 Subject: [PATCH 1/5] ENH: GH34946 Check type of names argument to `read_csv`, `read_table`, `read_fwf` to not be a `set`. --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/parsers.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 60aa1759958f6..25f2b30b70b67 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1121,6 +1121,7 @@ Other - :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`) - Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`) - Bug in :class:`Tick` multiplication raising ``TypeError`` when multiplying by a float (:issue:`34486`) +- Passing a `set` as `names` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should have consistent ordering. Consider a list instead.`` (:issue:`34946`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 679cf4c2d8929..c52a33b90acce 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -397,7 +397,8 @@ def _validate_integer(name, val, min_val=0): def _validate_names(names): """ - Raise ValueError if the `names` parameter contains duplicates. + Raise ValueError if the `names` parameter contains duplicates or has an + invalid data type. Parameters ---------- @@ -407,11 +408,15 @@ def _validate_names(names): Raises ------ ValueError - If names are not unique. + If names are not unique or have incosistent ordering (e.g. set). 
""" if names is not None: if len(names) != len(set(names)): raise ValueError("Duplicate names are not allowed.") + if not is_list_like(names, allow_sets=False): + raise ValueError( + "Names should have consistent ordering. Consider a list instead." + ) def _read(filepath_or_buffer: FilePathOrBuffer, kwds): From b0f1bcd5a082bfb466c6a880c91f5cb6f2ad9519 Mon Sep 17 00:00:00 2001 From: Mohammad Jafar Mashhadi Date: Tue, 23 Jun 2020 12:11:16 -0600 Subject: [PATCH 2/5] Added a test case --- pandas/tests/io/parser/test_common.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e38fcf1380220..904f9b79af3cf 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2135,3 +2135,22 @@ def test_no_header_two_extra_columns(all_parsers): parser = all_parsers df = parser.read_csv(stream, header=None, names=column_names, index_col=False) tm.assert_frame_equal(df, ref) + + +def test_read_csv_names_types(all_parsers): + # GH 34946 + data = """\ + 1,2,3 + 4,5,6 + 7,8,9 + 10,11,12\n""" + parser = all_parsers + msg = "Names should have consistent ordering. Consider a list instead." 
+ names = "QAZ" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=set(names)) + + ref = DataFrame(data={"Q": [1, 4, 7, 10], "A": [2, 5, 8, 11], "Z": [3, 6, 9, 12]}) + for valid_type_converter in (list, tuple): + df = parser.read_csv(StringIO(data), names=valid_type_converter(names)) + tm.assert_frame_equal(df, ref) From 3e81b760997b2d27e7abb743bc39fd40904aef7a Mon Sep 17 00:00:00 2001 From: Mohammad Jafar Mashhadi Date: Tue, 23 Jun 2020 22:09:25 -0600 Subject: [PATCH 3/5] Update pandas/io/parsers.py Co-authored-by: gfyoung --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c52a33b90acce..4cfd3abfc6cb5 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -408,7 +408,7 @@ def _validate_names(names): Raises ------ ValueError - If names are not unique or have incosistent ordering (e.g. set). + If names are not unique or have inconsistent ordering (e.g. set). """ if names is not None: if len(names) != len(set(names)): From 66586e628f6a9ff2d638cd3d7c09490f6fa14e9b Mon Sep 17 00:00:00 2001 From: Mohammad Jafar Mashhadi Date: Tue, 23 Jun 2020 22:26:37 -0600 Subject: [PATCH 4/5] Updated the docs and the error message to be more idiomatic. 
--- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/io/parsers.py | 6 ++---- pandas/tests/io/parser/test_common.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 25f2b30b70b67..9d151c78b2048 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1121,7 +1121,7 @@ Other - :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`) - Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`) - Bug in :class:`Tick` multiplication raising ``TypeError`` when multiplying by a float (:issue:`34486`) -- Passing a `set` as `names` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should have consistent ordering. Consider a list instead.`` (:issue:`34946`) +- Passing a ``set`` as the ``names`` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should be an ordered collection.`` (:issue:`34946`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4cfd3abfc6cb5..62347f7110d76 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -408,15 +408,13 @@ def _validate_names(names): Raises ------ ValueError - If names are not unique or have inconsistent ordering (e.g. set). + If names are not unique or are not ordered (e.g. set). """ if names is not None: if len(names) != len(set(names)): raise ValueError("Duplicate names are not allowed.") if not is_list_like(names, allow_sets=False): - raise ValueError( - "Names should have consistent ordering. Consider a list instead." 
- ) + raise ValueError("Names should be an ordered collection.") def _read(filepath_or_buffer: FilePathOrBuffer, kwds): diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 904f9b79af3cf..41599546a455b 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2145,7 +2145,7 @@ def test_read_csv_names_types(all_parsers): 7,8,9 10,11,12\n""" parser = all_parsers - msg = "Names should have consistent ordering. Consider a list instead." + msg = "Names should be an ordered collection." names = "QAZ" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), names=set(names)) From 0ff2e3ea70f35de4df120ac5c100c3eac280d484 Mon Sep 17 00:00:00 2001 From: Mohammad Jafar Mashhadi Date: Tue, 23 Jun 2020 22:33:05 -0600 Subject: [PATCH 5/5] Removed an unnecessary part of the test. --- pandas/tests/io/parser/test_common.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 41599546a455b..e6e868689b060 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2137,20 +2137,11 @@ def test_no_header_two_extra_columns(all_parsers): tm.assert_frame_equal(df, ref) -def test_read_csv_names_types(all_parsers): +def test_read_csv_names_not_accepting_sets(all_parsers): # GH 34946 data = """\ 1,2,3 - 4,5,6 - 7,8,9 - 10,11,12\n""" + 4,5,6\n""" parser = all_parsers - msg = "Names should be an ordered collection." 
- names = "QAZ" - with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), names=set(names)) - - ref = DataFrame(data={"Q": [1, 4, 7, 10], "A": [2, 5, 8, 11], "Z": [3, 6, 9, 12]}) - for valid_type_converter in (list, tuple): - df = parser.read_csv(StringIO(data), names=valid_type_converter(names)) - tm.assert_frame_equal(df, ref) + with pytest.raises(ValueError, match="Names should be an ordered collection."): + parser.read_csv(StringIO(data), names=set("QAZ"))