From b0e102a5e1406e023329c31190771e40c6a13156 Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Tue, 22 Aug 2017 14:06:18 +0100 Subject: [PATCH 01/10] ENH: better error message if usecols doesn't match GH17301: Improving the error message given when usecols doesn't match with the columns provided. Signed-off-by: Aaron Critchley --- pandas/io/parsers.py | 12 ++++++++++-- pandas/tests/io/parser/usecols.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a9821be3fa5e2..4187842a7c03b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1662,14 +1662,22 @@ def __init__(self, src, **kwds): # GH 14671 if (self.usecols_dtype == 'string' and not set(usecols).issubset(self.orig_names)): - raise ValueError("Usecols do not match names.") + missing = [c for c in usecols if c not in self.orig_names] + raise ValueError( + "Usecols do not match columns, " + "columns expected but not found: %s" % missing + ) if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) if (i in usecols or n in usecols)] if len(self.names) < len(usecols): - raise ValueError("Usecols do not match names.") + missing = [c for c in usecols if c not in self.orig_names] + raise ValueError( + "Usecols do not match columns, " + "columns expected but not found: %s" % missing + ) self._set_noconvert_columns() diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index f582e5037ca07..4fe74aa700385 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -481,7 +481,7 @@ def test_raise_on_usecols_names_mismatch(self): data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' if self.engine == 'c': - msg = 'Usecols do not match names' + msg = "do not match columns, columns expected but not found" else: msg = 'is not in list' From 15d47863503ff91e581c6948091a5b861375ad94 Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Wed, 4 Oct 2017 20:07:43 +0100 Subject: [PATCH 02/10] WIP: Changing Python parser behaviour, better tests --- pandas/io/parsers.py | 14 +++++++++++--- pandas/tests/io/parser/usecols.py | 16 ++++++++-------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4187842a7c03b..e9b0aad81083b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1665,7 +1665,7 @@ def __init__(self, src, **kwds): missing = [c for c in usecols if c not in self.orig_names] raise ValueError( "Usecols do not match columns, " - "columns expected but not found: %s" % missing + "columns expected but not found: {}".format(missing) ) if len(self.names) > len(usecols): @@ -1673,10 +1673,10 @@ def __init__(self, src, **kwds): if (i in usecols or n in usecols)] if len(self.names) < len(usecols): - missing = [c for c in usecols if c not in self.orig_names] + missing = [c for c in usecols if c not in self.names] raise ValueError( "Usecols do not match columns, " - "columns expected but not found: %s" % missing + "columns expected but not found: {}".format(missing) ) self._set_noconvert_columns() @@ -2450,6 +2450,14 @@ def _handle_usecols(self, columns, usecols_key): raise ValueError("If using multiple headers, usecols must " "be integers.") col_indices = [] + + missing = [c for c in self.usecols if c not in usecols_key] + if len(missing) > 0: + raise ValueError( + "Usecols do not match columns, " + "columns expected but not found: {}".format(missing) + ) + for col in self.usecols: if isinstance(col, string_types): col_indices.append(usecols_key.index(col)) diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 4fe74aa700385..c64e1c4aa8821 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -480,10 +480,10 @@ def test_raise_on_usecols_names_mismatch(self): # GH 14671 data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' - if self.engine == 'c': - msg = "do not match columns, columns expected but not found" - else: - msg = 'is not in list' + msg = ( + "Usecols do not match columns, " + "columns expected but not found: {}" + ) usecols = ['a', 'b', 'c', 'd'] df = self.read_csv(StringIO(data), usecols=usecols) @@ -492,11 +492,11 @@ def test_raise_on_usecols_names_mismatch(self): tm.assert_frame_equal(df, expected) usecols = ['a', 'b', 'c', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, msg.format(['f'])): self.read_csv(StringIO(data), usecols=usecols) usecols = ['a', 'b', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, msg.format(['f'])): self.read_csv(StringIO(data), usecols=usecols) names = ['A', 'B', 'C', 'D'] @@ -520,9 +520,9 @@ def test_raise_on_usecols_names_mismatch(self): # tm.assert_frame_equal(df, expected) usecols = ['A', 'B', 'C', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_ra(ValueError, msg.format(['f'])): self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) usecols = ['A', 'B', 'f'] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, msg.format(['f'])): self.read_csv(StringIO(data), names=names, usecols=usecols) From 841a6cc5d37300387c43e9649cc7669fbe990ec1 Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Wed, 4 Oct 2017 20:38:41 +0100 Subject: [PATCH 03/10] WIP: Implementing suggested changes --- pandas/io/parsers.py | 38 ++++++++++++++++--------------- pandas/tests/io/parser/usecols.py | 10 ++++---- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e9b0aad81083b..90a04e18cd65a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1065,6 +1065,20 @@ def _evaluate_usecols(usecols, names): return usecols +def _validate_usecols(usecols, names): + """ + Validates that all usecols are present in a given + list of names, if not, raise a ValueError that + shows what usecols are missing. + """ + missing = [c for c in usecols if c not in names] + if len(missing) > 0: + raise ValueError( + "Usecols do not match columns, " + "columns expected but not found: {missing}".format(missing=missing) + ) + + def _validate_skipfooter_arg(skipfooter): """ Validate the 'skipfooter' parameter. @@ -1662,22 +1676,14 @@ def __init__(self, src, **kwds): # GH 14671 if (self.usecols_dtype == 'string' and not set(usecols).issubset(self.orig_names)): - missing = [c for c in usecols if c not in self.orig_names] - raise ValueError( - "Usecols do not match columns, " - "columns expected but not found: {}".format(missing) - ) + _validate_usecols(usecols, self.orig_names) if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) if (i in usecols or n in usecols)] if len(self.names) < len(usecols): - missing = [c for c in usecols if c not in self.names] - raise ValueError( - "Usecols do not match columns, " - "columns expected but not found: {}".format(missing) - ) + _validate_usecols(usecols, self.names) self._set_noconvert_columns() @@ -2451,16 +2457,12 @@ def _handle_usecols(self, columns, usecols_key): "be integers.") col_indices = [] - missing = [c for c in self.usecols if c not in usecols_key] - if len(missing) > 0: - raise ValueError( - "Usecols do not match columns, " - "columns expected but not found: {}".format(missing) - ) - for col in self.usecols: if isinstance(col, string_types): - col_indices.append(usecols_key.index(col)) + try: + col_indices.append(usecols_key.index(col)) + except ValueError: + _validate_usecols(self.usecols, usecols_key) else: col_indices.append(col) else: diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index c64e1c4aa8821..88f268b11b6fd 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -482,7 +482,7 @@ def test_raise_on_usecols_names_mismatch(self): msg = ( "Usecols do not match columns, " - "columns expected but not found: {}" + "columns expected but not found: {missing}" ) usecols = ['a', 'b', 'c', 'd'] @@ -492,11 +492,11 @@ def test_raise_on_usecols_names_mismatch(self): tm.assert_frame_equal(df, expected) usecols = ['a', 'b', 'c', 'f'] - with tm.assert_raises_regex(ValueError, msg.format(['f'])): + with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])): self.read_csv(StringIO(data), usecols=usecols) usecols = ['a', 'b', 'f'] - with tm.assert_raises_regex(ValueError, msg.format(['f'])): + with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])): self.read_csv(StringIO(data), usecols=usecols) names = ['A', 'B', 'C', 'D'] @@ -520,9 +520,9 @@ def test_raise_on_usecols_names_mismatch(self): # tm.assert_frame_equal(df, expected) usecols = ['A', 'B', 'C', 'f'] - with tm.assert_ra(ValueError, msg.format(['f'])): + with tm.assert_ra(ValueError, msg.format(missing=['f'])): self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) usecols = ['A', 'B', 'f'] - with tm.assert_raises_regex(ValueError, msg.format(['f'])): + with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])): self.read_csv(StringIO(data), names=names, usecols=usecols) From dced1b7a19a24c53aae3ff51c0a80d0003e6dee9 Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Wed, 4 Oct 2017 20:57:12 +0100 Subject: [PATCH 04/10] Fixing silly error in tests --- pandas/tests/io/parser/usecols.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 88f268b11b6fd..210d40fd5c805 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -492,11 +492,11 @@ def test_raise_on_usecols_names_mismatch(self): tm.assert_frame_equal(df, expected) usecols = ['a', 'b', 'c', 'f'] - with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), usecols=usecols) usecols = ['a', 'b', 'f'] - with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), usecols=usecols) names = ['A', 'B', 'C', 'D'] @@ -520,9 +520,9 @@ def test_raise_on_usecols_names_mismatch(self): # tm.assert_frame_equal(df, expected) usecols = ['A', 'B', 'C', 'f'] - with tm.assert_ra(ValueError, msg.format(missing=['f'])): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) usecols = ['A', 'B', 'f'] - with tm.assert_raises_regex(ValueError, msg.format(missing=['f'])): + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), names=names, usecols=usecols) From 5bf89a88d0166ad0ec59e93ac180e23bb24b92c7 Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Thu, 5 Oct 2017 00:42:16 +0100 Subject: [PATCH 05/10] Better documentation as per @gfyoung --- pandas/io/parsers.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 90a04e18cd65a..491d5568b3412 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1068,8 +1068,24 @@ def _evaluate_usecols(usecols, names): def _validate_usecols(usecols, names): """ Validates that all usecols are present in a given - list of names, if not, raise a ValueError that + list of names. If not, raise a ValueError that shows what usecols are missing. + + Parameters + ---------- + usecols : iterable of usecols + The columns to validate are present in names. + names : iterable of names + The column names to check against. + + Returns + ------- + usecols : iterable of usecols + The `usecols` parameter if the validation succeeds. + + Raises + ------ + ValueError : Detailing which usecols are missing, if any. """ missing = [c for c in usecols if c not in names] if len(missing) > 0: @@ -1078,6 +1094,8 @@ def _validate_usecols(usecols, names): "columns expected but not found: {missing}".format(missing=missing) ) + return usecols + def _validate_skipfooter_arg(skipfooter): """ From ba93833774fb24a049176d7e7ec93cfafb647b4a Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Thu, 5 Oct 2017 01:05:01 +0100 Subject: [PATCH 06/10] Changing ValueError documentation --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 491d5568b3412..45385b70d8e1b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1085,7 +1085,7 @@ def _validate_usecols(usecols, names): Raises ------ - ValueError : Detailing which usecols are missing, if any. + ValueError : Columns were missing. Error message will list them. """ missing = [c for c in usecols if c not in names] if len(missing) > 0: From 8a06cee2401223b286ddcae5697f772a550b848f Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Wed, 8 Nov 2017 23:14:00 +0000 Subject: [PATCH 07/10] Reverting, changing name of function --- pandas/io/parsers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 45385b70d8e1b..fafacd77947e3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1065,7 +1065,7 @@ def _evaluate_usecols(usecols, names): return usecols -def _validate_usecols(usecols, names): +def _validate_usecols_names(usecols, names): """ Validates that all usecols are present in a given list of names. If not, raise a ValueError that @@ -1694,14 +1694,14 @@ def __init__(self, src, **kwds): # GH 14671 if (self.usecols_dtype == 'string' and not set(usecols).issubset(self.orig_names)): - _validate_usecols(usecols, self.orig_names) + _validate_usecols_names(usecols, self.orig_names) if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) if (i in usecols or n in usecols)] if len(self.names) < len(usecols): - _validate_usecols(usecols, self.names) + _validate_usecols_names(usecols, self.names) self._set_noconvert_columns() @@ -2480,7 +2480,7 @@ def _handle_usecols(self, columns, usecols_key): try: col_indices.append(usecols_key.index(col)) except ValueError: - _validate_usecols(self.usecols, usecols_key) + _validate_usecols_names(self.usecols, usecols_key) else: col_indices.append(col) else: From 2209eae98824a354ec81a900c278b05d120a685b Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Sat, 2 Dec 2017 15:54:20 +0000 Subject: [PATCH 08/10] Adding whatsnews entry, and test for multiple missing columns --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/tests/io/parser/usecols.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 1a08a1353a605..64f2701896a94 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -76,6 +76,7 @@ Other Enhancements - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`) +- Improved wording of ``ValueError`` raised when creating a DataFrame and the ``usecols`` argument cannot match all columns. (:issue:`17301`) .. _whatsnew_0220.api_breaking: diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 210d40fd5c805..4126e2eb93f28 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -495,8 +495,9 @@ def test_raise_on_usecols_names_mismatch(self): with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), usecols=usecols) - usecols = ['a', 'b', 'f'] - with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): + usecols = ['a', 'b', 'f', 'g'] + with tm.assert_raises_regex( + ValueError, msg.format(missing="\[('f', 'g'|'g', 'f')\]")): self.read_csv(StringIO(data), usecols=usecols) names = ['A', 'B', 'C', 'D'] From 93185f5f80842c4287b74e37d0e4f41e3f093901 Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Sat, 2 Dec 2017 15:58:07 +0000 Subject: [PATCH 09/10] Adding accidentally removed test back --- pandas/tests/io/parser/usecols.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 4126e2eb93f28..0fa53e6288bda 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -495,6 +495,10 @@ def test_raise_on_usecols_names_mismatch(self): with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): self.read_csv(StringIO(data), usecols=usecols) + usecols = ['a', 'b', 'f'] + with tm.assert_raises_regex(ValueError, msg.format(missing="\['f'\]")): + self.read_csv(StringIO(data), usecols=usecols) + usecols = ['a', 'b', 'f', 'g'] with tm.assert_raises_regex( ValueError, msg.format(missing="\[('f', 'g'|'g', 'f')\]")): From 5dfccdb6e617ec6596a1a9c34ff4e5d5b4e091cf Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Sat, 2 Dec 2017 16:00:35 +0000 Subject: [PATCH 10/10] Suggested whatsnew change --- doc/source/whatsnew/v0.22.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 64f2701896a94..1cad0e68d0f25 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -76,7 +76,7 @@ Other Enhancements - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`) -- Improved wording of ``ValueError`` raised when creating a DataFrame and the ``usecols`` argument cannot match all columns. (:issue:`17301`) +- Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`) .. _whatsnew_0220.api_breaking: