From b232fa708372c00c8522a7c29905eb1352901918 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 30 Mar 2018 16:58:00 +0100 Subject: [PATCH 01/10] pass usecols without converting to set at parser.pyx --- pandas/_libs/parsers.pyx | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 52ca3d1226f79..0dfc59586eaef 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -378,9 +378,7 @@ cdef class TextReader: self.compression = compression self.memory_map = memory_map - self.parser.usecols = (usecols is not None) - self._setup_parser_source(source) parser_set_default_options(self.parser) @@ -445,10 +443,8 @@ cdef class TextReader: # suboptimal if usecols is not None: self.has_usecols = 1 - if callable(usecols): - self.usecols = usecols - else: - self.usecols = set(usecols) + # GH20529, validate usecols at higher level. + self.usecols = usecols # XXX if skipfooter > 0: From 1e23cb534e0f84a38bbe4c979fc238930eecd69a Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 30 Mar 2018 18:21:06 +0100 Subject: [PATCH 02/10] validate if it's a list-like container and not string --- pandas/io/parsers.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 469cd6d82e4b4..df915637541cd 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -410,6 +410,7 @@ def _validate_names(names): return names + def _read(filepath_or_buffer, kwds): """Generic reader of line files.""" encoding = kwds.get('encoding', None) @@ -1192,17 +1193,19 @@ def _validate_usecols_arg(usecols): 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like is passed in or None if a callable or None is passed in. """ - msg = ("'usecols' must either be all strings, all unicode, " + msg = ("'usecols' must either be list-like of all strings, all unicode, " "all integers or a callable") - if usecols is not None: if callable(usecols): return usecols, None - usecols_dtype = lib.infer_dtype(usecols) - if usecols_dtype not in ('empty', 'integer', - 'string', 'unicode'): + # GH20529, ensure is iterable container but not string. + elif not is_list_like(usecols): raise ValueError(msg) - + else: + usecols_dtype = lib.infer_dtype(usecols) + if usecols_dtype not in ('empty', 'integer', + 'string', 'unicode'): + raise ValueError(msg) return set(usecols), usecols_dtype return usecols, None From b74422bddc3764e30dd1a7d0725ff162f9216391 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 30 Mar 2018 21:21:51 +0100 Subject: [PATCH 03/10] refactor msg assert --- pandas/tests/io/parser/usecols.py | 46 +++++++++++++++---------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 195fb4cba2aed..56a9b53cc9cf5 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -16,6 +16,11 @@ class UsecolsTests(object): + msg_validate_usecols_arg = ("'usecols' must either be list-like of all " + "strings, all unicode, all integers or a " + "callable.") + msg_validate_usecols_names = ("Usecols do not match columns, columns " + "expected but not found: {0}") def test_raise_on_mixed_dtype_usecols(self): # See gh-12678 @@ -24,11 +29,9 @@ def test_raise_on_mixed_dtype_usecols(self): 4000,5000,6000 """ - msg = ("'usecols' must either be all strings, all unicode, " - "all integers or a callable") usecols = [0, 'b', 2] - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg): self.read_csv(StringIO(data), usecols=usecols) def test_usecols(self): @@ -348,13 +351,10 @@ def test_usecols_with_mixed_encoding_strings(self): 3.568935038,7,False,a ''' - msg = ("'usecols' must either be all strings, all unicode, " - "all integers or a callable") - - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg): self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB']) - with tm.assert_raises_regex(ValueError, msg): + with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg): self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB']) def test_usecols_with_multibyte_characters(self): @@ -480,11 +480,6 @@ def test_raise_on_usecols_names_mismatch(self): # GH 14671 data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' - msg = ( - "Usecols do not match columns, " - "columns expected but not found: {missing}" - ) - usecols = ['a', 'b', 'c', 'd'] df = self.read_csv(StringIO(data), usecols=usecols) expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7], @@ -492,18 +487,21 @@ def test_raise_on_usecols_names_mismatch(self): tm.assert_frame_equal(df, expected) usecols = ['a', 'b', 'c', 'f'] - with tm.assert_raises_regex( - ValueError, msg.format(missing=r"\['f'\]")): + with tm.assert_raises_regex(ValueError, + self.msg_validate_usecols_names.format( + r"\['f'\]")): self.read_csv(StringIO(data), usecols=usecols) usecols = ['a', 'b', 'f'] - with tm.assert_raises_regex( - ValueError, msg.format(missing=r"\['f'\]")): + with tm.assert_raises_regex(ValueError, + self.msg_validate_usecols_names.format( + r"\['f'\]")): self.read_csv(StringIO(data), usecols=usecols) usecols = ['a', 'b', 'f', 'g'] - with tm.assert_raises_regex( - ValueError, msg.format(missing=r"\[('f', 'g'|'g', 'f')\]")): + with tm.assert_raises_regex(ValueError, + self.msg_validate_usecols_names.format( + r"\[('f', 'g'|'g', 'f')\]")): self.read_csv(StringIO(data), usecols=usecols) names = ['A', 'B', 'C', 'D'] @@ -527,11 +525,13 @@ def test_raise_on_usecols_names_mismatch(self): # tm.assert_frame_equal(df, expected) usecols = ['A', 'B', 'C', 'f'] - with tm.assert_raises_regex( - ValueError, msg.format(missing=r"\['f'\]")): + with tm.assert_raises_regex(ValueError, + self.msg_validate_usecols_names.format( + r"\['f'\]")): self.read_csv(StringIO(data), header=0, names=names, usecols=usecols) usecols = ['A', 'B', 'f'] - with tm.assert_raises_regex( - ValueError, msg.format(missing=r"\['f'\]")): + with tm.assert_raises_regex(ValueError, + self.msg_validate_usecols_names.format( + r"\['f'\]")): self.read_csv(StringIO(data), names=names, usecols=usecols) From 14b0ac0c67874eb664434f53512f33fcd79a0492 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 30 Mar 2018 21:39:01 +0100 Subject: [PATCH 04/10] validate usecol --- pandas/io/parsers.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index df915637541cd..14d85f2cbd767 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -410,7 +410,6 @@ def _validate_names(names): return names - def _read(filepath_or_buffer, kwds): """Generic reader of line files.""" encoding = kwds.get('encoding', None) @@ -1194,7 +1193,7 @@ def _validate_usecols_arg(usecols): is passed in or None if a callable or None is passed in. """ msg = ("'usecols' must either be list-like of all strings, all unicode, " - "all integers or a callable") + "all integers or a callable.") if usecols is not None: if callable(usecols): return usecols, None @@ -1685,7 +1684,6 @@ class CParserWrapper(ParserBase): def __init__(self, src, **kwds): self.kwds = kwds kwds = kwds.copy() - ParserBase.__init__(self, kwds) if (kwds.get('compression') is None @@ -1700,11 +1698,12 @@ def __init__(self, src, **kwds): # #2442 kwds['allow_leading_cols'] = self.index_col is not False - self._reader = parsers.TextReader(src, **kwds) - - # XXX + # GH20529, validate usecol arg before TextReader self.usecols, self.usecols_dtype = _validate_usecols_arg( - self._reader.usecols) + kwds['usecols']) + kwds['usecols'] = self.usecols + + self._reader = parsers.TextReader(src, **kwds) passed_names = self.names is None From 23f3e8e124ffe948d5c5054b631e00267ae203dd Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 30 Mar 2018 21:44:02 +0100 Subject: [PATCH 05/10] add back space --- pandas/_libs/parsers.pyx | 2 ++ pandas/io/parsers.py | 1 + 2 files changed, 3 insertions(+) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 0dfc59586eaef..d4b95e4c7bccd 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -378,7 +378,9 @@ cdef class TextReader: self.compression = compression self.memory_map = memory_map + self.parser.usecols = (usecols is not None) + self._setup_parser_source(source) parser_set_default_options(self.parser) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 14d85f2cbd767..73e14ce4ae4c7 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1684,6 +1684,7 @@ class CParserWrapper(ParserBase): def __init__(self, src, **kwds): self.kwds = kwds kwds = kwds.copy() + ParserBase.__init__(self, kwds) if (kwds.get('compression') is None From 2de72b17f408729f5c8e548efcf83d3db0c057ed Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 30 Mar 2018 21:47:51 +0100 Subject: [PATCH 06/10] update parameter description to list-like --- pandas/io/parsers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 73e14ce4ae4c7..780aa5d02f598 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -97,11 +97,11 @@ MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider index_col=False to force pandas to _not_ use the first column as the index (row names) -usecols : array-like or callable, default None - Return a subset of the columns. If array-like, all elements must either +usecols : list-like or callable, default None + Return a subset of the columns. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or - inferred from the document header row(s). For example, a valid array-like + inferred from the document header row(s). For example, a valid list-like `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To instantiate a DataFrame from ``data`` with element order preserved use @@ -1177,7 +1177,7 @@ def _validate_usecols_arg(usecols): Parameters ---------- - usecols : array-like, callable, or None + usecols : list-like, callable, or None List of columns to use when parsing or a callable that can be used to filter a list of table columns. From 8362885b5e2c4999035352064008484c47b117e4 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 30 Mar 2018 22:11:10 +0100 Subject: [PATCH 07/10] expand comment in parser.pyx --- pandas/_libs/parsers.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index d4b95e4c7bccd..a24e2cdd99f6f 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -445,7 +445,8 @@ cdef class TextReader: # suboptimal if usecols is not None: self.has_usecols = 1 - # GH20529, validate usecols at higher level. + # GH-20558, validate usecols at higher level and only pass clean + # usecols into TextReader. self.usecols = usecols # XXX From b177e93de0db981ba568b6fb32c1bc5c103cd6c8 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 30 Mar 2018 22:36:39 +0100 Subject: [PATCH 08/10] add whatsnew entry --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index e83f149db1f18..6524012d27fc9 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1077,6 +1077,7 @@ I/O - Bug in :meth:`pandas.io.stata.StataReader.value_labels` raising an ``AttributeError`` when called on very old files. Now returns an empty dict (:issue:`19417`) - Bug in :func:`read_pickle` when unpickling objects with :class:`TimedeltaIndex` or :class:`Float64Index` created with pandas prior to version 0.20 (:issue:`19939`) - Bug in :meth:`pandas.io.json.json_normalize` where subrecords are not properly normalized if any subrecords values are NoneType (:issue:`20030`) +- Bug in ``usecols`` parameter in :func:`pandas.io.read_csv` and :func:`pandas.io.read_table` where error is not raised correctly when passing a string. (:issue:`20529`) Plotting ^^^^^^^^ From 86cb16bf06feb5d57376518d002e16cd433d8df5 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 30 Mar 2018 22:43:02 +0100 Subject: [PATCH 09/10] update parameter description --- doc/source/io.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index ff505f525fc22..fd998d32cfbfb 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -130,11 +130,11 @@ index_col : int or sequence or ``False``, default ``None`` MultiIndex is used. If you have a malformed file with delimiters at the end of each line, you might consider ``index_col=False`` to force pandas to *not* use the first column as the index (row names). -usecols : array-like or callable, default ``None`` - Return a subset of the columns. If array-like, all elements must either +usecols : list-like or callable, default ``None`` + Return a subset of the columns. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings that correspond to column names provided either by the user in `names` or - inferred from the document header row(s). For example, a valid array-like + inferred from the document header row(s). For example, a valid list-like `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To From dafc44290f8f8d7a06fa28cdca77fc0a6dea2792 Mon Sep 17 00:00:00 2001 From: Ming Li Date: Sat, 31 Mar 2018 20:34:04 +0100 Subject: [PATCH 10/10] add test single string on usecols --- pandas/tests/io/parser/usecols.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 56a9b53cc9cf5..584711528e9cb 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -88,6 +88,18 @@ def test_usecols(self): pytest.raises(ValueError, self.read_csv, StringIO(data), names=['a', 'b'], usecols=[1], header=None) + def test_usecols_single_string(self): + # GH 20558 + data = """foo, bar, baz + 1000, 2000, 3000 + 4000, 5000, 6000 + """ + + usecols = 'foo' + + with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg): + self.read_csv(StringIO(data), usecols=usecols) + def test_usecols_index_col_False(self): # see gh-9082 s = "a,b,c,d\n1,2,3,4\n5,6,7,8"