From 9a82d19d8f945d6e5e4e8b8745f91273be335678 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 23 Apr 2021 23:40:10 +0200 Subject: [PATCH 1/7] Bug in read_csv raising list index out of range instead of ParserError --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/io/parsers/python_parser.py | 7 +++++++ pandas/tests/io/parser/test_python_parser_only.py | 12 ++++++++++++ 3 files changed, 20 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 842b50ce53b21..2f50db55b4d96 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -797,6 +797,7 @@ I/O - Bug in :func:`read_excel` raising ``AttributeError`` with ``MultiIndex`` header followed by two empty rows and no index, and bug affecting :func:`read_excel`, :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_clipboard` where one blank row after a ``MultiIndex`` header with no index would be dropped (:issue:`40442`) - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) - Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) +- Bug in :func:`read_csv` raising uncontrolled ``ValueError`` when ``usecols`` index is ouf of bounds, now raising ``ParserError`` (:issue:`25623`) Period ^^^^^^ diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index a6d38eab99977..89722e1c76503 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -537,6 +537,13 @@ def _handle_usecols(self, columns, usecols_key): else: col_indices.append(col) else: + missing_usecols = [ + col for col in self.usecols if col > len(usecols_key) + ] + if missing_usecols: + raise ParserError( + f"Usecols indices {missing_usecols} are out of bounds!" + ) col_indices = self.usecols columns = [ diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index cf6866946ab76..76d1bd470532e 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -312,3 +312,15 @@ def test_malformed_skipfooter(python_parser_only): msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) + + +def test_usecols_indices_out_of_bounds(python_parser_only): + # GH#25623 + parser = python_parser_only + data = """ + a,b + 1,2 + """ + msg = r"Usecols indices \[10\] are out of bounds!" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), usecols=[0, 10]) From 0287dd98a20a18275b9d89ccddfc2375b1b59a19 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 24 Apr 2021 00:39:43 +0200 Subject: [PATCH 2/7] Deprecated usecols with out of bounds indices in read_csv with c engine --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/parsers.pyx | 11 +++++++++++ pandas/tests/io/parser/test_c_parser_only.py | 13 +++++++++++++ 3 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 842b50ce53b21..bc301fad730df 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -611,6 +611,7 @@ Deprecations - Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`) - The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories` is deprecated and will be removed in a future version (:issue:`37643`) - Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`) +- Deprecated using ``usecols`` with out of bounds indices for ``read_csv`` with ``engine="c"`` (:issue:`25623`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 1a5ac31cc821b..5aa9f1db59e37 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -941,6 +941,17 @@ cdef class TextReader: f"{self.table_width - self.leading_cols} " f"and found {num_cols}") + if (self.usecols is not None and not callable(self.usecols) and + all(isinstance(u, int) for u in self.usecols)): + missing_usecols = [col for col in self.usecols if col >= num_cols] + if missing_usecols: + warnings.warn( + "Defining usecols with out of bounds indices is deprecated " + "and will raise a ParserError in a future version.", + FutureWarning, + stacklevel=6, + ) + results = {} nused = 0 for i in range(self.table_width): diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index f8aff3ad3696a..9060ca65b48c4 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -680,3 +680,16 @@ def test_float_precision_options(c_parser_only): with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(s), float_precision="junk") + + +def test_usecols_indices_out_of_bounds(c_parser_only): + # GH#25623 + parser = c_parser_only + data = """ +a,b +1,2 + """ + with tm.assert_produces_warning(FutureWarning): + result = parser.read_csv(StringIO(data), usecols=[0, 2]) + expected = DataFrame({"a": [1], "b": None}) + tm.assert_frame_equal(result, expected) From 97158edc69233c2d534a8fd005ef3c5c91e2a0fe Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 24 Apr 2021 00:41:41 +0200 Subject: [PATCH 3/7] Adjust test --- pandas/io/parsers/python_parser.py | 2 +- pandas/tests/io/parser/test_python_parser_only.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 89722e1c76503..205800d5ceaaa 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -538,7 +538,7 @@ def _handle_usecols(self, columns, usecols_key): col_indices.append(col) else: missing_usecols = [ - col for col in self.usecols if col > len(usecols_key) + col for col in self.usecols if col >= len(usecols_key) ] if missing_usecols: raise ParserError( diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 76d1bd470532e..231780224ee12 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -321,6 +321,6 @@ def test_usecols_indices_out_of_bounds(python_parser_only): a,b 1,2 """ - msg = r"Usecols indices \[10\] are out of bounds!" + msg = r"Usecols indices \[2\] are out of bounds!" with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), usecols=[0, 10]) + parser.read_csv(StringIO(data), usecols=[0, 2]) From f446e4f113849cdad0a28f6ffeb8a722e4bdd03b Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 24 Apr 2021 17:19:18 +0200 Subject: [PATCH 4/7] Fix bug for names and add tests --- pandas/io/parsers/python_parser.py | 18 +++++++++++------- .../tests/io/parser/test_python_parser_only.py | 8 ++++++-- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 205800d5ceaaa..21bfc9ee88efe 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -470,12 +470,14 @@ def _infer_columns(self): if self.usecols is not None: # Set _use_cols. We don't store columns because they are # overwritten. - self._handle_usecols(columns, names) + self._handle_usecols(columns, names, num_original_columns) else: num_original_columns = len(names) columns = [names] else: - columns = self._handle_usecols(columns, columns[0]) + columns = self._handle_usecols( + columns, columns[0], num_original_columns + ) else: try: line = self._buffered_line() @@ -494,10 +496,12 @@ def _infer_columns(self): columns = [[f"{self.prefix}{i}" for i in range(ncols)]] else: columns = [list(range(ncols))] - columns = self._handle_usecols(columns, columns[0]) + columns = self._handle_usecols( + columns, columns[0], num_original_columns + ) else: if self.usecols is None or len(names) >= num_original_columns: - columns = self._handle_usecols([names], names) + columns = self._handle_usecols([names], names, num_original_columns) num_original_columns = len(names) else: if not callable(self.usecols) and len(names) != len(self.usecols): @@ -506,13 +510,13 @@ def _infer_columns(self): "header fields in the file" ) # Ignore output but set used columns. - self._handle_usecols([names], names) + self._handle_usecols([names], names, ncols) columns = [names] num_original_columns = ncols return columns, num_original_columns, unnamed_cols - def _handle_usecols(self, columns, usecols_key): + def _handle_usecols(self, columns, usecols_key, num_original_columns): """ Sets self._col_indices @@ -538,7 +542,7 @@ def _handle_usecols(self, columns, usecols_key): col_indices.append(col) else: missing_usecols = [ - col for col in self.usecols if col >= len(usecols_key) + col for col in self.usecols if col >= num_original_columns ] if missing_usecols: raise ParserError( diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 231780224ee12..49303658a63f9 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -314,8 +314,12 @@ def test_malformed_skipfooter(python_parser_only): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) -def test_usecols_indices_out_of_bounds(python_parser_only): +@pytest.mark.parametrize("header", [0, None]) +@pytest.mark.parametrize("names", [None, ["a", "b"], ["a", "b", "c"]]) +def test_usecols_indices_out_of_bounds(python_parser_only, names, header): # GH#25623 + if header == 0 and names == ["a", "b", "c"]: + pytest.skip("This case is not valid") parser = python_parser_only data = """ a,b @@ -323,4 +327,4 @@ def test_usecols_indices_out_of_bounds(python_parser_only): """ msg = r"Usecols indices \[2\] are out of bounds!" with pytest.raises(ParserError, match=msg): - parser.read_csv(StringIO(data), usecols=[0, 2]) + parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=header) From 21b496bf662bc80d5258b40ad8d0f143b93b0323 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 12 May 2021 19:54:32 +0200 Subject: [PATCH 5/7] Rebase after fixing bug on master --- pandas/io/parsers/python_parser.py | 8 ++++++-- pandas/tests/io/parser/test_python_parser_only.py | 3 +-- pandas/tests/io/parser/usecols/test_usecols_basic.py | 3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index da876eb71a0fc..c5784b8a0b01c 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -15,6 +15,7 @@ Tuple, cast, ) +import warnings import numpy as np @@ -557,8 +558,11 @@ def _handle_usecols(self, columns, usecols_key, num_original_columns): col for col in self.usecols if col >= num_original_columns ] if missing_usecols: - raise ParserError( - f"Usecols indices {missing_usecols} are out of bounds!" + warnings.warn( + "Defining usecols with out of bounds indices is deprecated " + "and will raise a ParserError in a future version.", + FutureWarning, + stacklevel=8, ) col_indices = self.usecols diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 49303658a63f9..a163490637343 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -325,6 +325,5 @@ def test_usecols_indices_out_of_bounds(python_parser_only, names, header): a,b 1,2 """ - msg = r"Usecols indices \[2\] are out of bounds!" - with pytest.raises(ParserError, match=msg): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=header) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 371b8bea7def2..b86dc5ef85fc6 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -383,7 +383,8 @@ def test_usecols_indices_out_of_bounds(all_parsers, names): a,b 1,2 """ - result = parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0) expected = DataFrame({"a": [1], "b": [None]}) if names is None and parser.engine == "python": expected = DataFrame({"a": [1]}) From 41e3310d40b06ba3719188a886cde34d5bb43a48 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 12 May 2021 20:02:58 +0200 Subject: [PATCH 6/7] Remove no longer necessary test --- pandas/tests/io/parser/test_c_parser_only.py | 13 ------------- pandas/tests/io/parser/test_python_parser_only.py | 15 --------------- 2 files changed, 28 deletions(-) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index d5f21f2a2636b..044af57f49240 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -682,16 +682,3 @@ def test_float_precision_options(c_parser_only): with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(s), float_precision="junk") - - -def test_usecols_indices_out_of_bounds(c_parser_only): - # GH#25623 - parser = c_parser_only - data = """ -a,b -1,2 - """ - with tm.assert_produces_warning(FutureWarning): - result = parser.read_csv(StringIO(data), usecols=[0, 2]) - expected = DataFrame({"a": [1], "b": None}) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index a163490637343..cf6866946ab76 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -312,18 +312,3 @@ def test_malformed_skipfooter(python_parser_only): msg = "Expected 3 fields in line 4, saw 5" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) - - -@pytest.mark.parametrize("header", [0, None]) -@pytest.mark.parametrize("names", [None, ["a", "b"], ["a", "b", "c"]]) -def test_usecols_indices_out_of_bounds(python_parser_only, names, header): - # GH#25623 - if header == 0 and names == ["a", "b", "c"]: - pytest.skip("This case is not valid") - parser = python_parser_only - data = """ - a,b - 1,2 - """ - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=header) From 5bef6766ce02e397c23a2f6792151be5ec16c524 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 13 May 2021 19:59:29 +0200 Subject: [PATCH 7/7] Type function --- pandas/io/parsers/python_parser.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index c5784b8a0b01c..9082e41698913 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -529,7 +529,12 @@ def _infer_columns(self): return columns, num_original_columns, unnamed_cols - def _handle_usecols(self, columns, usecols_key, num_original_columns): + def _handle_usecols( + self, + columns: List[List[Union[Optional[str], Optional[int]]]], + usecols_key: List[Union[Optional[str], Optional[int]]], + num_original_columns: int, + ): """ Sets self._col_indices