From ef898b4aff3f22d37cf10a676be918a4fc4fa1bd Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 27 May 2022 11:56:32 +0200 Subject: [PATCH 1/2] BUG: read_csv adding additional columns as integers instead of strings --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/parsers.pyx | 4 +++- .../io/parser/usecols/test_usecols_basic.py | 18 ++++++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index febad9f2a019c..65d25011b71d0 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -809,6 +809,7 @@ I/O - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) - Bug in :func:`read_html` where elements surrounding ``<br>`` were joined without a space between them (:issue:`29528`) +- Bug in :func:`read_csv` adding columns as integers instead of string when data is longer than header leading to issue with ``usecols`` (:issue:`46997`) - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`) - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index ac7199821e2b0..b07fa143c98b6 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1304,8 +1304,10 @@ cdef class TextReader: if self.header is not None: j = i - self.leading_cols # generate extra (bogus) headers if there are more columns than headers + # These should be strings, not integers, because otherwise we might get + # issues with callables as usecols GH#46997 if j >= len(self.header[0]): - return j + return str(j) elif self.has_mi_columns: return tuple(header_row[j] for header_row in self.header) else: diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index f35caf38c847f..aef1937dcf287 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -416,3 +416,21 @@ def test_usecols_indices_out_of_bounds(all_parsers, names): if names is None and parser.engine == "python": expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) + + +def test_usecols_additional_columns(all_parsers): + # GH#46997 + parser = all_parsers + usecols = lambda header: header.strip() in ["a", "b", "c"] + result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols) + expected = DataFrame({"a": ["x"], "b": "y"}) + tm.assert_frame_equal(result, expected) + + +def 
test_usecols_additional_columns_integer_columns(all_parsers): + # GH#46997 + parser = all_parsers + usecols = lambda header: header.strip() in ["0", "1"] + result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols) + expected = DataFrame({"0": ["x"], "1": "y"}) + tm.assert_frame_equal(result, expected) From 65593092d3fee2d6832862b55915aeac1de2685c Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 27 May 2022 14:42:17 +0200 Subject: [PATCH 2/2] Reword whatsnew --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 65d25011b71d0..6da0b60f0bcb9 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -809,7 +809,7 @@ I/O - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`) - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`) - Bug in :func:`read_html` where elements surrounding ``<br>`` were joined without a space between them (:issue:`29528`) -- Bug in :func:`read_csv` adding columns as integers instead of string when data is longer than header leading to issue with ``usecols`` (:issue:`46997`) +- Bug in :func:`read_csv` when data is longer than header leading to issues with callables in ``usecols`` expecting strings (:issue:`46997`) - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`) - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`) - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)