BUG: read_csv adding additional columns as integers instead of strings

phofl · phofl · commit ef898b4aff3f · 2022-05-27T11:56:32.000+02:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -809,6 +809,7 @@ I/O
 - Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`)
 - Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`)
 - Bug in :func:`read_html` where elements surrounding ``<br>`` were joined without a space between them (:issue:`29528`)
+- Bug in :func:`read_csv` adding extra columns with integer labels instead of string labels when data rows are longer than the header, leading to issues with a callable ``usecols`` (:issue:`46997`)
 - Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`)
 - Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
 - Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1304,8 +1304,10 @@ cdef class TextReader:
             if self.header is not None:
                 j = i - self.leading_cols
                 # generate extra (bogus) headers if there are more columns than headers
+                # These should be strings, not integers, because otherwise we might get
+                # issues with callables as usecols GH#46997
                 if j >= len(self.header[0]):
-                    return j
+                    return str(j)
                 elif self.has_mi_columns:
                     return tuple(header_row[j] for header_row in self.header)
                 else:
diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
@@ -416,3 +416,21 @@ def test_usecols_indices_out_of_bounds(all_parsers, names):
     if names is None and parser.engine == "python":
         expected = DataFrame({"a": [1]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_additional_columns(all_parsers):
+    # GH#46997 data row is longer than the header; callable usecols needs str names
+    parser = all_parsers
+    usecols = lambda header: header.strip() in ["a", "b", "c"]
+    result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
+    expected = DataFrame({"a": ["x"], "b": "y"})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_usecols_additional_columns_integer_columns(all_parsers):
+    # GH#46997 digit-string header labels; the extra third column is not selected
+    parser = all_parsers
+    usecols = lambda header: header.strip() in ["0", "1"]
+    result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
+    expected = DataFrame({"0": ["x"], "1": "y"})
+    tm.assert_frame_equal(result, expected)