Skip to content

Commit ef898b4

Browse files
committed
BUG: read_csv adding additional columns as integers instead of strings
1 parent 647271e commit ef898b4

File tree

3 files changed

+22
-1
lines changed

3 files changed

+22
-1
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -809,6 +809,7 @@ I/O
809809
- Bug in :func:`read_parquet` when ``engine="pyarrow"`` which caused partial write to disk when column of unsupported datatype was passed (:issue:`44914`)
810810
- Bug in :func:`DataFrame.to_excel` and :class:`ExcelWriter` would raise when writing an empty DataFrame to a ``.ods`` file (:issue:`45793`)
811811
- Bug in :func:`read_html` where elements surrounding ``<br>`` were joined without a space between them (:issue:`29528`)
812+
- Bug in :func:`read_csv` where extra columns were labeled with integers instead of strings when data rows are longer than the header, leading to issues when a callable is passed as ``usecols`` (:issue:`46997`)
812813
- Bug in Parquet roundtrip for Interval dtype with ``datetime64[ns]`` subtype (:issue:`45881`)
813814
- Bug in :func:`read_excel` when reading a ``.ods`` file with newlines between xml elements (:issue:`45598`)
814815
- Bug in :func:`read_parquet` when ``engine="fastparquet"`` where the file was not closed on error (:issue:`46555`)

pandas/_libs/parsers.pyx

+3-1
Original file line numberDiff line numberDiff line change
@@ -1304,8 +1304,10 @@ cdef class TextReader:
13041304
if self.header is not None:
13051305
j = i - self.leading_cols
13061306
# generate extra (bogus) headers if there are more columns than headers
1307+
# These should be strings, not integers, because otherwise we might get
1308+
# issues with callables as usecols GH#46997
13071309
if j >= len(self.header[0]):
1308-
return j
1310+
return str(j)
13091311
elif self.has_mi_columns:
13101312
return tuple(header_row[j] for header_row in self.header)
13111313
else:

pandas/tests/io/parser/usecols/test_usecols_basic.py

+18
Original file line numberDiff line numberDiff line change
@@ -416,3 +416,21 @@ def test_usecols_indices_out_of_bounds(all_parsers, names):
416416
if names is None and parser.engine == "python":
417417
expected = DataFrame({"a": [1]})
418418
tm.assert_frame_equal(result, expected)
419+
420+
421+
def test_usecols_additional_columns(all_parsers):
422+
# GH#46997
423+
parser = all_parsers
424+
usecols = lambda header: header.strip() in ["a", "b", "c"]
425+
result = parser.read_csv(StringIO("a,b\nx,y,z"), index_col=False, usecols=usecols)
426+
expected = DataFrame({"a": ["x"], "b": "y"})
427+
tm.assert_frame_equal(result, expected)
428+
429+
430+
def test_usecols_additional_columns_integer_columns(all_parsers):
431+
# GH#46997
432+
parser = all_parsers
433+
usecols = lambda header: header.strip() in ["0", "1"]
434+
result = parser.read_csv(StringIO("0,1\nx,y,z"), index_col=False, usecols=usecols)
435+
expected = DataFrame({"0": ["x"], "1": "y"})
436+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)