Skip to content

Commit a3dc788

Browse files
authored
BUG: read_csv usecols and names parameters inconsistent between c and python (#38445)
1 parent b7b87da commit a3dc788

File tree

3 files changed

+9
-21
lines changed

3 files changed

+9
-21
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ MultiIndex
217217
I/O
218218
^^^
219219

220+
- Bug in :func:`read_csv` not accepting ``usecols`` with a different length than ``names`` for ``engine="python"`` (:issue:`16469`)
220221
- Bug in :func:`read_csv` raising ``TypeError`` when ``names`` and ``parse_dates`` are specified for ``engine="c"`` (:issue:`33699`)
221222
-
222223

pandas/io/parsers.py

+7-15
Original file line numberDiff line numberDiff line change
@@ -2487,9 +2487,8 @@ def read(self, rows=None):
24872487
content = content[1:]
24882488

24892489
alldata = self._rows_to_cols(content)
2490-
data = self._exclude_implicit_index(alldata)
2490+
data, columns = self._exclude_implicit_index(alldata)
24912491

2492-
columns = self._maybe_dedup_names(self.columns)
24932492
columns, data = self._do_date_conversions(columns, data)
24942493

24952494
data = self._convert_data(data)
@@ -2500,19 +2499,14 @@ def read(self, rows=None):
25002499
def _exclude_implicit_index(self, alldata):
25012500
names = self._maybe_dedup_names(self.orig_names)
25022501

2502+
offset = 0
25032503
if self._implicit_index:
2504-
excl_indices = self.index_col
2504+
offset = len(self.index_col)
25052505

2506-
data = {}
2507-
offset = 0
2508-
for i, col in enumerate(names):
2509-
while i + offset in excl_indices:
2510-
offset += 1
2511-
data[col] = alldata[i + offset]
2512-
else:
2513-
data = {k: v for k, v in zip(names, alldata)}
2506+
if self._col_indices is not None and len(names) != len(self._col_indices):
2507+
names = [names[i] for i in sorted(self._col_indices)]
25142508

2515-
return data
2509+
return {name: alldata[i + offset] for i, name in enumerate(names)}, names
25162510

25172511
# legacy
25182512
def get_chunk(self, size=None):
@@ -2694,9 +2688,7 @@ def _infer_columns(self):
26942688
self._clear_buffer()
26952689

26962690
if names is not None:
2697-
if (self.usecols is not None and len(names) != len(self.usecols)) or (
2698-
self.usecols is None and len(names) != len(columns[0])
2699-
):
2691+
if len(names) > len(columns[0]):
27002692
raise ValueError(
27012693
"Number of passed names did not match "
27022694
"number of header fields in the file"

pandas/tests/io/parser/test_usecols.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -559,12 +559,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected
559559

560560

561561
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
562-
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
563-
if all_parsers.engine != "c":
564-
reason = "see gh-16469: works on the C engine but not the Python engine"
565-
# Number of passed names did not match number of header fields in the file
566-
request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
567-
562+
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
568563
data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
569564
names = ["A", "B", "C", "D"]
570565
parser = all_parsers

0 commit comments

Comments
 (0)