diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d86c1b7911528..26e548f519ecd 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -217,6 +217,7 @@ MultiIndex I/O ^^^ +- Bug in :func:`read_csv` not accepting ``usecols`` with a different length than ``names`` for ``engine="python"`` (:issue:`16469`) - Bug in :func:`read_csv` raising ``TypeError`` when ``names`` and ``parse_dates`` is specified for ``engine="c"`` (:issue:`33699`) - diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7a56b03326762..8177741b5252d 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2487,9 +2487,8 @@ def read(self, rows=None): content = content[1:] alldata = self._rows_to_cols(content) - data = self._exclude_implicit_index(alldata) + data, columns = self._exclude_implicit_index(alldata) - columns = self._maybe_dedup_names(self.columns) columns, data = self._do_date_conversions(columns, data) data = self._convert_data(data) @@ -2500,19 +2499,14 @@ def read(self, rows=None): def _exclude_implicit_index(self, alldata): names = self._maybe_dedup_names(self.orig_names) + offset = 0 if self._implicit_index: - excl_indices = self.index_col + offset = len(self.index_col) - data = {} - offset = 0 - for i, col in enumerate(names): - while i + offset in excl_indices: - offset += 1 - data[col] = alldata[i + offset] - else: - data = {k: v for k, v in zip(names, alldata)} + if self._col_indices is not None and len(names) != len(self._col_indices): + names = [names[i] for i in sorted(self._col_indices)] - return data + return {name: alldata[i + offset] for i, name in enumerate(names)}, names # legacy def get_chunk(self, size=None): @@ -2694,9 +2688,7 @@ def _infer_columns(self): self._clear_buffer() if names is not None: - if (self.usecols is not None and len(names) != len(self.usecols)) or ( - self.usecols is None and len(names) != len(columns[0]) - ): + if len(names) > len(columns[0]): raise ValueError( "Number of 
passed names did not match " "number of header fields in the file" diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index fbf3b0ea7c792..98e5801b3458e 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -559,12 +559,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) -def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request): - if all_parsers.engine != "c": - reason = "see gh-16469: works on the C engine but not the Python engine" - # Number of passed names did not match number of header fields in the file - request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError)) - +def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): data = "a,b,c,d\n1,2,3,4\n5,6,7,8" names = ["A", "B", "C", "D"] parser = all_parsers