BUG: read_csv usecols and names parameters inconsistent between c and python (#38445)

phofl · web-flow · commit a3dc788c4ccd · 2020-12-13T12:08:28.000-05:00
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -217,6 +217,7 @@ MultiIndex
 I/O
 ^^^
 
+- Bug in :func:`read_csv` not accepting ``usecols`` with a different length than ``names`` for ``engine="python"`` (:issue:`16469`)
 - Bug in :func:`read_csv` raising ``TypeError`` when ``names`` and ``parse_dates`` is specified for ``engine="c"`` (:issue:`33699`)
 -
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2487,9 +2487,8 @@ def read(self, rows=None):
             content = content[1:]
 
         alldata = self._rows_to_cols(content)
-        data = self._exclude_implicit_index(alldata)
+        data, columns = self._exclude_implicit_index(alldata)
 
-        columns = self._maybe_dedup_names(self.columns)
         columns, data = self._do_date_conversions(columns, data)
 
         data = self._convert_data(data)
@@ -2500,19 +2499,14 @@ def read(self, rows=None):
     def _exclude_implicit_index(self, alldata):
         names = self._maybe_dedup_names(self.orig_names)
 
+        offset = 0
         if self._implicit_index:
-            excl_indices = self.index_col
+            offset = len(self.index_col)
 
-            data = {}
-            offset = 0
-            for i, col in enumerate(names):
-                while i + offset in excl_indices:
-                    offset += 1
-                data[col] = alldata[i + offset]
-        else:
-            data = {k: v for k, v in zip(names, alldata)}
+        if self._col_indices is not None and len(names) != len(self._col_indices):
+            names = [names[i] for i in sorted(self._col_indices)]
 
-        return data
+        return {name: alldata[i + offset] for i, name in enumerate(names)}, names
 
     # legacy
     def get_chunk(self, size=None):
@@ -2694,9 +2688,7 @@ def _infer_columns(self):
                 self._clear_buffer()
 
             if names is not None:
-                if (self.usecols is not None and len(names) != len(self.usecols)) or (
-                    self.usecols is None and len(names) != len(columns[0])
-                ):
+                if len(names) > len(columns[0]):
                     raise ValueError(
                         "Number of passed names did not match "
                         "number of header fields in the file"
diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py
@@ -559,12 +559,7 @@ def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected
 
 
 @pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
-def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols, request):
-    if all_parsers.engine != "c":
-        reason = "see gh-16469: works on the C engine but not the Python engine"
-        # Number of passed names did not match number of header fields in the file
-        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))
-
+def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
     data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
     names = ["A", "B", "C", "D"]
     parser = all_parsers

Original file line number	Diff line number	Diff line change
`@@ -217,6 +217,7 @@ MultiIndex`
`217`	`217`	`I/O`
`218`	`218`	`^^^`
`219`	`219`
	`220`	+- Bug in :func:`read_csv` not accepting ``usecols`` with a different length than ``names`` for ``engine="python"`` (:issue:`16469`)
`220`	`221`	- Bug in :func:`read_csv` raising ``TypeError`` when ``names`` and ``parse_dates`` is specified for ``engine="c"`` (:issue:`33699`)
`221`	`222`	`-`
`222`	`223`