BUG: Fix some index_col tests for pyarrow read_csv (pandas-dev#43706)

lithomas1 · web-flow · commit 7692d2813083 · 2021-09-27T19:32:41.000-04:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -93,7 +93,7 @@ Multithreaded CSV reading with a new CSV Engine based on pyarrow
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 :func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster csv parsing on multicore machines
-with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)
+with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`, :issue:`43706`)
 
 .. _whatsnew_140.enhancements.window_rank:
 
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -91,20 +91,35 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame:
             The processed DataFrame.
         """
         num_cols = len(frame.columns)
+        multi_index_named = True
         if self.header is None:
             if self.names is None:
                 if self.prefix is not None:
                     self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
                 elif self.header is None:
                     self.names = range(num_cols)
+            if len(self.names) != num_cols:
+                # usecols is passed through to pyarrow, we only handle index col here
+                # The only way self.names is not the same length as number of cols is
+                # if we have int index_col. We should just pad the names(they will get
+                # removed anyways) to expected length then.
+                self.names = list(range(num_cols - len(self.names))) + self.names
+                multi_index_named = False
             frame.columns = self.names
         # we only need the frame not the names
         frame.columns, frame = self._do_date_conversions(frame.columns, frame)
         if self.index_col is not None:
             for i, item in enumerate(self.index_col):
                 if is_integer(item):
                     self.index_col[i] = frame.columns[item]
+                else:
+                    # String case
+                    if item not in frame.columns:
+                        raise ValueError(f"Index {item} invalid")
             frame.set_index(self.index_col, drop=True, inplace=True)
+            # Clear names if headerless and no name given
+            if self.header is None and not multi_index_named:
+                frame.index.names = [None] * len(frame.index.names)
 
         if self.kwds.get("dtype") is not None:
             frame = frame.astype(self.kwds.get("dtype"))
diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py
@@ -84,7 +84,6 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
 def test_multi_index_no_level_names(all_parsers, index_col):
     data = """index1,index2,A,B,C,D
diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
@@ -19,7 +19,6 @@
 skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
 
 
-@skip_pyarrow
 @pytest.mark.parametrize("with_header", [True, False])
 def test_index_col_named(all_parsers, with_header):
     parser = all_parsers
@@ -228,7 +227,6 @@ def test_header_with_index_col(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@skip_pyarrow
 @pytest.mark.slow
 def test_index_col_large_csv(all_parsers):
     # https://github.com/pandas-dev/pandas/issues/37094