diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 243bcf6900d2e..46759fbbcf8b8 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -93,7 +93,7 @@ Multithreaded CSV reading with a new CSV Engine based on pyarrow ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster csv parsing on multicore machines -with pyarrow installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) +with pyarrow installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`, :issue:`43706`) .. _whatsnew_140.enhancements.window_rank: diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 033cd88da9687..5b1b178c4f610 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -91,12 +91,20 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: The processed DataFrame. """ num_cols = len(frame.columns) + multi_index_named = True if self.header is None: if self.names is None: if self.prefix is not None: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] elif self.header is None: self.names = range(num_cols) + if len(self.names) != num_cols: + # usecols is passed through to pyarrow, we only handle index col here + # The only way self.names is not the same length as number of cols is + # if we have int index_col. We should just pad the names(they will get + # removed anyways) to expected length then. + self.names = list(range(num_cols - len(self.names))) + self.names + multi_index_named = False frame.columns = self.names # we only need the frame not the names frame.columns, frame = self._do_date_conversions(frame.columns, frame) @@ -104,7 +112,14 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: for i, item in enumerate(self.index_col): if is_integer(item): self.index_col[i] = frame.columns[item] + else: + # String case + if item not in frame.columns: + raise ValueError(f"Index {item} invalid") frame.set_index(self.index_col, drop=True, inplace=True) + # Clear names if headerless and no name given + if self.header is None and not multi_index_named: + frame.index.names = [None] * len(frame.index.names) if self.kwds.get("dtype") is not None: frame = frame.astype(self.kwds.get("dtype")) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 9db1df345404b..69afb9fe56472 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -84,7 +84,6 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 6be82af5349ed..646cb2029919d 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -19,7 +19,6 @@ skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@skip_pyarrow @pytest.mark.parametrize("with_header", [True, False]) def test_index_col_named(all_parsers, with_header): parser = all_parsers @@ -228,7 +227,6 @@ def test_header_with_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.slow def test_index_col_large_csv(all_parsers): # https://github.com/pandas-dev/pandas/issues/37094