From 99b002e8148c6a453e83143bca8f152aaa77a793 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 22 Sep 2021 17:40:43 -0700 Subject: [PATCH 1/3] BUG: Fix some index_col tests for pyarrow read_csv --- pandas/io/parsers/arrow_parser_wrapper.py | 13 +++++++++++++ pandas/tests/io/parser/common/test_index.py | 1 - pandas/tests/io/parser/test_index_col.py | 2 -- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 033cd88da9687..4a50f99d1ca11 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -91,12 +91,18 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: The processed DataFrame. """ num_cols = len(frame.columns) + multi_index_named = True if self.header is None: if self.names is None: if self.prefix is not None: self.names = [f"{self.prefix}{i}" for i in range(num_cols)] elif self.header is None: self.names = range(num_cols) + if len(self.names) != num_cols: + # usecols is passed through to pyarrow, we only handle index col here + # pretty much we just pad names to the expected length + self.names = range(num_cols - len(self.names)) + self.names + multi_index_named = False frame.columns = self.names # we only need the frame not the names frame.columns, frame = self._do_date_conversions(frame.columns, frame) @@ -104,7 +110,14 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: for i, item in enumerate(self.index_col): if is_integer(item): self.index_col[i] = frame.columns[item] + else: + # String case + if item not in frame.columns: + raise ValueError(f"Index {item} invalid") frame.set_index(self.index_col, drop=True, inplace=True) + # Clear names if headerless and no name given + if self.header is None and not multi_index_named: + frame.index.names = [None] * len(frame.index.names) if self.kwds.get("dtype") is not None: frame = frame.astype(self.kwds.get("dtype")) diff --git 
a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index a37bd010d0e1b..082bb466d5dd2 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -80,7 +80,6 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) def test_multi_index_no_level_names(all_parsers, index_col): data = """index1,index2,A,B,C,D diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 6be82af5349ed..646cb2029919d 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -19,7 +19,6 @@ skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@skip_pyarrow @pytest.mark.parametrize("with_header", [True, False]) def test_index_col_named(all_parsers, with_header): parser = all_parsers @@ -228,7 +227,6 @@ def test_header_with_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@skip_pyarrow @pytest.mark.slow def test_index_col_large_csv(all_parsers): # https://github.com/pandas-dev/pandas/issues/37094 From 47d5b3367307a9c3eaf5a6acba2ef828111aa0cb Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 22 Sep 2021 20:04:17 -0700 Subject: [PATCH 2/3] fixes --- pandas/io/parsers/arrow_parser_wrapper.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 4a50f99d1ca11..5b1b178c4f610 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -100,8 +100,10 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame: self.names = range(num_cols) if len(self.names) != num_cols: # usecols is passed through to pyarrow, we only handle index col here - # pretty much we just pad names to the expected length - self.names = range(num_cols 
- len(self.names)) + self.names + # The only way self.names is not the same length as number of cols is + # if we have int index_col. We should just pad the names(they will get + # removed anyways) to expected length then. + self.names = list(range(num_cols - len(self.names))) + self.names multi_index_named = False frame.columns = self.names # we only need the frame not the names From bc500ed2b91a74859bdc077af9f7ae1e890caf35 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 27 Sep 2021 13:46:46 -0700 Subject: [PATCH 3/3] Update v1.4.0.rst --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f18b3b75ca3d2..ff92f55c67b12 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -92,7 +92,7 @@ Multithreaded CSV reading with a new CSV Engine based on pyarrow ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster csv parsing on multicore machines -with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`) +with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`, :issue:`43706`) .. _whatsnew_140.enhancements.window_rank: