Skip to content

Commit 7692d28

Browse files
authored
BUG: Fix some index_col tests for pyarrow read_csv (pandas-dev#43706)
1 parent b1801bd commit 7692d28

File tree

4 files changed

+16
-4
lines changed

4 files changed

+16
-4
lines changed

doc/source/whatsnew/v1.4.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ Multithreaded CSV reading with a new CSV Engine based on pyarrow
9393
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
9494

9595
:func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster CSV parsing on multicore machines
96-
with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)
96+
with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`, :issue:`43706`)
9797

9898
.. _whatsnew_140.enhancements.window_rank:
9999

pandas/io/parsers/arrow_parser_wrapper.py

+15
Original file line numberDiff line numberDiff line change
@@ -91,20 +91,35 @@ def _finalize_output(self, frame: DataFrame) -> DataFrame:
9191
The processed DataFrame.
9292
"""
9393
num_cols = len(frame.columns)
94+
multi_index_named = True
9495
if self.header is None:
9596
if self.names is None:
9697
if self.prefix is not None:
9798
self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
9899
elif self.header is None:
99100
self.names = range(num_cols)
101+
if len(self.names) != num_cols:
102+
# usecols is passed through to pyarrow, we only handle index col here
103+
# The only way self.names is not the same length as number of cols is
104+
# if we have an int index_col. We should just pad the names (they will get
105+
# removed anyway) to the expected length.
106+
self.names = list(range(num_cols - len(self.names))) + self.names
107+
multi_index_named = False
100108
frame.columns = self.names
101109
# we only need the frame not the names
102110
frame.columns, frame = self._do_date_conversions(frame.columns, frame)
103111
if self.index_col is not None:
104112
for i, item in enumerate(self.index_col):
105113
if is_integer(item):
106114
self.index_col[i] = frame.columns[item]
115+
else:
116+
# String case
117+
if item not in frame.columns:
118+
raise ValueError(f"Index {item} invalid")
107119
frame.set_index(self.index_col, drop=True, inplace=True)
120+
# Clear names if headerless and no name given
121+
if self.header is None and not multi_index_named:
122+
frame.index.names = [None] * len(frame.index.names)
108123

109124
if self.kwds.get("dtype") is not None:
110125
frame = frame.astype(self.kwds.get("dtype"))

pandas/tests/io/parser/common/test_index.py

-1
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
8484
tm.assert_frame_equal(result, expected)
8585

8686

87-
@xfail_pyarrow
8887
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
8988
def test_multi_index_no_level_names(all_parsers, index_col):
9089
data = """index1,index2,A,B,C,D

pandas/tests/io/parser/test_index_col.py

-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
2020

2121

22-
@skip_pyarrow
2322
@pytest.mark.parametrize("with_header", [True, False])
2423
def test_index_col_named(all_parsers, with_header):
2524
parser = all_parsers
@@ -228,7 +227,6 @@ def test_header_with_index_col(all_parsers):
228227
tm.assert_frame_equal(result, expected)
229228

230229

231-
@skip_pyarrow
232230
@pytest.mark.slow
233231
def test_index_col_large_csv(all_parsers):
234232
# https://github.com/pandas-dev/pandas/issues/37094

0 commit comments

Comments
 (0)