Skip to content

Commit 44e8822

Browse files
authored
ENH: Add Arrow CSV Reader (#43072)
1 parent 6e75d9d commit 44e8822

37 files changed

+583
-40
lines changed

asv_bench/benchmarks/io/csv.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def time_read_csv(self, bad_date_value):
206206
class ReadCSVSkipRows(BaseIO):
207207

208208
fname = "__test__.csv"
209-
params = ([None, 10000], ["c", "python"])
209+
params = ([None, 10000], ["c", "python", "pyarrow"])
210210
param_names = ["skiprows", "engine"]
211211

212212
def setup(self, skiprows, engine):
@@ -320,7 +320,7 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
320320

321321

322322
class ReadCSVEngine(StringIORewind):
323-
params = ["c", "python"]
323+
params = ["c", "python", "pyarrow"]
324324
param_names = ["engine"]
325325

326326
def setup(self, engine):

doc/source/user_guide/io.rst

+46-8
Original file line numberDiff line numberDiff line change
@@ -160,9 +160,15 @@ dtype : Type name or dict of column -> type, default ``None``
160160
(unsupported with ``engine='python'``). Use ``str`` or ``object`` together
161161
with suitable ``na_values`` settings to preserve and
162162
not interpret dtype.
163-
engine : {``'c'``, ``'python'``}
164-
Parser engine to use. The C engine is faster while the Python engine is
165-
currently more feature-complete.
163+
engine : {``'c'``, ``'python'``, ``'pyarrow'``}
164+
Parser engine to use. The C and pyarrow engines are faster, while the python engine
165+
is currently more feature-complete. Multithreading is currently only supported by
166+
the pyarrow engine.
167+
168+
.. versionadded:: 1.4.0
169+
170+
The "pyarrow" engine was added as an *experimental* engine, and some features
171+
are unsupported, or may not work correctly, with this engine.
166172
converters : dict, default ``None``
167173
Dict of functions for converting values in certain columns. Keys can either be
168174
integers or column labels.
@@ -1622,11 +1628,17 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object:
16221628
Specifying the parser engine
16231629
''''''''''''''''''''''''''''
16241630

1625-
Under the hood pandas uses a fast and efficient parser implemented in C as well
1626-
as a Python implementation which is currently more feature-complete. Where
1627-
possible pandas uses the C parser (specified as ``engine='c'``), but may fall
1628-
back to Python if C-unsupported options are specified. Currently, C-unsupported
1629-
options include:
1631+
pandas currently supports three engines: the C engine, the Python engine, and an experimental
1632+
pyarrow engine (requires the ``pyarrow`` package). In general, the pyarrow engine is fastest
1633+
on larger workloads and is equivalent in speed to the C engine on most other workloads.
1634+
The python engine tends to be slower than the pyarrow and C engines on most workloads. However,
1635+
the pyarrow engine is much less robust than the C engine, which lacks a few features compared to the
1636+
Python engine.
1637+
1638+
Where possible, pandas uses the C parser (specified as ``engine='c'``), but it may fall
1639+
back to Python if C-unsupported options are specified.
1640+
1641+
Currently, options unsupported by the C and pyarrow engines include:
16301642

16311643
* ``sep`` other than a single character (e.g. regex separators)
16321644
* ``skipfooter``
@@ -1635,6 +1647,32 @@ options include:
16351647
Specifying any of the above options will produce a ``ParserWarning`` unless the
16361648
python engine is selected explicitly using ``engine='python'``.
16371649

1650+
Options unsupported by the pyarrow engine that are not covered by the list above include:
1651+
1652+
* ``float_precision``
1653+
* ``chunksize``
1654+
* ``comment``
1655+
* ``nrows``
1656+
* ``thousands``
1657+
* ``memory_map``
1658+
* ``dialect``
1659+
* ``warn_bad_lines``
1660+
* ``error_bad_lines``
1661+
* ``on_bad_lines``
1662+
* ``delim_whitespace``
1663+
* ``quoting``
1664+
* ``lineterminator``
1665+
* ``converters``
1666+
* ``decimal``
1667+
* ``iterator``
1668+
* ``dayfirst``
1669+
* ``infer_datetime_format``
1670+
* ``verbose``
1671+
* ``skipinitialspace``
1672+
* ``low_memory``
1673+
1674+
Specifying these options with ``engine='pyarrow'`` will raise a ``ValueError``.
1675+
16381676
.. _io.remote:
16391677

16401678
Reading/writing remote files

doc/source/whatsnew/v1.4.0.rst

+6-3
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,13 @@ Styler
7878

7979
There are also bug fixes and deprecations listed below.
8080

81-
.. _whatsnew_140.enhancements.enhancement2:
81+
.. _whatsnew_140.enhancements.pyarrow_csv_engine:
8282

83-
enhancement2
84-
^^^^^^^^^^^^
83+
Multithreaded CSV reading with a new CSV Engine based on pyarrow
84+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
85+
86+
:func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster CSV parsing on multicore machines
87+
with pyarrow installed. See the :doc:`I/O docs </user_guide/io>` for more info. (:issue:`23697`)
8588

8689
.. _whatsnew_140.enhancements.other:
8790

+138
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
from __future__ import annotations
2+
3+
from pandas._typing import FilePathOrBuffer
4+
from pandas.compat._optional import import_optional_dependency
5+
6+
from pandas.core.dtypes.inference import is_integer
7+
8+
from pandas.core.frame import DataFrame
9+
10+
from pandas.io.common import get_handle
11+
from pandas.io.parsers.base_parser import ParserBase
12+
13+
14+
class ArrowParserWrapper(ParserBase):
    """
    Wrapper for the pyarrow engine for read_csv()

    The keyword arguments received by the constructor are validated, renamed
    to the option names understood by ``pyarrow.csv`` and, in :meth:`read`,
    used to parse ``src`` into a pyarrow table that is converted to a
    DataFrame and post-processed.
    """

    def __init__(self, src: FilePathOrBuffer, **kwds):
        """
        Store the source and keyword arguments and validate the keywords.

        Parameters
        ----------
        src : FilePathOrBuffer
            The path or buffer that :meth:`read` hands to ``get_handle``.
        **kwds
            The parser keyword arguments; also forwarded to ``ParserBase``.
        """
        self.kwds = kwds
        self.src = src

        ParserBase.__init__(self, kwds)

        self._parse_kwds()

    def _parse_kwds(self):
        """
        Validates keywords before passing to pyarrow.

        Raises
        ------
        ValueError
            If ``na_values`` is given as a dict.
        """
        encoding: str | None = self.kwds.get("encoding")
        self.encoding = "utf-8" if encoding is None else encoding

        self.usecols, self.usecols_dtype = self._validate_usecols_arg(
            self.kwds["usecols"]
        )
        na_values = self.kwds["na_values"]
        if isinstance(na_values, dict):
            raise ValueError(
                "The pyarrow engine doesn't support passing a dict for na_values"
            )
        self.na_values = list(na_values)

    def _get_pyarrow_options(self):
        """
        Rename some arguments to pass to pyarrow

        Renames the pandas keywords in ``self.kwds`` that have a pyarrow
        counterpart and then fills ``self.parse_options``,
        ``self.convert_options`` and ``self.read_options`` with the entries
        that are passed to the matching ``pyarrow.csv`` option classes.
        """
        mapping = {
            "usecols": "include_columns",
            "na_values": "null_values",
            "escapechar": "escape_char",
            "skip_blank_lines": "ignore_empty_lines",
        }
        for pandas_name, pyarrow_name in mapping.items():
            # A missing key and an explicit None are treated alike: the
            # entry is left untouched and never reaches pyarrow.
            if self.kwds.get(pandas_name) is not None:
                self.kwds[pyarrow_name] = self.kwds.pop(pandas_name)

        parse_option_names = (
            "delimiter",
            "quote_char",
            "escape_char",
            "ignore_empty_lines",
        )
        convert_option_names = (
            "include_columns",
            "null_values",
            "true_values",
            "false_values",
        )
        # Options that are None are dropped so that pyarrow's own defaults apply.
        self.parse_options = {
            option_name: option_value
            for option_name, option_value in self.kwds.items()
            if option_value is not None and option_name in parse_option_names
        }
        self.convert_options = {
            option_name: option_value
            for option_name, option_value in self.kwds.items()
            if option_value is not None and option_name in convert_option_names
        }
        self.read_options = {
            "autogenerate_column_names": self.header is None,
            # When a header is given it determines the number of skipped
            # rows; ``skiprows`` is only consulted when there is no header.
            "skip_rows": self.header
            if self.header is not None
            else self.kwds["skiprows"],
        }

    def _finalize_output(self, frame: DataFrame) -> DataFrame:
        """
        Processes data read in based on kwargs.

        Parameters
        ----------
        frame: DataFrame
            The DataFrame to process.

        Returns
        -------
        DataFrame
            The processed DataFrame.
        """
        num_cols = len(frame.columns)
        if self.header is None:
            if self.names is None:
                if self.prefix is not None:
                    self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
                else:
                    self.names = range(num_cols)
            frame.columns = self.names
        # we only need the frame not the names
        frame.columns, frame = self._do_date_conversions(frame.columns, frame)
        if self.index_col is not None:
            # Resolve positional entries to column labels in a new list so
            # that ``self.index_col`` (possibly the caller's own object, or
            # an immutable tuple) is never modified in place.
            index_to_set = [
                frame.columns[item] if is_integer(item) else item
                for item in self.index_col
            ]
            frame.set_index(index_to_set, drop=True, inplace=True)

        dtype = self.kwds.get("dtype")
        if dtype is not None:
            frame = frame.astype(dtype)
        return frame

    def read(self) -> DataFrame:
        """
        Reads the contents of a CSV file into a DataFrame and
        processes it according to the kwargs passed in the
        constructor.

        Returns
        -------
        DataFrame
            The DataFrame created from the CSV file.
        """
        pyarrow_csv = import_optional_dependency("pyarrow.csv")
        self._get_pyarrow_options()

        # The source is opened in binary mode and the raw handle is handed
        # to pyarrow.
        with get_handle(
            self.src, "rb", encoding=self.encoding, is_text=False
        ) as handles:
            table = pyarrow_csv.read_csv(
                handles.handle,
                read_options=pyarrow_csv.ReadOptions(**self.read_options),
                parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
                convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
            )

            frame = table.to_pandas()
        return self._finalize_output(frame)

0 commit comments

Comments
 (0)