Skip to content

Commit d0e7d86

Browse files
committed
ENH: Add arrow engine to to_csv
1 parent 3d7a44a commit d0e7d86

File tree

4 files changed

+199
-115
lines changed

4 files changed

+199
-115
lines changed

pandas/core/generic.py

+11
Original file line numberDiff line numberDiff line change
@@ -3658,6 +3658,7 @@ def to_csv(
36583658
path_or_buf: None = ...,
36593659
sep: str = ...,
36603660
na_rep: str = ...,
3661+
engine: str = "python",
36613662
float_format: str | Callable | None = ...,
36623663
columns: Sequence[Hashable] | None = ...,
36633664
header: bool_t | list[str] = ...,
@@ -3685,6 +3686,7 @@ def to_csv(
36853686
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
36863687
sep: str = ...,
36873688
na_rep: str = ...,
3689+
engine: str = "python",
36883690
float_format: str | Callable | None = ...,
36893691
columns: Sequence[Hashable] | None = ...,
36903692
header: bool_t | list[str] = ...,
@@ -3716,6 +3718,7 @@ def to_csv(
37163718
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
37173719
sep: str = ",",
37183720
na_rep: str = "",
3721+
engine: str = "python",
37193722
float_format: str | Callable | None = None,
37203723
columns: Sequence[Hashable] | None = None,
37213724
header: bool_t | list[str] = True,
@@ -3755,6 +3758,13 @@ def to_csv(
37553758
String of length 1. Field delimiter for the output file.
37563759
na_rep : str, default ''
37573760
Missing data representation.
3761+
engine : str, default 'python'
3762+
The engine used to write the CSV. Available options are "python" and "pyarrow".
3763+
The pyarrow engine requires the pyarrow library to be installed
3764+
and is generally faster than the python engine.
3765+
3766+
However, the python engine may be more feature-complete than the
3767+
pyarrow engine.
37583768
float_format : str, Callable, default None
37593769
Format string for floating point numbers. If a Callable is given, it takes
37603770
precedence over other numeric formatting parameters, like decimal.
@@ -3890,6 +3900,7 @@ def to_csv(
38903900

38913901
return DataFrameRenderer(formatter).to_csv(
38923902
path_or_buf,
3903+
engine=engine,
38933904
lineterminator=lineterminator,
38943905
sep=sep,
38953906
encoding=encoding,

pandas/io/formats/csvs.py

+47-8
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import numpy as np
2121

2222
from pandas._libs import writers as libwriters
23+
from pandas.compat._optional import import_optional_dependency
2324
from pandas.util._decorators import cache_readonly
2425

2526
from pandas.core.dtypes.generic import (
@@ -57,6 +58,7 @@ def __init__(
5758
self,
5859
formatter: DataFrameFormatter,
5960
path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "",
61+
engine: str = "python",
6062
sep: str = ",",
6163
cols: Sequence[Hashable] | None = None,
6264
index_label: IndexLabel | None = None,
@@ -78,6 +80,7 @@ def __init__(
7880
self.obj = self.fmt.frame
7981

8082
self.filepath_or_buffer = path_or_buf
83+
self.engine = engine
8184
self.encoding = encoding
8285
self.compression: CompressionOptions = compression
8386
self.mode = mode
@@ -252,22 +255,58 @@ def save(self) -> None:
252255
storage_options=self.storage_options,
253256
) as handles:
254257
# Note: self.encoding is irrelevant here
258+
self._save(handles.handle)
259+
260+
def _save_pyarrow(self, handle) -> None:
    """
    Write ``self.obj`` as CSV through pyarrow's CSV writer.

    The index is moved into a leading column with an empty name, the
    result is converted to a ``pyarrow.Table`` and that table is passed to
    ``pyarrow.csv.write_csv`` together with options derived from this
    formatter (header flag, chunk size, separator and quoting style).

    Parameters
    ----------
    handle
        Handle supplied by the caller. It is not used by this method: the
        output target is ``self.filepath_or_buffer``.

    Raises
    ------
    ValueError
        If ``self.quoting`` is not ``QUOTE_MINIMAL``, ``QUOTE_ALL`` or
        ``QUOTE_NONE``.
    """
    pa = import_optional_dependency("pyarrow")
    pa_csv = import_optional_dependency("pyarrow.csv")

    # The index is serialized as an ordinary column without a name, so
    # convert it to a column and rename it to the empty string.
    # TODO: this won't work for multi-indexes
    frame = self.obj.reset_index(names=[""])
    arrow_table = pa.Table.from_pandas(frame)

    # csv-module quoting constants paired with their pyarrow quoting styles.
    # TODO: Is QUOTE_ALL -> "all_valid" a 1-1 mapping?
    # "all_valid" doesn't quote nulls, check if Python does this
    quoting_pairs = (
        (csvlib.QUOTE_MINIMAL, "needed"),
        (csvlib.QUOTE_ALL, "all_valid"),
        (csvlib.QUOTE_NONE, "none"),
    )
    for csv_quoting, arrow_quoting in quoting_pairs:
        if self.quoting == csv_quoting:
            pa_quoting = arrow_quoting
            break
    else:
        raise ValueError(
            f"Quoting option {self.quoting} is not supported with engine='pyarrow'"
        )

    options = pa_csv.WriteOptions(
        include_header=self._need_to_save_header,
        batch_size=self.chunksize,
        delimiter=self.sep,
        quoting_style=pa_quoting,
    )
    # NOTE(review): the table is written to self.filepath_or_buffer rather
    # than to ``handle``, so whatever the caller configured on the handle
    # presumably does not apply to this engine -- TODO confirm and switch
    # to ``handle`` once it is known to be a binary sink pyarrow accepts.
    pa_csv.write_csv(arrow_table, self.filepath_or_buffer, options)
293+
294+
def _save(self, handle) -> None:
295+
if self.engine == "pyarrow":
296+
self._save_pyarrow(handle)
297+
else:
255298
self.writer = csvlib.writer(
256-
handles.handle,
299+
handle,
257300
lineterminator=self.lineterminator,
258301
delimiter=self.sep,
259302
quoting=self.quoting,
260303
doublequote=self.doublequote,
261304
escapechar=self.escapechar,
262305
quotechar=self.quotechar,
263306
)
264-
265-
self._save()
266-
267-
def _save(self) -> None:
268-
if self._need_to_save_header:
269-
self._save_header()
270-
self._save_body()
307+
if self._need_to_save_header:
308+
self._save_header()
309+
self._save_body()
271310

272311
def _save_header(self) -> None:
273312
if not self.has_mi_columns or self._has_aliases:

pandas/io/formats/format.py

+2
Original file line numberDiff line numberDiff line change
@@ -1103,6 +1103,7 @@ def to_string(
11031103
def to_csv(
11041104
self,
11051105
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
1106+
engine: str = "python",
11061107
encoding: str | None = None,
11071108
sep: str = ",",
11081109
columns: Sequence[Hashable] | None = None,
@@ -1132,6 +1133,7 @@ def to_csv(
11321133

11331134
csv_formatter = CSVFormatter(
11341135
path_or_buf=path_or_buf,
1136+
engine=engine,
11351137
lineterminator=lineterminator,
11361138
sep=sep,
11371139
encoding=encoding,

0 commit comments

Comments
 (0)