ENH: Add arrow engine to to_csv

lithomas1 · lithomas1 · commit d0e7d869fa35 · 2023-07-17T14:33:55.000-07:00
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3658,6 +3658,7 @@ def to_csv(
         path_or_buf: None = ...,
         sep: str = ...,
         na_rep: str = ...,
+        engine: str = ...,
         float_format: str | Callable | None = ...,
         columns: Sequence[Hashable] | None = ...,
         header: bool_t | list[str] = ...,
@@ -3685,6 +3686,7 @@ def to_csv(
         path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
         sep: str = ...,
         na_rep: str = ...,
+        engine: str = ...,
         float_format: str | Callable | None = ...,
         columns: Sequence[Hashable] | None = ...,
         header: bool_t | list[str] = ...,
@@ -3716,6 +3718,7 @@ def to_csv(
         path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
         sep: str = ",",
         na_rep: str = "",
+        engine: str = "python",
         float_format: str | Callable | None = None,
         columns: Sequence[Hashable] | None = None,
         header: bool_t | list[str] = True,
@@ -3755,6 +3758,13 @@ def to_csv(
             String of length 1. Field delimiter for the output file.
         na_rep : str, default ''
             Missing data representation.
+        engine : str, default 'python'
+            The engine to use. Available options are "pyarrow" or "python".
+            The pyarrow engine requires the pyarrow library to be installed
+            and is generally faster than the python engine.
+
+            However, the python engine may be more feature complete than the
+            pyarrow engine.
         float_format : str, Callable, default None
             Format string for floating point numbers. If a Callable is given, it takes
             precedence over other numeric formatting parameters, like decimal.
@@ -3890,6 +3900,7 @@ def to_csv(
 
         return DataFrameRenderer(formatter).to_csv(
             path_or_buf,
+            engine=engine,
             lineterminator=lineterminator,
             sep=sep,
             encoding=encoding,
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -20,6 +20,7 @@
 import numpy as np
 
 from pandas._libs import writers as libwriters
+from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.dtypes.generic import (
@@ -57,6 +58,7 @@ def __init__(
         self,
         formatter: DataFrameFormatter,
         path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "",
+        engine: str = "python",
         sep: str = ",",
         cols: Sequence[Hashable] | None = None,
         index_label: IndexLabel | None = None,
@@ -78,6 +80,7 @@ def __init__(
         self.obj = self.fmt.frame
 
         self.filepath_or_buffer = path_or_buf
+        self.engine = engine
         self.encoding = encoding
         self.compression: CompressionOptions = compression
         self.mode = mode
@@ -252,22 +255,58 @@ def save(self) -> None:
             storage_options=self.storage_options,
         ) as handles:
             # Note: self.encoding is irrelevant here
+            self._save(handles.handle)
+
+    def _save_pyarrow(self, handle) -> None:
+        pa = import_optional_dependency("pyarrow")
+        pa_csv = import_optional_dependency("pyarrow.csv")
+        # Move the index into an ordinary column named "" so that it is
+        # written out like any other column, under an empty header name.
+        # TODO: this won't work for multi-indexes
+        obj = self.obj.reset_index(names=[""])
+
+        table = pa.Table.from_pandas(obj)
+
+        # Translate self.quoting (a csvlib constant) to pyarrow's quoting_style
+        pa_quoting = None
+        if self.quoting == csvlib.QUOTE_MINIMAL:
+            pa_quoting = "needed"
+        elif self.quoting == csvlib.QUOTE_ALL:
+            # TODO: confirm this is a 1-1 mapping with QUOTE_ALL.
+            # "all_valid" leaves nulls unquoted; verify the python engine matches
+            pa_quoting = "all_valid"
+        elif self.quoting == csvlib.QUOTE_NONE:
+            pa_quoting = "none"
+        else:
+            raise ValueError(
+                f"Quoting option {self.quoting} is not supported with engine='pyarrow'"
+            )
+
+        write_options = pa_csv.WriteOptions(
+            include_header=self._need_to_save_header,
+            batch_size=self.chunksize,
+            delimiter=self.sep,
+            quoting_style=pa_quoting,
+        )
+        # NOTE(review): handle is unused; writes to self.filepath_or_buffer directly
+        pa_csv.write_csv(table, self.filepath_or_buffer, write_options)
+
+    def _save(self, handle) -> None:
+        if self.engine == "pyarrow":
+            self._save_pyarrow(handle)
+        else:
             self.writer = csvlib.writer(
-                handles.handle,
+                handle,
                 lineterminator=self.lineterminator,
                 delimiter=self.sep,
                 quoting=self.quoting,
                 doublequote=self.doublequote,
                 escapechar=self.escapechar,
                 quotechar=self.quotechar,
             )
-
-            self._save()
-
-    def _save(self) -> None:
-        if self._need_to_save_header:
-            self._save_header()
-        self._save_body()
+            if self._need_to_save_header:
+                self._save_header()
+            self._save_body()
 
     def _save_header(self) -> None:
         if not self.has_mi_columns or self._has_aliases:
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -1103,6 +1103,7 @@ def to_string(
     def to_csv(
         self,
         path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
+        engine: str = "python",
         encoding: str | None = None,
         sep: str = ",",
         columns: Sequence[Hashable] | None = None,
@@ -1132,6 +1133,7 @@ def to_csv(
 
         csv_formatter = CSVFormatter(
             path_or_buf=path_or_buf,
+            engine=engine,
             lineterminator=lineterminator,
             sep=sep,
             encoding=encoding,
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py