BUG: Add errors argument to to_csv() call to enable error handling for encoders (#32702)

roberthdevries · web-flow · commit 43a463d22c60 · 2020-06-08T16:36:04.000-07:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -288,6 +288,7 @@ Other enhancements
 - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`).
 - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable（:issue:`11704`)
 - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
+- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`)
 - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
 - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).
 - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`).
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3049,6 +3049,7 @@ def to_csv(
         doublequote: bool_t = True,
         escapechar: Optional[str] = None,
         decimal: Optional[str] = ".",
+        errors: str = "strict",
     ) -> Optional[str]:
         r"""
         Write object to a comma-separated values (csv) file.
@@ -3143,6 +3144,12 @@ def to_csv(
         decimal : str, default '.'
             Character recognized as decimal separator. E.g. use ',' for
             European data.
+        errors : str, default 'strict'
+            Specifies how encoding and decoding errors are to be handled.
+            See the errors argument for :func:`open` for a full list
+            of options.
+
+            .. versionadded:: 1.1.0
 
         Returns
         -------
@@ -3180,6 +3187,7 @@ def to_csv(
             line_terminator=line_terminator,
             sep=sep,
             encoding=encoding,
+            errors=errors,
             compression=compression,
             quoting=quoting,
             na_rep=na_rep,
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -352,6 +352,7 @@ def get_handle(
     compression: Optional[Union[str, Mapping[str, Any]]] = None,
     memory_map: bool = False,
     is_text: bool = True,
+    errors=None,
 ):
     """
     Get file handle for given path/buffer and mode.
@@ -390,6 +391,12 @@ def get_handle(
     is_text : boolean, default True
         whether file/buffer is in text format (csv, json, etc.), or in binary
         mode (pickle, etc.).
+    errors : str, optional
+        Specifies how encoding and decoding errors are to be handled;
+        ``None`` (the default) behaves like ``'strict'``. See the errors
+        argument for :func:`open` for a full list of options.
+
+        .. versionadded:: 1.1.0
 
     Returns
     -------
@@ -475,7 +482,7 @@ def get_handle(
     elif is_path:
         if encoding:
             # Encoding
-            f = open(path_or_buf, mode, encoding=encoding, newline="")
+            f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="")
         elif is_text:
             # No explicit encoding
             f = open(path_or_buf, mode, errors="replace", newline="")
@@ -488,7 +495,7 @@ def get_handle(
     if is_text and (compression or isinstance(f, need_text_wrapping)):
         from io import TextIOWrapper
 
-        g = TextIOWrapper(f, encoding=encoding, newline="")
+        g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="")
         if not isinstance(f, (BufferedIOBase, RawIOBase)):
             handles.append(g)
         f = g
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -44,6 +44,7 @@ def __init__(
         index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None,
         mode: str = "w",
         encoding: Optional[str] = None,
+        errors: str = "strict",
         compression: Union[str, Mapping[str, str], None] = "infer",
         quoting: Optional[int] = None,
         line_terminator="\n",
@@ -77,6 +78,7 @@ def __init__(
         if encoding is None:
             encoding = "utf-8"
         self.encoding = encoding
+        self.errors = errors
         self.compression = infer_compression(self.path_or_buf, compression)
 
         if quoting is None:
@@ -184,6 +186,7 @@ def save(self) -> None:
                 self.path_or_buf,
                 self.mode,
                 encoding=self.encoding,
+                errors=self.errors,
                 compression=dict(self.compression_args, method=self.compression),
             )
             close = True
@@ -215,6 +218,7 @@ def save(self) -> None:
                         self.path_or_buf,
                         self.mode,
                         encoding=self.encoding,
+                        errors=self.errors,
                         compression=compression,
                     )
                     f.write(buf)
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
@@ -597,3 +597,13 @@ def test_na_rep_truncated(self):
         result = pd.Series([1.1, 2.2]).to_csv(na_rep=".")
         expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"])
         assert result == expected
+
+    @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"])
+    def test_to_csv_errors(self, errors):
+        # GH 22610
+        data = ["\ud800foo"]
+        ser = pd.Series(data, index=pd.Index(data))
+        with tm.ensure_clean("test.csv") as path:
+            ser.to_csv(path, errors=errors)
+        # No use in reading back the data as it is not the same anymore
+        # due to the error handling