Skip to content

Commit fc19062

Browse files
committed
Add errors option in pandas.DataFrame.to_csv
errors : str, default 'strict' — Response when the input string can’t be converted according to the encoding’s rules.
1 parent 5de4e55 commit fc19062

File tree

4 files changed

+109
-3
lines changed

4 files changed

+109
-3
lines changed

pandas/core/generic.py

+8
Original file line numberDiff line numberDiff line change
@@ -3069,6 +3069,7 @@ def to_csv(
30693069
doublequote=True,
30703070
escapechar=None,
30713071
decimal=".",
3072+
errors="strict",
30723073
):
30733074
r"""
30743075
Write object to a comma-separated values (csv) file.
@@ -3151,6 +3152,12 @@ def to_csv(
31513152
decimal : str, default '.'
31523153
Character recognized as decimal separator. E.g. use ',' for
31533154
European data.
3155+
errors : str, default 'strict'
3156+
Response when the input string can’t be converted according to
3157+
the encoding’s rules.
3158+
strict: Raise UnicodeError
3159+
ignore: Ignore the malformed data and continue without further notice
3160+
replace: Replace with a suitable replacement marker
31543161
31553162
Returns
31563163
-------
@@ -3197,6 +3204,7 @@ def to_csv(
31973204
doublequote=doublequote,
31983205
escapechar=escapechar,
31993206
decimal=decimal,
3207+
errors=errors,
32003208
)
32013209
formatter.save()
32023210

pandas/io/common.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,13 @@ def _infer_compression(
310310

311311

312312
def _get_handle(
313-
path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True
313+
path_or_buf,
314+
mode,
315+
encoding=None,
316+
compression=None,
317+
memory_map=False,
318+
is_text=True,
319+
errors="strict",
314320
):
315321
"""
316322
Get file handle for given path/buffer and mode.
@@ -331,6 +337,12 @@ def _get_handle(
331337
is_text : boolean, default True
332338
whether file/buffer is in text format (csv, json, etc.), or in binary
333339
mode (pickle, etc.)
340+
errors : str, default 'strict'
341+
Response when the input string can’t be converted according to
342+
the encoding’s rules.
343+
strict: Raise UnicodeError
344+
ignore: Ignore the malformed data and continue without further notice
345+
replace: Replace with a suitable replacement marker
334346
335347
Returns
336348
-------
@@ -407,10 +419,10 @@ def _get_handle(
407419
elif is_path:
408420
if encoding:
409421
# Encoding
410-
f = open(path_or_buf, mode, encoding=encoding, newline="")
422+
f = open(path_or_buf, mode, errors=errors, encoding=encoding, newline="")
411423
elif is_text:
412424
# No explicit encoding
413-
f = open(path_or_buf, mode, errors="replace", newline="")
425+
f = open(path_or_buf, mode, errors=errors, newline="")
414426
else:
415427
# Binary mode
416428
f = open(path_or_buf, mode)

pandas/io/formats/csvs.py

+4
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def __init__(
5151
doublequote=True,
5252
escapechar=None,
5353
decimal=".",
54+
errors="strict",
5455
):
5556

5657
self.obj = obj
@@ -93,6 +94,8 @@ def __init__(
9394

9495
self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex)
9596

97+
self.errors = errors
98+
9699
# validate mi options
97100
if self.has_mi_columns:
98101
if cols is not None:
@@ -179,6 +182,7 @@ def save(self):
179182
self.mode,
180183
encoding=self.encoding,
181184
compression=self.compression,
185+
errors=self.errors,
182186
)
183187
close = True
184188

pandas/tests/io/formats/test_to_csv.py

+82
Original file line numberDiff line numberDiff line change
@@ -514,3 +514,85 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer):
514514
df.to_csv(path, compression=to_compression)
515515
result = pd.read_csv(path, index_col=0, compression=read_compression)
516516
tm.assert_frame_equal(result, df)
517+
518+
def test_to_csv_string_with_sjis(self):
519+
data = {"int": [1, 2, 3], "str_sjis": ["abc", "\u070a", "def"]}
520+
df = pd.DataFrame(data)
521+
# case 1: errors=strict
522+
with tm.ensure_clean("sjis_test.csv") as path:
523+
expected_exception = UnicodeEncodeError
524+
with pytest.raises(expected_exception):
525+
df.to_csv(
526+
path,
527+
line_terminator="\n",
528+
encoding="sjis",
529+
errors="strict",
530+
index=False,
531+
)
532+
533+
# case 2: errors=replace
534+
with tm.ensure_clean("sjis_test.csv") as path:
535+
expected_sjis = b"int,str_sjis\n" b"1,abc\n" b"2,?\n" b"3,def\n"
536+
df.to_csv(
537+
path,
538+
line_terminator="\n",
539+
encoding="sjis",
540+
errors="replace",
541+
index=False,
542+
)
543+
with open(path, "rb") as f:
544+
assert f.read() == expected_sjis
545+
546+
# case 3: errors=ignore
547+
with tm.ensure_clean("sjis_test.csv") as path:
548+
expected_sjis = b"int,str_sjis\n" b"1,abc\n" b"2,\n" b"3,def\n"
549+
df.to_csv(
550+
path,
551+
line_terminator="\n",
552+
encoding="sjis",
553+
errors="ignore",
554+
index=False,
555+
)
556+
with open(path, "rb") as f:
557+
assert f.read() == expected_sjis
558+
559+
def test_to_csv_string_with_cp932(self):
560+
data = {"int": [1, 2, 3], "str_cp932": ["abc", "\u070a", "def"]}
561+
df = pd.DataFrame(data)
562+
# case 1: errors=strict
563+
with tm.ensure_clean("cp932_test.csv") as path:
564+
expected_exception = UnicodeEncodeError
565+
with pytest.raises(expected_exception):
566+
df.to_csv(
567+
path,
568+
line_terminator="\n",
569+
encoding="cp932",
570+
errors="strict",
571+
index=False,
572+
)
573+
574+
# case 2: errors=replace
575+
with tm.ensure_clean("cp932_test.csv") as path:
576+
expected_cp932 = b"int,str_cp932\n" b"1,abc\n" b"2,?\n" b"3,def\n"
577+
df.to_csv(
578+
path,
579+
line_terminator="\n",
580+
encoding="cp932",
581+
errors="replace",
582+
index=False,
583+
)
584+
with open(path, "rb") as f:
585+
assert f.read() == expected_cp932
586+
587+
# case 3: errors=ignore
588+
with tm.ensure_clean("cp932_test.csv") as path:
589+
expected_cp932 = b"int,str_cp932\n" b"1,abc\n" b"2,\n" b"3,def\n"
590+
df.to_csv(
591+
path,
592+
line_terminator="\n",
593+
encoding="cp932",
594+
errors="ignore",
595+
index=False,
596+
)
597+
with open(path, "rb") as f:
598+
assert f.read() == expected_cp932

0 commit comments

Comments
 (0)