diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 338c890ce317c..0e500db6fec63 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1710,6 +1710,8 @@ function takes a number of arguments. Only the first is required. appropriate (default None) * ``chunksize``: Number of rows to write at a time * ``date_format``: Format string for datetime objects +* ``encoding_errors``: Behavior when the input string can’t be converted according to the encoding’s rules (strict, ignore, replace, etc.) + .. versionadded:: 1.0.0 Writing a formatted string ++++++++++++++++++++++++++ diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3b6288146bdf2..afcb5b5f45bb3 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -21,7 +21,7 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -- +- :meth:`DataFrame.to_csv` now accepts an ``encoding_errors`` option that controls how encoding errors are handled (:issue:`27750`). - .. _whatsnew_1000.enhancements.other: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1a5b36b07e93c..7adb3b9675c6a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3084,6 +3084,7 @@ def to_csv( doublequote: bool_t = True, escapechar: Optional[str] = None, decimal: Optional[str] = ".", + encoding_errors: Optional[str] = "strict", ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3171,6 +3172,11 @@ def to_csv( decimal : str, default '.' Character recognized as decimal separator. E.g. use ',' for European data. + encoding_errors : str, default 'strict' + Behavior when the input string can’t be converted according to + the encoding’s rules (strict, ignore, replace, etc.) + See: https://docs.python.org/3/library/codecs.html#codec-base-classes + .. 
versionadded:: 1.0.0 Returns ------- @@ -3224,6 +3230,7 @@ def to_csv( doublequote=doublequote, escapechar=escapechar, decimal=decimal, + encoding_errors=encoding_errors, ) formatter.save() diff --git a/pandas/io/common.py b/pandas/io/common.py index 30228d660e816..fea57344718f6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -363,6 +363,7 @@ def _get_handle( compression: Optional[Union[str, Dict[str, Any]]] = None, memory_map: bool = False, is_text: bool = True, + encoding_errors: Optional[str] = "strict", ): """ Get file handle for given path/buffer and mode. @@ -395,6 +396,11 @@ def _get_handle( is_text : boolean, default True whether file/buffer is in text format (csv, json, etc.), or in binary mode (pickle, etc.). + encoding_errors : str, default 'strict' + Behavior when the input string can’t be converted according to + the encoding’s rules (strict, ignore, replace, etc.) + See: https://docs.python.org/3/library/codecs.html#codec-base-classes + .. versionadded:: 1.0.0 Returns ------- @@ -472,10 +478,12 @@ def _get_handle( elif is_path: if encoding: # Encoding - f = open(path_or_buf, mode, encoding=encoding, newline="") + f = open( + path_or_buf, mode, errors=encoding_errors, encoding=encoding, newline="" + ) elif is_text: # No explicit encoding - f = open(path_or_buf, mode, errors="replace", newline="") + f = open(path_or_buf, mode, errors=encoding_errors, newline="") else: # Binary mode f = open(path_or_buf, mode) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index e25862537cbfc..814c2c12d9037 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -52,6 +52,7 @@ def __init__( doublequote=True, escapechar=None, decimal=".", + encoding_errors="strict", ): self.obj = obj @@ -97,6 +98,8 @@ def __init__( self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex) + self.encoding_errors = encoding_errors + # validate mi options if self.has_mi_columns: if cols is not None: @@ -183,6 +186,7 @@ def save(self): 
self.mode, encoding=self.encoding, compression=dict(self.compression_args, method=self.compression), + encoding_errors=self.encoding_errors, ) close = True diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index ab44b8b8059eb..39c488b3afd7f 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -555,3 +555,467 @@ def test_to_csv_zip_arguments(self, compression, archive_name): assert len(zp.filelist) == 1 archived_file = os.path.basename(zp.filelist[0].filename) assert archived_file == expected_arcname + + def test_to_csv_path_with_sjis(self): + # https://github.com/pandas-dev/pandas/issues/27750 + data = {"int": [1, 2, 3], "str_sjis": ["abc", "\u070a", "def"]} + df = pd.DataFrame(data) + # case 1: encoding_errors=strict + with tm.ensure_clean("sjis_test.csv") as path: + with pytest.raises(UnicodeEncodeError): + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="strict", + index=False, + ) + + # case 2: encoding_errors=replace + with tm.ensure_clean("sjis_test.csv") as path: + expected_sjis = """\ +int,str_sjis +1,abc +2,? 
+3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="replace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_sjis + + # case 3: encoding_errors=ignore + with tm.ensure_clean("sjis_test.csv") as path: + expected_sjis = """\ +int,str_sjis +1,abc +2, +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="ignore", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_sjis + + # case 4: encoding_errors=xmlcharrefreplace + with tm.ensure_clean("sjis_test.csv") as path: + expected_sjis = """\ +int,str_sjis +1,abc +2,܊ +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="xmlcharrefreplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_sjis + + # case 5: encoding_errors=backslashreplace + with tm.ensure_clean("sjis_test.csv") as path: + expected_sjis = """\ +int,str_sjis +1,abc +2,\\u070a +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="backslashreplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_sjis + + # case 6: encoding_errors=namereplace + with tm.ensure_clean("sjis_test.csv") as path: + expected_sjis = """\ +int,str_sjis +1,abc +2,\\N{SYRIAC CONTRACTION} +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="namereplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_sjis + + # case 7: encoding_errors=surrogatepass + with tm.ensure_clean("sjis_test.csv") as path: + with pytest.raises(UnicodeEncodeError): + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="surrogatepass", + index=False, + ) + + def test_to_csv_path_with_cp932(self): + # https://github.com/pandas-dev/pandas/issues/27750 + data = {"int": [1, 2, 3], "str_cp932": ["abc", "\u070a", "def"]} + df = pd.DataFrame(data) + 
# case 1: encoding_errors=strict + with tm.ensure_clean("cp932_test.csv") as path: + with pytest.raises(UnicodeEncodeError): + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="strict", + index=False, + ) + + # case 2: encoding_errors=replace + with tm.ensure_clean("cp932_test.csv") as path: + expected_cp932 = """\ +int,str_cp932 +1,abc +2,? +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="replace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_cp932 + + # case 3: encoding_errors=ignore + with tm.ensure_clean("cp932_test.csv") as path: + expected_cp932 = """\ +int,str_cp932 +1,abc +2, +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="ignore", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_cp932 + + # case 4: encoding_errors=xmlcharrefreplace + with tm.ensure_clean("cp932_test.csv") as path: + expected_cp932 = """\ +int,str_cp932 +1,abc +2,܊ +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="xmlcharrefreplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_cp932 + + # case 5: encoding_errors=backslashreplace + with tm.ensure_clean("cp932_test.csv") as path: + expected_cp932 = """\ +int,str_cp932 +1,abc +2,\\u070a +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="backslashreplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_cp932 + + # case 6: encoding_errors=namereplace + with tm.ensure_clean("cp932_test.csv") as path: + expected_cp932 = """\ +int,str_cp932 +1,abc +2,\\N{SYRIAC CONTRACTION} +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="namereplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_cp932 + + # case 7: encoding_errors=surrogatepass + 
with tm.ensure_clean("cp932_test.csv") as path: + with pytest.raises(UnicodeEncodeError): + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="surrogatepass", + index=False, + ) + + def test_to_csv_file_object_with_sjis(self): + # https://github.com/pandas-dev/pandas/issues/27750 + data = {"int": [1, 2, 3], "str_sjis": ["abc", "\u070a", "def"]} + df = pd.DataFrame(data) + # case 1: encoding_errors=strict + with tm.ensure_clean() as path: + with pytest.raises(UnicodeEncodeError): + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="strict", + index=False, + ) + + # case 2: encoding_errors=replace + with tm.ensure_clean() as path: + expected_sjis = """\ +int,str_sjis +1,abc +2,? +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="replace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_sjis + + # case 3: encoding_errors=ignore + with tm.ensure_clean() as path: + expected_sjis = """\ +int,str_sjis +1,abc +2, +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="ignore", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_sjis + + # case 4: encoding_errors=xmlcharrefreplace + with tm.ensure_clean() as path: + expected_sjis = """\ +int,str_sjis +1,abc +2,܊ +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="xmlcharrefreplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_sjis + + # case 5: encoding_errors=backslashreplace + with tm.ensure_clean() as path: + expected_sjis = """\ +int,str_sjis +1,abc +2,\\u070a +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="backslashreplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_sjis + + # case 6: encoding_errors=namereplace + with tm.ensure_clean() as path: + expected_sjis = 
"""\ +int,str_sjis +1,abc +2,\\N{SYRIAC CONTRACTION} +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="namereplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_sjis + + # case 7: encoding_errors=surrogatepass + with tm.ensure_clean() as path: + with pytest.raises(UnicodeEncodeError): + df.to_csv( + path, + line_terminator="\n", + encoding="sjis", + encoding_errors="surrogatepass", + index=False, + ) + + def test_to_csv_file_object_with_cp932(self): + # https://github.com/pandas-dev/pandas/issues/27750 + data = {"int": [1, 2, 3], "str_cp932": ["abc", "\u070a", "def"]} + df = pd.DataFrame(data) + # case 1: encoding_errors=strict + with tm.ensure_clean() as path: + with pytest.raises(UnicodeEncodeError): + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="strict", + index=False, + ) + + # case 2: encoding_errors=replace + with tm.ensure_clean() as path: + expected_cp932 = """\ +int,str_cp932 +1,abc +2,? 
+3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="replace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_cp932 + + # case 3: encoding_errors=ignore + with tm.ensure_clean() as path: + expected_cp932 = """\ +int,str_cp932 +1,abc +2, +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="ignore", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_cp932 + + # case 4: encoding_errors=xmlcharrefreplace + with tm.ensure_clean() as path: + expected_cp932 = """\ +int,str_cp932 +1,abc +2,܊ +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="xmlcharrefreplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_cp932 + + # case 5: encoding_errors=backslashreplace + with tm.ensure_clean() as path: + expected_cp932 = """\ +int,str_cp932 +1,abc +2,\\u070a +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="backslashreplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_cp932 + + # case 6: encoding_errors=namereplace + with tm.ensure_clean() as path: + expected_cp932 = """\ +int,str_cp932 +1,abc +2,\\N{SYRIAC CONTRACTION} +3,def +""" + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="namereplace", + index=False, + ) + with open(path, "r") as f: + assert f.read() == expected_cp932 + + # case 7: encoding_errors=surrogatepass + with tm.ensure_clean() as path: + with pytest.raises(UnicodeEncodeError): + df.to_csv( + path, + line_terminator="\n", + encoding="cp932", + encoding_errors="surrogatepass", + index=False, + )