Skip to content

Commit fc19062

Browse files
committed
Add errors option in pandas.DataFrame.to_csv
errors : str, default 'strict' — Response when the input string can’t be converted according to the encoding’s rules.
1 parent 5de4e55 commit fc19062

File tree

4 files changed

+109
-3
lines changed

4 files changed

+109
-3
lines changed

pandas/core/generic.py

+8
Original file line numberDiff line numberDiff line change
@@ -3069,6 +3069,7 @@ def to_csv(
30693069
doublequote=True,
30703070
escapechar=None,
30713071
decimal=".",
3072+
errors="strict",
30723073
):
30733074
r"""
30743075
Write object to a comma-separated values (csv) file.
@@ -3151,6 +3152,12 @@ def to_csv(
31513152
decimal : str, default '.'
31523153
Character recognized as decimal separator. E.g. use ',' for
31533154
European data.
3155+
errors : str, default 'strict'
3156+
Response when the input string can’t be converted according to
3157+
the encoding’s rules.
3158+
strict: Raise UnicodeError
3159+
ignore: Ignore the malformed data and continue without further notice
3160+
replace: Replace with a suitable replacement marker
31543161
31553162
Returns
31563163
-------
@@ -3197,6 +3204,7 @@ def to_csv(
31973204
doublequote=doublequote,
31983205
escapechar=escapechar,
31993206
decimal=decimal,
3207+
errors=errors,
32003208
)
32013209
formatter.save()
32023210

pandas/io/common.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,13 @@ def _infer_compression(
310310

311311

312312
def _get_handle(
313-
path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True
313+
path_or_buf,
314+
mode,
315+
encoding=None,
316+
compression=None,
317+
memory_map=False,
318+
is_text=True,
319+
errors="strict",
314320
):
315321
"""
316322
Get file handle for given path/buffer and mode.
@@ -331,6 +337,12 @@ def _get_handle(
331337
is_text : boolean, default True
332338
whether file/buffer is in text format (csv, json, etc.), or in binary
333339
mode (pickle, etc.)
340+
errors : str, default 'strict'
341+
Response when the input string can’t be converted according to
342+
the encoding’s rules.
343+
strict: Raise UnicodeError
344+
ignore: Ignore the malformed data and continue without further notice
345+
replace: Replace with a suitable replacement marker
334346
335347
Returns
336348
-------
@@ -407,10 +419,10 @@ def _get_handle(
407419
elif is_path:
408420
if encoding:
409421
# Encoding
410-
f = open(path_or_buf, mode, encoding=encoding, newline="")
422+
f = open(path_or_buf, mode, errors=errors, encoding=encoding, newline="")
411423
elif is_text:
412424
# No explicit encoding
413-
f = open(path_or_buf, mode, errors="replace", newline="")
425+
f = open(path_or_buf, mode, errors=errors, newline="")
414426
else:
415427
# Binary mode
416428
f = open(path_or_buf, mode)

pandas/io/formats/csvs.py

+4
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def __init__(
5151
doublequote=True,
5252
escapechar=None,
5353
decimal=".",
54+
errors="strict",
5455
):
5556

5657
self.obj = obj
@@ -93,6 +94,8 @@ def __init__(
9394

9495
self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex)
9596

97+
self.errors = errors
98+
9699
# validate mi options
97100
if self.has_mi_columns:
98101
if cols is not None:
@@ -179,6 +182,7 @@ def save(self):
179182
self.mode,
180183
encoding=self.encoding,
181184
compression=self.compression,
185+
errors=self.errors,
182186
)
183187
close = True
184188

pandas/tests/io/formats/test_to_csv.py

+82
Original file line numberDiff line numberDiff line change
@@ -514,3 +514,85 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer):
514514
df.to_csv(path, compression=to_compression)
515515
result = pd.read_csv(path, index_col=0, compression=read_compression)
516516
tm.assert_frame_equal(result, df)
517+
518+
def test_to_csv_string_with_sjis(self):
519+
data = {"int": [1, 2, 3], "str_sjis": ["abc", "\u070a", "def"]}
520+
df = pd.DataFrame(data)
521+
# case 1: errors=strict
522+
with tm.ensure_clean("sjis_test.csv") as path:
523+
expected_exception = UnicodeEncodeError
524+
with pytest.raises(expected_exception):
525+
df.to_csv(
526+
path,
527+
line_terminator="\n",
528+
encoding="sjis",
529+
errors="strict",
530+
index=False,
531+
)
532+
533+
# case 2: errors=replace
534+
with tm.ensure_clean("sjis_test.csv") as path:
535+
expected_sjis = b"int,str_sjis\n" b"1,abc\n" b"2,?\n" b"3,def\n"
536+
df.to_csv(
537+
path,
538+
line_terminator="\n",
539+
encoding="sjis",
540+
errors="replace",
541+
index=False,
542+
)
543+
with open(path, "rb") as f:
544+
assert f.read() == expected_sjis
545+
546+
# case 3: errors=ignore
547+
with tm.ensure_clean("sjis_test.csv") as path:
548+
expected_sjis = b"int,str_sjis\n" b"1,abc\n" b"2,\n" b"3,def\n"
549+
df.to_csv(
550+
path,
551+
line_terminator="\n",
552+
encoding="sjis",
553+
errors="ignore",
554+
index=False,
555+
)
556+
with open(path, "rb") as f:
557+
assert f.read() == expected_sjis
558+
559+
def test_to_csv_string_with_cp932(self):
560+
data = {"int": [1, 2, 3], "str_cp932": ["abc", "\u070a", "def"]}
561+
df = pd.DataFrame(data)
562+
# case 1: errors=strict
563+
with tm.ensure_clean("cp932_test.csv") as path:
564+
expected_exception = UnicodeEncodeError
565+
with pytest.raises(expected_exception):
566+
df.to_csv(
567+
path,
568+
line_terminator="\n",
569+
encoding="cp932",
570+
errors="strict",
571+
index=False,
572+
)
573+
574+
# case 2: errors=replace
575+
with tm.ensure_clean("cp932_test.csv") as path:
576+
expected_cp932 = b"int,str_cp932\n" b"1,abc\n" b"2,?\n" b"3,def\n"
577+
df.to_csv(
578+
path,
579+
line_terminator="\n",
580+
encoding="cp932",
581+
errors="replace",
582+
index=False,
583+
)
584+
with open(path, "rb") as f:
585+
assert f.read() == expected_cp932
586+
587+
# case 3: errors=ignore
588+
with tm.ensure_clean("cp932_test.csv") as path:
589+
expected_cp932 = b"int,str_cp932\n" b"1,abc\n" b"2,\n" b"3,def\n"
590+
df.to_csv(
591+
path,
592+
line_terminator="\n",
593+
encoding="cp932",
594+
errors="ignore",
595+
index=False,
596+
)
597+
with open(path, "rb") as f:
598+
assert f.read() == expected_cp932

0 commit comments

Comments
 (0)