From d6148fea83906695f9d84b249c315c6809c098ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Fri, 12 Feb 2021 09:01:37 -0500 Subject: [PATCH 1/2] ENH: 'encoding_errors' argument for read_csv/json --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/parsers.pyx | 53 ++++++++++--------- pandas/_libs/src/parser/io.c | 4 +- pandas/_libs/src/parser/io.h | 2 +- pandas/_libs/src/parser/tokenizer.c | 20 ++++--- pandas/_libs/src/parser/tokenizer.h | 6 +-- pandas/io/common.py | 25 +++++++-- pandas/io/json/_json.py | 15 +++++- pandas/io/parsers/base_parser.py | 2 + pandas/io/parsers/readers.py | 15 ++++++ .../io/parser/common/test_common_basic.py | 19 +++++++ .../io/parser/common/test_read_errors.py | 2 +- pandas/tests/io/test_common.py | 52 +++++++++++++++--- 13 files changed, 162 insertions(+), 54 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 370ea28832758..b240dad32f0e1 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -141,6 +141,7 @@ Other enhancements - Add support for parsing ``ISO 8601``-like timestamps with negative signs to :meth:`pandas.Timedelta` (:issue:`37172`) - Add support for unary operators in :class:`FloatingArray` (:issue:`38749`) - :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`) +- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index c4d98ccb88ba5..031a567925a4d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -20,13 +20,19 @@ from libc.string cimport ( import cython from cython import Py_ssize_t -from cpython.bytes cimport PyBytes_AsString +from cpython.bytes cimport ( + PyBytes_AsString, + PyBytes_FromString, +) from cpython.exc cimport ( PyErr_Fetch, PyErr_Occurred, ) from cpython.object cimport PyObject -from cpython.ref cimport Py_XDECREF +from cpython.ref cimport ( + Py_INCREF, + Py_XDECREF, +) from cpython.unicode cimport ( PyUnicode_AsUTF8String, PyUnicode_Decode, @@ -143,7 +149,7 @@ cdef extern from "parser/tokenizer.h": enum: ERROR_OVERFLOW ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, - int *status) + int *status, const char *encoding_errors) ctypedef int (*io_cleanup)(void *src) ctypedef struct parser_t: @@ -255,8 +261,8 @@ cdef extern from "parser/tokenizer.h": int parser_trim_buffers(parser_t *self) - int tokenize_all_rows(parser_t *self) nogil - int tokenize_nrows(parser_t *self, size_t nrows) nogil + int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil + int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil int64_t str_to_int64(char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) nogil @@ -293,7 +299,7 @@ cdef extern from "parser/io.h": size_t *bytes_read, int *status) void* buffer_rd_bytes(void *source, size_t nbytes, - size_t *bytes_read, int *status) + size_t *bytes_read, int *status, const char *encoding_errors) cdef class TextReader: @@ -316,6 +322,7 @@ cdef class TextReader: uint64_t parser_start list clocks char *c_encoding + const char *encoding_errors kh_str_starts_t *false_set kh_str_starts_t *true_set @@ -370,10 +377,15 @@ cdef class TextReader: bint verbose=False, bint mangle_dupe_cols=True, float_precision=None, - bint 
skip_blank_lines=True): + bint skip_blank_lines=True, + encoding_errors=b"strict"): # set encoding for native Python and C library self.c_encoding = NULL + if isinstance(encoding_errors, str): + encoding_errors = encoding_errors.encode("utf-8") + Py_INCREF(encoding_errors) + self.encoding_errors = PyBytes_AsString(encoding_errors) self.parser = parser_new() self.parser.chunksize = tokenize_chunksize @@ -558,13 +570,7 @@ cdef class TextReader: pass def __dealloc__(self): - parser_free(self.parser) - if self.true_set: - kh_destroy_str_starts(self.true_set) - self.true_set = NULL - if self.false_set: - kh_destroy_str_starts(self.false_set) - self.false_set = NULL + self.close() parser_del(self.parser) def close(self): @@ -632,7 +638,6 @@ cdef class TextReader: char *word object name, old_name uint64_t hr, data_line = 0 - char *errors = "strict" StringPath path = _string_path(self.c_encoding) list header = [] set unnamed_cols = set() @@ -673,11 +678,8 @@ cdef class TextReader: for i in range(field_count): word = self.parser.words[start + i] - if path == UTF8: - name = PyUnicode_FromString(word) - elif path == ENCODED: - name = PyUnicode_Decode(word, strlen(word), - self.c_encoding, errors) + name = PyUnicode_Decode(word, strlen(word), + self.c_encoding, self.encoding_errors) # We use this later when collecting placeholder names. old_name = name @@ -831,7 +833,7 @@ cdef class TextReader: int status with nogil: - status = tokenize_nrows(self.parser, nrows) + status = tokenize_nrows(self.parser, nrows, self.encoding_errors) if self.parser.warn_msg != NULL: print(self.parser.warn_msg, file=sys.stderr) @@ -859,7 +861,7 @@ cdef class TextReader: 'the whole file') else: with nogil: - status = tokenize_all_rows(self.parser) + status = tokenize_all_rows(self.parser, self.encoding_errors) if self.parser.warn_msg != NULL: print(self.parser.warn_msg, file=sys.stderr) @@ -1201,7 +1203,7 @@ cdef class TextReader: if path == UTF8: return _string_box_utf8(self.parser, i, start, end, na_filter, - na_hashset) + na_hashset, self.encoding_errors) elif path == ENCODED: return _string_box_decode(self.parser, i, start, end, na_filter, na_hashset, self.c_encoding) @@ -1352,7 +1354,8 @@ cdef inline StringPath _string_path(char *encoding): cdef _string_box_utf8(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_starts_t *na_hashset): + bint na_filter, kh_str_starts_t *na_hashset, + const char *encoding_errors): cdef: int error, na_count = 0 Py_ssize_t i, lines @@ -1391,7 +1394,7 @@ cdef _string_box_utf8(parser_t *parser, int64_t col, pyval = table.vals[k] else: # box it. new ref? 
- pyval = PyUnicode_FromString(word) + pyval = PyUnicode_Decode(word, strlen(word), "utf-8", encoding_errors) k = kh_put_strbox(table, word, &ret) table.vals[k] = pyval diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c index 51504527de5a2..449f0b55bff70 100644 --- a/pandas/_libs/src/parser/io.c +++ b/pandas/_libs/src/parser/io.c @@ -163,7 +163,7 @@ void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, } void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, - int *status) { + int *status, const char *encoding_errors) { PyGILState_STATE state; PyObject *result, *func, *args, *tmp; @@ -191,7 +191,7 @@ void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, *status = CALLING_READ_FAILED; return NULL; } else if (!PyBytes_Check(result)) { - tmp = PyUnicode_AsUTF8String(result); + tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors); Py_DECREF(result); if (tmp == NULL) { PyGILState_Release(state); diff --git a/pandas/_libs/src/parser/io.h b/pandas/_libs/src/parser/io.h index aac418457d3b6..dbe757b458c54 100644 --- a/pandas/_libs/src/parser/io.h +++ b/pandas/_libs/src/parser/io.h @@ -64,6 +64,6 @@ void *buffer_file_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status); void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, - int *status); + int *status, const char *encoding_errors); #endif // PANDAS__LIBS_SRC_PARSER_IO_H_ diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 1b229171ea879..49eb1e7855098 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -553,13 +553,15 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { return 0; } -static int parser_buffer_bytes(parser_t *self, size_t nbytes) { +static int parser_buffer_bytes(parser_t *self, size_t nbytes, + const char *encoding_errors) { int status; size_t bytes_read; status = 0; self->datapos = 0; - self->data = self->cb_io(self->source, nbytes, &bytes_read, &status); + self->data = self->cb_io(self->source, nbytes, &bytes_read, &status, + encoding_errors); TRACE(( "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", nbytes, bytes_read, status)); @@ -1334,7 +1336,8 @@ int parser_trim_buffers(parser_t *self) { all : tokenize all the data vs. 
certain number of rows
 */
-int _tokenize_helper(parser_t *self, size_t nrows, int all) {
+int _tokenize_helper(parser_t *self, size_t nrows, int all,
+                     const char *encoding_errors) {
     int status = 0;
     uint64_t start_lines = self->lines;
 
@@ -1350,7 +1353,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
         if (!all && self->lines - start_lines >= nrows) break;
 
         if (self->datapos == self->datalen) {
-            status = parser_buffer_bytes(self, self->chunksize);
+            status = parser_buffer_bytes(self, self->chunksize,
+                                         encoding_errors);
 
             if (status == REACHED_EOF) {
                 // close out last line
@@ -1383,13 +1387,13 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
     return status;
 }
 
-int tokenize_nrows(parser_t *self, size_t nrows) {
-    int status = _tokenize_helper(self, nrows, 0);
+int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) {
+    int status = _tokenize_helper(self, nrows, 0, encoding_errors);
     return status;
 }
 
-int tokenize_all_rows(parser_t *self) {
-    int status = _tokenize_helper(self, -1, 1);
+int tokenize_all_rows(parser_t *self, const char *encoding_errors) {
+    int status = _tokenize_helper(self, -1, 1, encoding_errors);
     return status;
 }
 
diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h
index 876e2267906ee..f69fee4993d34 100644
--- a/pandas/_libs/src/parser/tokenizer.h
+++ b/pandas/_libs/src/parser/tokenizer.h
@@ -85,7 +85,7 @@ typedef enum {
 } QuoteStyle;
 
 typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
-                             int *status);
+                             int *status, const char *encoding_errors);
 typedef int (*io_cleanup)(void *src);
 
 typedef struct parser_t {
@@ -196,9 +196,9 @@ void parser_del(parser_t *self);
 
 void parser_set_default_options(parser_t *self);
 
-int tokenize_nrows(parser_t *self, size_t nrows);
+int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors);
 
-int tokenize_all_rows(parser_t *self);
+int tokenize_all_rows(parser_t *self, const char *encoding_errors);
 
 // Have parsed / type-converted a chunk of data
 // and want to free memory from the token stream
diff --git a/pandas/io/common.py b/pandas/io/common.py
index cf3b92ec93b1f..b87e8fcae1064 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -583,12 +583,32 @@ def get_handle(
     Returns the dataclass IOHandles
     """
     # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
-    encoding_passed, encoding = encoding, encoding or "utf-8"
+    encoding = encoding or "utf-8"
 
     # read_csv does not know whether the buffer is opened in binary/text mode
     if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
         mode += "b"
 
+    # validate errors
+    if isinstance(errors, str):
+        errors = errors.lower()
+    if errors not in (
+        None,
+        "strict",
+        "ignore",
+        "replace",
+        "xmlcharrefreplace",
+        "backslashreplace",
+        "namereplace",
+        "surrogateescape",
+        "surrogatepass",
+    ):
+        raise ValueError(
+            f"Invalid value for `encoding_errors` ({errors}). Please see "
+            + "https://docs.python.org/3/library/codecs.html#error-handlers "
+            + "for valid values."
+        )
+
     # open URLs
     ioargs = _get_filepath_or_buffer(
         path_or_buf,
@@ -677,9 +697,6 @@ def get_handle(
         # Check whether the filename is to be opened in binary mode.
         # Binary mode does not support 'encoding' and 'newline'.
        if ioargs.encoding and "b" not in ioargs.mode:
-            if errors is None and encoding_passed is None:
-                # ignore errors when no encoding is specified
-                errors = "replace"
             # Encoding
             handle = open(
                 handle,
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index f050a6a086584..aa654e971641f 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -334,6 +334,7 @@ def read_json(
     precise_float: bool = False,
     date_unit=None,
     encoding=None,
+    encoding_errors: Optional[str] = "strict",
     lines: bool = False,
     chunksize: Optional[int] = None,
     compression: CompressionOptions = "infer",
@@ -456,6 +457,12 @@ def read_json(
     encoding : str, default is 'utf-8'
         The encoding to use to decode py3 bytes.
 
+    encoding_errors : str, optional, default "strict"
+        How encoding errors are treated. `List of possible values
+        <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
+
+        .. versionadded:: 1.3
+
     lines : bool, default False
         Read the file as a json object per line.
 
@@ -584,6 +591,7 @@ def read_json(
         compression=compression,
         nrows=nrows,
         storage_options=storage_options,
+        encoding_errors=encoding_errors,
     )
 
     if chunksize:
@@ -620,6 +628,7 @@ def __init__(
         compression: CompressionOptions,
         nrows: Optional[int],
         storage_options: StorageOptions = None,
+        encoding_errors: Optional[str] = "strict",
     ):
 
         self.orient = orient
@@ -638,6 +647,7 @@ def __init__(
         self.chunksize = chunksize
         self.nrows_seen = 0
         self.nrows = nrows
+        self.encoding_errors = encoding_errors
         self.handles: Optional[IOHandles] = None
 
         if self.chunksize is not None:
@@ -661,8 +671,8 @@ def _preprocess_data(self, data):
         Otherwise, we read it into memory for the `read` method.
         """
         if hasattr(data, "read") and not (self.chunksize or self.nrows):
-            data = data.read()
-            self.close()
+            with self:
+                data = data.read()
         if not hasattr(data, "read") and (self.chunksize or self.nrows):
             data = StringIO(data)
 
@@ -692,6 +702,7 @@ def _get_data_from_filepath(self, filepath_or_buffer):
                 encoding=self.encoding,
                 compression=self.compression,
                 storage_options=self.storage_options,
+                errors=self.encoding_errors,
             )
             filepath_or_buffer = self.handles.handle
 
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 2d17978b60327..c05efe9e73c5a 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -109,6 +109,7 @@
     "mangle_dupe_cols": True,
     "infer_datetime_format": False,
     "skip_blank_lines": True,
+    "encoding_errors": "strict",
 }
 
 
@@ -212,6 +213,7 @@ def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
             compression=kwds.get("compression", None),
             memory_map=kwds.get("memory_map", False),
             storage_options=kwds.get("storage_options", None),
+            errors=kwds.get("encoding_errors", "strict"),
         )
 
     def _validate_parse_dates_presence(self, columns: List[str]) -> None:
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index edfc7ee0b6258..6adf1b20b769f 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -296,11 +296,24 @@
     Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
     standard encodings
     <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
+
     .. versionchanged:: 1.2
 
        When ``encoding`` is ``None``, ``errors="replace"`` is passed to
        ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
        This behavior was previously only the case for ``engine="python"``.
+
+    .. versionchanged:: 1.3
+
+       ``encoding_errors`` is a new argument. ``encoding`` no longer has an
+       influence on how encoding errors are handled.
+
+encoding_errors : str, optional, default "strict"
+    How encoding errors are treated. `List of possible values
+    <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
+
+    .. versionadded:: 1.3
+
 dialect : str or csv.Dialect, optional
     If provided, this parameter will override values (default or not) for the
     following parameters: `delimiter`, `doublequote`, `escapechar`,
@@ -515,6 +528,7 @@ def read_csv(
     escapechar=None,
     comment=None,
     encoding=None,
+    encoding_errors: Optional[str] = "strict",
     dialect=None,
     # Error Handling
     error_bad_lines=True,
@@ -599,6 +613,7 @@ def read_table(
     # Error Handling
     error_bad_lines=True,
     warn_bad_lines=True,
+    encoding_errors: Optional[str] = "strict",
     # Internal
     delim_whitespace=False,
     low_memory=_c_parser_defaults["low_memory"],
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index 6730d8cc46603..7cf9470e3057d 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -6,6 +6,7 @@
 from inspect import signature
 from io import StringIO
 import os
+from pathlib import Path
 
 import numpy as np
 import pytest
@@ -734,3 +735,21 @@ def test_dict_keys_as_names(all_parsers):
     result = parser.read_csv(StringIO(data), names=keys)
     expected = DataFrame({"a": [1], "b": [2]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_encoding_surrogatepass(all_parsers):
+    # GH39017
+    parser = all_parsers
+    content = b"\xed\xbd\xbf"
+    decoded = content.decode("utf-8", errors="surrogatepass")
+    expected = DataFrame({decoded: [decoded]}, index=[decoded * 2])
+    expected.index.name = decoded * 2
+
+    with tm.ensure_clean() as path:
+        Path(path).write_bytes(
+            content * 2 + b"," + content + b"\n" + content * 2 + b"," + content
+        )
+        df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0)
+        tm.assert_frame_equal(df, expected)
+        with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"):
+            parser.read_csv(path)
diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index 91b5e26efafa1..4e3d99af685ec 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -232,5 +232,5 @@ def test_open_file(all_parsers):
     warnings.simplefilter("always", category=ResourceWarning)
     with warnings.catch_warnings(record=True) as record:
         with pytest.raises(csv.Error, match="Could not determine delimiter"):
-            parser.read_csv(file, sep=None)
+            parser.read_csv(file, sep=None, encoding_errors="replace")
     assert len(record) == 0, record[0].message
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index e1dcec56913f9..bacec6969b940 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -2,6 +2,7 @@
 Tests for the pandas.io.common functionalities
 """
 import codecs
+from functools import partial
 from io import (
     BytesIO,
     StringIO,
@@ -429,14 +430,6 @@ def test_is_fsspec_url():
     assert not icom.is_fsspec_url("relative/local/path")
 
 
-def test_default_errors():
-    # GH 38989
-    with tm.ensure_clean() as path:
-        file = Path(path)
-        file.write_bytes(b"\xe4\na\n1")
-        tm.assert_frame_equal(pd.read_csv(file, skiprows=[0]), pd.DataFrame({"a": [1]}))
-
-
 @pytest.mark.parametrize("encoding", [None, "utf-8"])
 @pytest.mark.parametrize("format", ["csv", "json"])
 def test_codecs_encoding(encoding, format):
@@ -481,3 +474,46 @@ def test_explicit_encoding(io_class, mode, msg):
     with io_class() as buffer:
         with pytest.raises(TypeError, match=msg):
             expected.to_csv(buffer, mode=f"w{mode}")
+
+
+@pytest.mark.parametrize("encoding_errors", [None, "strict", "replace"])
+@pytest.mark.parametrize("format", ["csv", "json"])
+def test_encoding_errors(encoding_errors, format):
+    # GH39450
+    msg = "'utf-8' codec can't decode byte"
+    bad_encoding = b"\xe4"
+
+    if format == "csv":
+        content = bad_encoding + b"\n" + bad_encoding
+        reader = pd.read_csv
+    else:
+        content = (
+            b'{"'
+            + bad_encoding * 2
+            + b'": {"'
+            + bad_encoding
+            + b'":"'
+            + bad_encoding
+            + b'"}}'
+        )
+        reader = partial(pd.read_json, orient="index")
+    with tm.ensure_clean() as path:
+        file = Path(path)
+        file.write_bytes(content)
+
+        if encoding_errors != "replace":
+            with pytest.raises(UnicodeDecodeError, match=msg):
+                reader(path, encoding_errors=encoding_errors)
+        else:
+            df = reader(path, encoding_errors=encoding_errors)
+            decoded = bad_encoding.decode(errors=encoding_errors)
+            expected = pd.DataFrame({decoded: [decoded]}, index=[decoded * 2])
+            tm.assert_frame_equal(df, expected)
+
+
+def test_bad_encdoing_errors():
+    # GH 39777
+    with tm.ensure_clean() as path:
+        with pytest.raises(ValueError, match="Invalide value for `encoding_errors`"):
+            icom.get_handle(path, "w", errors="bad")

From 029c61c80dbc06ede6d79bd0804104531f5f3996 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?=
Date: Mon, 8 Mar 2021 20:56:46 -0500
Subject: [PATCH 2/2] fix typo in test

---
 pandas/tests/io/test_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index bacec6969b940..e483b5b5a8abb 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -515,5 +515,5 @@ def test_encoding_errors(encoding_errors, format):
 def test_bad_encdoing_errors():
     # GH 39777
     with tm.ensure_clean() as path:
-        with pytest.raises(ValueError, match="Invalide value for `encoding_errors`"):
+        with pytest.raises(ValueError, match="Invalid value for `encoding_errors`"):
             icom.get_handle(path, "w", errors="bad")
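
Usage sketch (not part of the patches): assuming both patches are applied, the
snippet below exercises the new ``encoding_errors`` argument of ``read_csv``.
The temporary file and its byte payload are illustrative; the handler names
come from the whitelist added to ``get_handle`` above.

    from pathlib import Path
    import tempfile

    import pandas as pd

    with tempfile.TemporaryDirectory() as tmp:
        path = Path(tmp) / "bad.csv"
        # 0xe4 is not valid UTF-8, so the default encoding_errors="strict"
        # raises UnicodeDecodeError when decoding this file.
        path.write_bytes(b"a\n\xe4")

        try:
            pd.read_csv(path)  # encoding_errors="strict" is the default
        except UnicodeDecodeError as err:
            print("strict:", err)

        # "replace" substitutes U+FFFD for the invalid byte ...
        print(pd.read_csv(path, encoding_errors="replace"))
        # ... while "backslashreplace" keeps it as an escape sequence.
        print(pd.read_csv(path, encoding_errors="backslashreplace"))

        # An unknown handler name raises ValueError from get_handle (GH 39777).

Note the design choice: the handler name is validated once, centrally, in
``get_handle``, so ``read_csv`` (both engines) and ``read_json`` share the
same set of accepted values and raise the same ``ValueError`` for anything
else.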