Skip to content

STYLE: specify encodings when opening files #52999

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
May 7, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
5b482fe
Changes Confirmed
KetuPatel806 Apr 28, 2023
3302c87
Merge branch 'pandas-dev:main' into main
KetuPatel806 Apr 29, 2023
c20ac6d
Encoding Completed
KetuPatel806 Apr 29, 2023
682fdf8
Spaces Are Completed
KetuPatel806 Apr 29, 2023
a5fbd47
Pre-commit manually completed
KetuPatel806 Apr 29, 2023
e528292
b.pre-commit removed
KetuPatel806 Apr 29, 2023
3d72dc3
Final Commit
KetuPatel806 Apr 29, 2023
8890943
Some changes reflected
KetuPatel806 Apr 29, 2023
1647697
Merge branch 'pandas-dev:main' into main
KetuPatel806 Apr 30, 2023
dcbe4dc
test_xml Updated
KetuPatel806 Apr 30, 2023
8d4a14b
Merge branch 'main' of https://github.com/KetuPatel806/pandasContribu…
KetuPatel806 Apr 30, 2023
86b89ea
Merge branch 'main' into main
KetuPatel806 Apr 30, 2023
b4d107b
Pre-commit check passed
KetuPatel806 Apr 30, 2023
56fef4d
Merge branch 'main' of https://github.com/KetuPatel806/pandasContribu…
KetuPatel806 Apr 30, 2023
959dadc
Mode changed in xml file
KetuPatel806 Apr 30, 2023
7aebe05
mode reverted
KetuPatel806 Apr 30, 2023
ba15d83
Merge branch 'pandas-dev:main' into main
KetuPatel806 May 1, 2023
3e9ba30
Merge branch 'pandas-dev:main' into main
KetuPatel806 May 2, 2023
db8a984
Merge branch 'pandas-dev:main' into main
KetuPatel806 May 5, 2023
c5fb9d6
Try to fix errors
KetuPatel806 May 5, 2023
743c63b
Merge branch 'main' of https://github.com/KetuPatel806/pandasContribu…
KetuPatel806 May 5, 2023
d250a1d
error-checks
KetuPatel806 May 5, 2023
ea5c921
Merge branch 'pandas-dev:main' into main
KetuPatel806 May 6, 2023
640e8e0
Fix Some errors
KetuPatel806 May 6, 2023
27ff4bb
Merge branch 'pandas-dev:main' into main
KetuPatel806 May 6, 2023
98c004c
Unspecified-encodingFixed
KetuPatel806 May 6, 2023
c18c670
Merge branch 'main' of https://github.com/KetuPatel806/pandasContribu…
KetuPatel806 May 6, 2023
d3f045d
Merge branch 'main' into main
KetuPatel806 May 6, 2023
513aebd
final committed
KetuPatel806 May 7, 2023
19a1eac
Merge branch 'main' of https://github.com/KetuPatel806/pandasContribu…
KetuPatel806 May 7, 2023
9128769
simplify
May 7, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,6 @@ repos:
hooks:
- id: pylint
stages: [manual]
- repo: https://github.com/pycqa/pylint
rev: v2.16.2
hooks:
- id: pylint
alias: redefined-outer-name
name: Redefining name from outer scope
Expand All @@ -99,6 +96,11 @@ repos:
|^pandas/conftest\.py # keep excluded
args: [--disable=all, --enable=redefined-outer-name]
stages: [manual]
- id: pylint
alias: unspecified-encoding
name: Using open without explicitly specifying an encoding
args: [--disable=all, --enable=unspecified-encoding]
stages: [manual]
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
hooks:
Expand Down
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ class ReadCSVMemoryGrowth(BaseIO):
param_names = ["engine"]

def setup(self, engine):
with open(self.fname, "w") as f:
with open(self.fname, "w", encoding="utf-8") as f:
for i in range(self.num_rows):
f.write(f"{i}\n")

Expand Down
8 changes: 4 additions & 4 deletions doc/make.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,12 @@ def _get_page_title(self, page):
components=(docutils.parsers.rst.Parser,)
)
doc = docutils.utils.new_document("<doc>", option_parser.get_default_values())
with open(fname) as f:
with open(fname, encoding="utf-8") as f:
data = f.read()

parser = docutils.parsers.rst.Parser()
# do not generate any warning when parsing the rst
with open(os.devnull, "a") as f:
with open(os.devnull, "a", encoding="utf-8") as f:
doc.reporter.stream = f
parser.parse(data, doc)

Expand All @@ -186,7 +186,7 @@ def _add_redirects(self):
Create in the build directory an html file with a redirect,
for every row in REDIRECTS_FILE.
"""
with open(REDIRECTS_FILE) as mapping_fd:
with open(REDIRECTS_FILE, encoding="utf-8") as mapping_fd:
reader = csv.reader(mapping_fd)
for row in reader:
if not row or row[0].strip().startswith("#"):
Expand All @@ -209,7 +209,7 @@ def _add_redirects(self):
# sphinx specific stuff
title = "this page"

with open(path, "w") as moved_page_fd:
with open(path, "w", encoding="utf-8") as moved_page_fd:
html = f"""\
<html>
<head>
Expand Down
4 changes: 2 additions & 2 deletions doc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,9 @@
elif single_doc and rel_fname != pattern:
exclude_patterns.append(rel_fname)

with open(os.path.join(source_path, "index.rst.template")) as f:
with open(os.path.join(source_path, "index.rst.template"), encoding="utf-8") as f:
t = jinja2.Template(f.read())
with open(os.path.join(source_path, "index.rst"), "w") as f:
with open(os.path.join(source_path, "index.rst"), "w", encoding="utf-8") as f:
f.write(
t.render(
include_api=include_api,
Expand Down
6 changes: 5 additions & 1 deletion pandas/_testing/contexts.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,11 @@ def ensure_clean(
handle_or_str: str | IO = str(path)
if return_filelike:
kwargs.setdefault("mode", "w+b")
handle_or_str = open(path, **kwargs)
handle_or_str = open(
path,
encoding=kwargs.get("encoding", None),
**{key: value for key, value in kwargs.items() if key != "encoding"},
)

try:
yield handle_or_str
Expand Down
2 changes: 1 addition & 1 deletion pandas/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def git_get_keywords(versionfile_abs):
# _version.py.
keywords = {}
try:
with open(versionfile_abs) as fobj:
with open(versionfile_abs, encoding="utf-8") as fobj:
for line in fobj:
if line.strip().startswith("git_refnames ="):
mo = re.search(r'=\s*"(.*)"', line)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1699,7 +1699,7 @@ def to_string(
if hasattr(buf, "write"):
buf.write(result)
else:
with open(buf, "w") as f:
with open(buf, "w", encoding="utf-8") as f:
f.write(result)
return None

Expand Down
4 changes: 2 additions & 2 deletions pandas/io/clipboard/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,11 +282,11 @@ def copy_dev_clipboard(text):
stacklevel=find_stack_level(),
)

with open("/dev/clipboard", "w") as fd:
with open("/dev/clipboard", "w", encoding="utf-8") as fd:
fd.write(text)

def paste_dev_clipboard() -> str:
with open("/dev/clipboard") as fd:
with open("/dev/clipboard", encoding="utf-8") as fd:
content = fd.read()
return content

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,7 +626,7 @@ def test_to_csv_float32_nanrep(self):
with tm.ensure_clean("__tmp_to_csv_float32_nanrep__.csv") as path:
df.to_csv(path, na_rep=999)

with open(path) as f:
with open(path, encoding="utf-8") as f:
lines = f.readlines()
assert lines[1].split(",")[2] == "999"

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1702,7 +1702,7 @@ def test_corrupt_files_closed(self, engine, read_ext):
errors = (BadZipFile, xlrd.biffh.XLRDError)

with tm.ensure_clean(f"corrupt{read_ext}") as file:
Path(file).write_text("corrupt")
Path(file).write_text("corrupt", encoding="utf-8")
with tm.assert_produces_warning(False):
try:
pd.ExcelFile(file, engine=engine)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/formats/style/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def tpl_table():
def test_html_template_extends_options():
# make sure if templates are edited tests are updated as are setup fixtures
# to understand the dependency
with open("pandas/io/formats/templates/html.tpl") as file:
with open("pandas/io/formats/templates/html.tpl", encoding="utf-8") as file:
result = file.read()
assert "{% include html_style_tpl %}" in result
assert "{% include html_table_tpl %}" in result
Expand Down
24 changes: 12 additions & 12 deletions pandas/tests/io/formats/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_to_csv_with_single_column(self):
"""
with tm.ensure_clean("test.csv") as path:
df1.to_csv(path, header=None, index=None)
with open(path) as f:
with open(path, encoding="utf-8") as f:
assert f.read() == expected1

df2 = DataFrame([1, None])
Expand All @@ -42,7 +42,7 @@ def test_to_csv_with_single_column(self):
"""
with tm.ensure_clean("test.csv") as path:
df2.to_csv(path, header=None, index=None)
with open(path) as f:
with open(path, encoding="utf-8") as f:
assert f.read() == expected2

def test_to_csv_default_encoding(self):
Expand All @@ -64,7 +64,7 @@ def test_to_csv_quotechar(self):

with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1) # 1=QUOTE_ALL
with open(path) as f:
with open(path, encoding="utf-8") as f:
assert f.read() == expected

expected = """\
Expand All @@ -75,7 +75,7 @@ def test_to_csv_quotechar(self):

with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1, quotechar="$")
with open(path) as f:
with open(path, encoding="utf-8") as f:
assert f.read() == expected

with tm.ensure_clean("test.csv") as path:
Expand All @@ -92,7 +92,7 @@ def test_to_csv_doublequote(self):

with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL
with open(path) as f:
with open(path, encoding="utf-8") as f:
assert f.read() == expected

with tm.ensure_clean("test.csv") as path:
Expand All @@ -109,7 +109,7 @@ def test_to_csv_escapechar(self):

with tm.ensure_clean("test.csv") as path: # QUOTE_ALL
df.to_csv(path, quoting=1, doublequote=False, escapechar="\\")
with open(path) as f:
with open(path, encoding="utf-8") as f:
assert f.read() == expected

df = DataFrame({"col": ["a,a", ",bb,"]})
Expand All @@ -121,7 +121,7 @@ def test_to_csv_escapechar(self):

with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE
with open(path) as f:
with open(path, encoding="utf-8") as f:
assert f.read() == expected

def test_csv_to_string(self):
Expand Down Expand Up @@ -401,7 +401,7 @@ def test_to_csv_string_array_ascii(self):
"""
with tm.ensure_clean("str_test.csv") as path:
df.to_csv(path, encoding="ascii")
with open(path) as f:
with open(path, encoding="utf-8") as f:
assert f.read() == expected_ascii

def test_to_csv_string_array_utf8(self):
Expand All @@ -415,7 +415,7 @@ def test_to_csv_string_array_utf8(self):
"""
with tm.ensure_clean("unicode_test.csv") as path:
df.to_csv(path, encoding="utf-8")
with open(path) as f:
with open(path, encoding="utf-8") as f:
assert f.read() == expected_utf8

def test_to_csv_string_with_lf(self):
Expand Down Expand Up @@ -521,10 +521,10 @@ def test_to_csv_write_to_open_file(self):
z
"""
with tm.ensure_clean("test.txt") as path:
with open(path, "w") as f:
with open(path, "w", encoding="utf-8") as f:
f.write("manual header\n")
df.to_csv(f, header=None, index=None)
with open(path) as f:
with open(path, encoding="utf-8") as f:
assert f.read() == expected

def test_to_csv_write_to_open_file_with_newline_py3(self):
Expand All @@ -534,7 +534,7 @@ def test_to_csv_write_to_open_file_with_newline_py3(self):
expected_rows = ["x", "y", "z"]
expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows)
with tm.ensure_clean("test.txt") as path:
with open(path, "w", newline="") as f:
with open(path, "w", newline="", encoding="utf-8") as f:
f.write("manual header\n")
df.to_csv(f, header=None, index=None)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/formats/test_to_latex.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class TestToLatex:
def test_to_latex_to_file(self, float_frame):
with tm.ensure_clean("test.tex") as path:
float_frame.to_latex(path)
with open(path) as f:
with open(path, encoding="utf-8") as f:
assert float_frame.to_latex() == f.read()

def test_to_latex_to_file_utf8_with_encoding(self):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1214,7 +1214,7 @@ def test_read_s3_jsonl(self, s3_resource, s3so):
def test_read_local_jsonl(self):
# GH17200
with tm.ensure_clean("tmp_items.json") as path:
with open(path, "w") as infile:
with open(path, "w", encoding="utf-8") as infile:
infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
result = read_json(path, lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/parser/common/test_chunksize.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def test_read_csv_memory_growth_chunksize(all_parsers):
parser = all_parsers

with tm.ensure_clean() as path:
with open(path, "w") as f:
with open(path, "w", encoding="utf-8") as f:
for i in range(1000):
f.write(str(i) + "\n")

Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/io/parser/common/test_file_buffer_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_no_permission(all_parsers):

# verify that this process cannot open the file (not running as sudo)
try:
with open(path):
with open(path, encoding="utf-8"):
pass
pytest.skip("Running as sudo.")
except PermissionError:
Expand Down Expand Up @@ -285,7 +285,7 @@ def test_file_handles_with_open(all_parsers, csv1):
parser = all_parsers

for mode in ["r", "rb"]:
with open(csv1, mode) as f:
with open(csv1, mode, encoding="utf-8") as f:
parser.read_csv(f)
assert not f.closed

Expand Down Expand Up @@ -392,7 +392,7 @@ def test_context_manageri_user_provided(all_parsers, datapath):
# make sure that user-provided handles are not closed
parser = all_parsers

with open(datapath("io", "data", "csv", "iris.csv")) as path:
with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
reader = parser.read_csv(path, chunksize=1)
assert not reader.handles.handle.closed
try:
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/parser/common/test_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,10 @@ def test_iteration_open_handle(all_parsers):
kwargs = {"header": None}

with tm.ensure_clean() as path:
with open(path, "w") as f:
with open(path, "w", encoding="utf-8") as f:
f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")

with open(path) as f:
with open(path, encoding="utf-8") as f:
for line in f:
if "CCC" in line:
break
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,7 +603,7 @@ def test_file_handles_mmap(c_parser_only, csv1):
# Don't close user provided file handles.
parser = c_parser_only

with open(csv1) as f:
with open(csv1, encoding="utf-8") as f:
with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
parser.read_csv(m)
assert not m.closed
Expand All @@ -615,7 +615,7 @@ def test_file_binary_mode(c_parser_only):
expected = DataFrame([[1, 2, 3], [4, 5, 6]])

with tm.ensure_clean() as path:
with open(path, "w") as f:
with open(path, "w", encoding="utf-8") as f:
f.write("1,2,3\n4,5,6")

with open(path, "rb") as f:
Expand All @@ -627,7 +627,7 @@ def test_unix_style_breaks(c_parser_only):
# GH 11020
parser = c_parser_only
with tm.ensure_clean() as path:
with open(path, "w", newline="\n") as f:
with open(path, "w", newline="\n", encoding="utf-8") as f:
f.write("blah\n\ncol_1,col_2,col_3\n\n")
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/io/parser/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def test_infer_compression(all_parsers, csv1, buffer, ext):
kwargs["compression"] = "infer"

if buffer:
with open(csv1) as f:
with open(csv1, encoding="utf-8") as f:
result = parser.read_csv(f, **kwargs)
else:
ext = "." + ext if ext else ""
Expand Down Expand Up @@ -183,7 +183,9 @@ def test_ignore_compression_extension(all_parsers):
with tm.ensure_clean("test.csv.zip") as path_zip:
# make sure to create un-compressed file with zip extension
df.to_csv(path_csv, index=False)
Path(path_zip).write_text(Path(path_csv).read_text())
Path(path_zip).write_text(
Path(path_csv).read_text(encoding="utf-8"), encoding="utf-8"
)

tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,15 +411,15 @@ def test_constructor_bad_file(self, mmap_file):
with pytest.raises(err, match=msg):
icom._maybe_memory_map(non_file, True)

with open(mmap_file) as target:
with open(mmap_file, encoding="utf-8") as target:
pass

msg = "I/O operation on closed file"
with pytest.raises(ValueError, match=msg):
icom._maybe_memory_map(target, True)

def test_next(self, mmap_file):
with open(mmap_file) as target:
with open(mmap_file, encoding="utf-8") as target:
lines = target.readlines()

with icom.get_handle(
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,10 +299,10 @@ def test_ambiguous_archive_zip():

def test_ambiguous_archive_tar(tmp_path):
csvAPath = tmp_path / "a.csv"
with open(csvAPath, "w") as a:
with open(csvAPath, "w", encoding="utf-8") as a:
a.write("foo,bar\n")
csvBPath = tmp_path / "b.csv"
with open(csvBPath, "w") as b:
with open(csvBPath, "w", encoding="utf-8") as b:
b.write("foo,bar\n")

tarpath = tmp_path / "archive.tar"
Expand Down
Loading