diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index de1e615beaaa2..f69c9f9606c60 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -83,9 +83,6 @@ repos:
     hooks:
     -   id: pylint
         stages: [manual]
--   repo: https://github.com/pycqa/pylint
-    rev: v2.16.2
-    hooks:
     -   id: pylint
         alias: redefined-outer-name
         name: Redefining name from outer scope
@@ -99,6 +96,11 @@ repos:
             |^pandas/conftest\.py  # keep excluded
         args: [--disable=all, --enable=redefined-outer-name]
         stages: [manual]
+    -   id: pylint
+        alias: unspecified-encoding
+        name: Using open without explicitly specifying an encoding
+        args: [--disable=all, --enable=unspecified-encoding]
+        stages: [manual]
 -   repo: https://github.com/PyCQA/isort
     rev: 5.12.0
     hooks:
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index 36301d22db5d3..07d536d827959 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -444,7 +444,7 @@ class ReadCSVMemoryGrowth(BaseIO):
     param_names = ["engine"]
 
     def setup(self, engine):
-        with open(self.fname, "w") as f:
+        with open(self.fname, "w", encoding="utf-8") as f:
             for i in range(self.num_rows):
                 f.write(f"{i}\n")
diff --git a/doc/make.py b/doc/make.py
index de298ce0475b1..937b2638fb098 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -163,12 +163,12 @@ def _get_page_title(self, page):
             components=(docutils.parsers.rst.Parser,)
         )
         doc = docutils.utils.new_document("", option_parser.get_default_values())
-        with open(fname) as f:
+        with open(fname, encoding="utf-8") as f:
            data = f.read()
 
        parser = docutils.parsers.rst.Parser()
         # do not generate any warning when parsing the rst
-        with open(os.devnull, "a") as f:
+        with open(os.devnull, "a", encoding="utf-8") as f:
             doc.reporter.stream = f
             parser.parse(data, doc)
 
@@ -186,7 +186,7 @@ def _add_redirects(self):
         Create in the build directory an html file with a redirect,
         for every row in REDIRECTS_FILE.
""" - with open(REDIRECTS_FILE) as mapping_fd: + with open(REDIRECTS_FILE, encoding="utf-8") as mapping_fd: reader = csv.reader(mapping_fd) for row in reader: if not row or row[0].strip().startswith("#"): @@ -209,7 +209,7 @@ def _add_redirects(self): # sphinx specific stuff title = "this page" - with open(path, "w") as moved_page_fd: + with open(path, "w", encoding="utf-8") as moved_page_fd: html = f"""\ diff --git a/doc/source/conf.py b/doc/source/conf.py index ac318c5230275..6f7e770e5d554 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -117,9 +117,9 @@ elif single_doc and rel_fname != pattern: exclude_patterns.append(rel_fname) -with open(os.path.join(source_path, "index.rst.template")) as f: +with open(os.path.join(source_path, "index.rst.template"), encoding="utf-8") as f: t = jinja2.Template(f.read()) -with open(os.path.join(source_path, "index.rst"), "w") as f: +with open(os.path.join(source_path, "index.rst"), "w", encoding="utf-8") as f: f.write( t.render( include_api=include_api, diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py index ab00c80886794..ba2c8c219dc41 100644 --- a/pandas/_testing/contexts.py +++ b/pandas/_testing/contexts.py @@ -124,9 +124,10 @@ def ensure_clean( path.touch() handle_or_str: str | IO = str(path) + encoding = kwargs.pop("encoding", None) if return_filelike: kwargs.setdefault("mode", "w+b") - handle_or_str = open(path, **kwargs) + handle_or_str = open(path, encoding=encoding, **kwargs) try: yield handle_or_str diff --git a/pandas/_version.py b/pandas/_version.py index 6705b8505f7e2..8c655648377c7 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -159,7 +159,7 @@ def git_get_keywords(versionfile_abs): # _version.py. keywords = {} try: - with open(versionfile_abs) as fobj: + with open(versionfile_abs, encoding="utf-8") as fobj: for line in fobj: if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) diff --git a/pandas/core/series.py b/pandas/core/series.py index f47b5edc0f243..84fa874831d85 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1699,7 +1699,7 @@ def to_string( if hasattr(buf, "write"): buf.write(result) else: - with open(buf, "w") as f: + with open(buf, "w", encoding="utf-8") as f: f.write(result) return None diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index e574ed2c8059a..c07f51d875d4d 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -282,11 +282,11 @@ def copy_dev_clipboard(text): stacklevel=find_stack_level(), ) - with open("/dev/clipboard", "w") as fd: + with open("/dev/clipboard", "w", encoding="utf-8") as fd: fd.write(text) def paste_dev_clipboard() -> str: - with open("/dev/clipboard") as fd: + with open("/dev/clipboard", encoding="utf-8") as fd: content = fd.read() return content diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 639c6f9d73511..b44b05f9f8153 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -626,7 +626,7 @@ def test_to_csv_float32_nanrep(self): with tm.ensure_clean("__tmp_to_csv_float32_nanrep__.csv") as path: df.to_csv(path, na_rep=999) - with open(path) as f: + with open(path, encoding="utf-8") as f: lines = f.readlines() assert lines[1].split(",")[2] == "999" diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index da0df3954b84a..f6d6433cd0643 100644 --- a/pandas/tests/io/excel/test_readers.py +++ 
@@ -1708,7 +1708,7 @@ def test_corrupt_files_closed(self, engine, read_ext):
             errors = (BadZipFile, xlrd.biffh.XLRDError)
 
         with tm.ensure_clean(f"corrupt{read_ext}") as file:
-            Path(file).write_text("corrupt")
+            Path(file).write_text("corrupt", encoding="utf-8")
             with tm.assert_produces_warning(False):
                 try:
                     pd.ExcelFile(file, engine=engine)
diff --git a/pandas/tests/io/formats/style/test_html.py b/pandas/tests/io/formats/style/test_html.py
index c44f42727faeb..53062c52a29db 100644
--- a/pandas/tests/io/formats/style/test_html.py
+++ b/pandas/tests/io/formats/style/test_html.py
@@ -43,7 +43,7 @@ def tpl_table():
 def test_html_template_extends_options():
     # make sure if templates are edited tests are updated as are setup fixtures
     # to understand the dependency
-    with open("pandas/io/formats/templates/html.tpl") as file:
+    with open("pandas/io/formats/templates/html.tpl", encoding="utf-8") as file:
         result = file.read()
     assert "{% include html_style_tpl %}" in result
     assert "{% include html_table_tpl %}" in result
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index 81dc79d3111b8..a208daaf9f77b 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -32,7 +32,7 @@ def test_to_csv_with_single_column(self):
         """
         with tm.ensure_clean("test.csv") as path:
             df1.to_csv(path, header=None, index=None)
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 assert f.read() == expected1
 
         df2 = DataFrame([1, None])
@@ -42,7 +42,7 @@ def test_to_csv_with_single_column(self):
         """
         with tm.ensure_clean("test.csv") as path:
             df2.to_csv(path, header=None, index=None)
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 assert f.read() == expected2
 
     def test_to_csv_default_encoding(self):
@@ -64,7 +64,7 @@ def test_to_csv_quotechar(self):
 
         with tm.ensure_clean("test.csv") as path:
             df.to_csv(path, quoting=1)  # 1=QUOTE_ALL
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 assert f.read() == expected
 
         expected = """\
@@ -75,7 +75,7 @@ def test_to_csv_quotechar(self):
 
         with tm.ensure_clean("test.csv") as path:
             df.to_csv(path, quoting=1, quotechar="$")
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 assert f.read() == expected
 
         with tm.ensure_clean("test.csv") as path:
@@ -92,7 +92,7 @@ def test_to_csv_doublequote(self):
 
         with tm.ensure_clean("test.csv") as path:
             df.to_csv(path, quoting=1, doublequote=True)  # QUOTE_ALL
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 assert f.read() == expected
 
         with tm.ensure_clean("test.csv") as path:
@@ -109,7 +109,7 @@ def test_to_csv_escapechar(self):
         with tm.ensure_clean("test.csv") as path:
             # QUOTE_ALL
             df.to_csv(path, quoting=1, doublequote=False, escapechar="\\")
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 assert f.read() == expected
 
         df = DataFrame({"col": ["a,a", ",bb,"]})
@@ -121,7 +121,7 @@ def test_to_csv_escapechar(self):
 
         with tm.ensure_clean("test.csv") as path:
             df.to_csv(path, quoting=3, escapechar="\\")  # QUOTE_NONE
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 assert f.read() == expected
 
     def test_csv_to_string(self):
@@ -401,7 +401,7 @@ def test_to_csv_string_array_ascii(self):
         """
         with tm.ensure_clean("str_test.csv") as path:
             df.to_csv(path, encoding="ascii")
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 assert f.read() == expected_ascii
 
     def test_to_csv_string_array_utf8(self):
@@ -415,7 +415,7 @@ def test_to_csv_string_array_utf8(self):
         """
         with tm.ensure_clean("unicode_test.csv") as path:
             df.to_csv(path, encoding="utf-8")
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 assert f.read() == expected_utf8
 
     def test_to_csv_string_with_lf(self):
@@ -521,10 +521,10 @@ def test_to_csv_write_to_open_file(self):
 z
 """
         with tm.ensure_clean("test.txt") as path:
-            with open(path, "w") as f:
+            with open(path, "w", encoding="utf-8") as f:
                 f.write("manual header\n")
                 df.to_csv(f, header=None, index=None)
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 assert f.read() == expected
 
     def test_to_csv_write_to_open_file_with_newline_py3(self):
@@ -534,7 +534,7 @@ def test_to_csv_write_to_open_file_with_newline_py3(self):
         expected_rows = ["x", "y", "z"]
         expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows)
 
         with tm.ensure_clean("test.txt") as path:
-            with open(path, "w", newline="") as f:
+            with open(path, "w", newline="", encoding="utf-8") as f:
                 f.write("manual header\n")
                 df.to_csv(f, header=None, index=None)
diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py
index 927bb8abc5e31..d715daf253cd3 100644
--- a/pandas/tests/io/formats/test_to_latex.py
+++ b/pandas/tests/io/formats/test_to_latex.py
@@ -34,7 +34,7 @@ class TestToLatex:
     def test_to_latex_to_file(self, float_frame):
         with tm.ensure_clean("test.tex") as path:
             float_frame.to_latex(path)
-            with open(path) as f:
+            with open(path, encoding="utf-8") as f:
                 assert float_frame.to_latex() == f.read()
 
     def test_to_latex_to_file_utf8_with_encoding(self):
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 5fc04509b86b6..788a6e97e3d0f 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1214,7 +1214,7 @@ def test_read_s3_jsonl(self, s3_resource, s3so):
     def test_read_local_jsonl(self):
         # GH17200
         with tm.ensure_clean("tmp_items.json") as path:
-            with open(path, "w") as infile:
+            with open(path, "w", encoding="utf-8") as infile:
                 infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
             result = read_json(path, lines=True)
             expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
index c8cef56c73902..6be7269cb8433 100644
--- a/pandas/tests/io/parser/common/test_chunksize.py
+++ b/pandas/tests/io/parser/common/test_chunksize.py
@@ -228,7 +228,7 @@ def test_read_csv_memory_growth_chunksize(all_parsers):
     parser = all_parsers
 
     with tm.ensure_clean() as path:
-        with open(path, "w") as f:
+        with open(path, "w", encoding="utf-8") as f:
             for i in range(1000):
                 f.write(str(i) + "\n")
diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py
index c11a59a8b4660..ba196a532adf6 100644
--- a/pandas/tests/io/parser/common/test_file_buffer_url.py
+++ b/pandas/tests/io/parser/common/test_file_buffer_url.py
@@ -107,7 +107,7 @@ def test_no_permission(all_parsers):
 
         # verify that this process cannot open the file (not running as sudo)
         try:
-            with open(path):
+            with open(path, encoding="utf-8"):
                 pass
             pytest.skip("Running as sudo.")
         except PermissionError:
@@ -285,7 +285,7 @@ def test_file_handles_with_open(all_parsers, csv1):
     parser = all_parsers
 
     for mode in ["r", "rb"]:
-        with open(csv1, mode) as f:
+        with open(csv1, mode, encoding="utf-8" if mode == "r" else None) as f:
             parser.read_csv(f)
             assert not f.closed
 
@@ -392,7 +392,7 @@ def test_context_manageri_user_provided(all_parsers, datapath):
     # make sure that user-provided handles are not closed
     parser = all_parsers
 
-    with open(datapath("io", "data", "csv", "iris.csv")) as path:
+    with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
         reader = parser.read_csv(path, chunksize=1)
         assert not reader.handles.handle.closed
         try:
diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py
index 939ed0e73a5ee..58e5886aedd6b 100644
--- a/pandas/tests/io/parser/common/test_iterator.py
+++ b/pandas/tests/io/parser/common/test_iterator.py
@@ -95,10 +95,10 @@ def test_iteration_open_handle(all_parsers):
     kwargs = {"header": None}
 
     with tm.ensure_clean() as path:
-        with open(path, "w") as f:
+        with open(path, "w", encoding="utf-8") as f:
            f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")
 
-        with open(path) as f:
+        with open(path, encoding="utf-8") as f:
             for line in f:
                 if "CCC" in line:
                     break
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index c2a65704a845a..425f5cfbcf392 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -603,7 +603,7 @@ def test_file_handles_mmap(c_parser_only, csv1):
     # Don't close user provided file handles.
     parser = c_parser_only
 
-    with open(csv1) as f:
+    with open(csv1, encoding="utf-8") as f:
         with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
             parser.read_csv(m)
         assert not m.closed
@@ -615,7 +615,7 @@ def test_file_binary_mode(c_parser_only):
     expected = DataFrame([[1, 2, 3], [4, 5, 6]])
 
     with tm.ensure_clean() as path:
-        with open(path, "w") as f:
+        with open(path, "w", encoding="utf-8") as f:
             f.write("1,2,3\n4,5,6")
 
         with open(path, "rb") as f:
@@ -627,7 +627,7 @@ def test_unix_style_breaks(c_parser_only):
     # GH 11020
     parser = c_parser_only
     with tm.ensure_clean() as path:
-        with open(path, "w", newline="\n") as f:
+        with open(path, "w", newline="\n", encoding="utf-8") as f:
             f.write("blah\n\ncol_1,col_2,col_3\n\n")
         result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
     expected = DataFrame(columns=["col_1", "col_2", "col_3"])
diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py
index ab00e31bd9b43..bcba9c4a1823d 100644
--- a/pandas/tests/io/parser/test_compression.py
+++ b/pandas/tests/io/parser/test_compression.py
@@ -129,7 +129,7 @@ def test_infer_compression(all_parsers, csv1, buffer, ext):
     kwargs["compression"] = "infer"
 
     if buffer:
-        with open(csv1) as f:
+        with open(csv1, encoding="utf-8") as f:
             result = parser.read_csv(f, **kwargs)
     else:
         ext = "." + ext if ext else ""
+ ext if ext else "" @@ -183,7 +183,9 @@ def test_ignore_compression_extension(all_parsers): with tm.ensure_clean("test.csv.zip") as path_zip: # make sure to create un-compressed file with zip extension df.to_csv(path_csv, index=False) - Path(path_zip).write_text(Path(path_csv).read_text()) + Path(path_zip).write_text( + Path(path_csv).read_text(encoding="utf-8"), encoding="utf-8" + ) tm.assert_frame_equal(parser.read_csv(path_zip, compression=None), df) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index b248c0c460c74..435b9bdade944 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -411,7 +411,7 @@ def test_constructor_bad_file(self, mmap_file): with pytest.raises(err, match=msg): icom._maybe_memory_map(non_file, True) - with open(mmap_file) as target: + with open(mmap_file, encoding="utf-8") as target: pass msg = "I/O operation on closed file" @@ -419,7 +419,7 @@ def test_constructor_bad_file(self, mmap_file): icom._maybe_memory_map(target, True) def test_next(self, mmap_file): - with open(mmap_file) as target: + with open(mmap_file, encoding="utf-8") as target: lines = target.readlines() with icom.get_handle( diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index fc15ff3488ce9..eadf35aedd708 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -299,10 +299,10 @@ def test_ambiguous_archive_zip(): def test_ambiguous_archive_tar(tmp_path): csvAPath = tmp_path / "a.csv" - with open(csvAPath, "w") as a: + with open(csvAPath, "w", encoding="utf-8") as a: a.write("foo,bar\n") csvBPath = tmp_path / "b.csv" - with open(csvBPath, "w") as b: + with open(csvBPath, "w", encoding="utf-8") as b: b.write("foo,bar\n") tarpath = tmp_path / "archive.tar" diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index b65a19d766976..18cc0f0b11dc9 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -193,7 +193,7 @@ class MockGCSFileSystem(AbstractFileSystem): def open(self, path, mode="r", *args): if "w" not in mode: raise FileNotFoundError - return open(os.path.join(tmpdir, "test.parquet"), mode) + return open(os.path.join(tmpdir, "test.parquet"), mode, encoding="utf-8") monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) df1.to_parquet( diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index bb62d1a194a3e..5d9e4efd9ecf3 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -692,7 +692,7 @@ def try_remove_ws(x): @pytest.mark.slow def test_gold_canyon(self, banklist_data): gc = "Gold Canyon" - with open(banklist_data) as f: + with open(banklist_data, encoding="utf-8") as f: raw_text = f.read() assert gc in raw_text diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 4843f40d6813d..1f1f44f408fc1 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -983,7 +983,7 @@ def test_unknown_parser(): def test_stylesheet_file_like(datapath, mode): xsl = datapath("io", "data", "xml", "row_field_output.xsl") - with open(xsl, mode) as f: + with open(xsl, mode, encoding="utf-8" if mode == "r" else None) as f: assert geom_df.to_xml(stylesheet=f) == xsl_expected @@ -995,7 +995,7 @@ def test_stylesheet_io(datapath, mode): # consider using --check-untyped-defs xsl_obj: BytesIO | StringIO # type: ignore[annotation-unchecked] - with open(xsl_path, mode) as f: + with open(xsl_path, mode, encoding="utf-8" if mode == 
"r" else None) as f: if mode == "rb": xsl_obj = BytesIO(f.read()) else: @@ -1010,7 +1010,7 @@ def test_stylesheet_io(datapath, mode): def test_stylesheet_buffered_reader(datapath, mode): xsl = datapath("io", "data", "xml", "row_field_output.xsl") - with open(xsl, mode) as f: + with open(xsl, mode, encoding="utf-8" if mode == "r" else None) as f: xsl_obj = f.read() output = geom_df.to_xml(stylesheet=xsl_obj) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index c6a6b9eeadf5f..04abebe4a0a71 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -259,7 +259,7 @@ def parser(request): def read_xml_iterparse(data, **kwargs): with tm.ensure_clean() as path: - with open(path, "w") as f: + with open(path, "w", encoding="utf-8") as f: f.write(data) return read_xml(path, **kwargs) @@ -267,7 +267,7 @@ def read_xml_iterparse(data, **kwargs): def read_xml_iterparse_comp(comp_path, compression_only, **kwargs): with get_handle(comp_path, "r", compression=compression_only) as handles: with tm.ensure_clean() as path: - with open(path, "w") as f: + with open(path, "w", encoding="utf-8") as f: f.write(handles.handle.read()) return read_xml(path, **kwargs) @@ -351,7 +351,7 @@ def test_parser_consistency_url(parser): def test_file_like(datapath, parser, mode): filename = datapath("io", "data", "xml", "books.xml") - with open(filename, mode) as f: + with open(filename, mode, encoding="utf-8" if mode == "r" else None) as f: df_file = read_xml(f, parser=parser) df_expected = DataFrame( @@ -369,7 +369,7 @@ def test_file_like(datapath, parser, mode): def test_file_io(datapath, parser, mode): filename = datapath("io", "data", "xml", "books.xml") - with open(filename, mode) as f: + with open(filename, mode, encoding="utf-8" if mode == "r" else None) as f: xml_obj = f.read() df_io = read_xml( @@ -392,7 +392,7 @@ def test_file_io(datapath, parser, mode): def test_file_buffered_reader_string(datapath, parser, mode): filename = datapath("io", "data", "xml", "books.xml") - with open(filename, mode) as f: + with open(filename, mode, encoding="utf-8" if mode == "r" else None) as f: xml_obj = f.read() df_str = read_xml(xml_obj, parser=parser) @@ -412,7 +412,7 @@ def test_file_buffered_reader_string(datapath, parser, mode): def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode): filename = datapath("io", "data", "xml", "books.xml") - with open(filename, mode) as f: + with open(filename, mode, encoding="utf-8" if mode == "r" else None) as f: next(f) xml_obj = f.read() @@ -1163,7 +1163,7 @@ def test_stylesheet_file_like(datapath, mode): kml = datapath("io", "data", "xml", "cta_rail_lines.kml") xsl = datapath("io", "data", "xml", "flatten_doc.xsl") - with open(xsl, mode) as f: + with open(xsl, mode, encoding="utf-8" if mode == "r" else None) as f: df_style = read_xml( kml, xpath=".//k:Placemark", @@ -1183,7 +1183,7 @@ def test_stylesheet_io(datapath, mode): # consider using --check-untyped-defs xsl_obj: BytesIO | StringIO # type: ignore[annotation-unchecked] - with open(xsl, mode) as f: + with open(xsl, mode, encoding="utf-8" if mode == "r" else None) as f: if mode == "rb": xsl_obj = BytesIO(f.read()) else: @@ -1204,7 +1204,7 @@ def test_stylesheet_buffered_reader(datapath, mode): kml = datapath("io", "data", "xml", "cta_rail_lines.kml") xsl = datapath("io", "data", "xml", "flatten_doc.xsl") - with open(xsl, mode) as f: + with open(xsl, mode, encoding="utf-8" if mode == "r" else None) as f: xsl_obj = f.read() df_style = read_xml( @@ -1364,7 +1364,7 @@ 
     # consider using --check-untyped-defs
     xsl_obj: BytesIO | StringIO  # type: ignore[annotation-unchecked]
 
-    with open(xsl, mode) as f:
+    with open(xsl, mode, encoding="utf-8" if mode == "r" else None) as f:
         if mode == "rb":
             xsl_obj = BytesIO(f.read())
         else:
@@ -1416,7 +1416,7 @@ def test_string_error(parser):
 def test_file_like_iterparse(datapath, parser, mode):
     filename = datapath("io", "data", "xml", "books.xml")
 
-    with open(filename, mode) as f:
+    with open(filename, mode, encoding="utf-8" if mode == "r" else None) as f:
         if mode == "r" and parser == "lxml":
             with pytest.raises(
                 TypeError, match=("reading file objects must return bytes objects")
@@ -1453,7 +1457,11 @@ def test_file_io_iterparse(datapath, parser, mode):
     filename = datapath("io", "data", "xml", "books.xml")
     funcIO = StringIO if mode == "r" else BytesIO
 
-    with open(filename, mode) as f:
+    with open(
+        filename,
+        mode,
+        encoding="utf-8" if mode == "r" else None,
+    ) as f:
         with funcIO(f.read()) as b:
             if mode == "r" and parser == "lxml":
                 with pytest.raises(
@@ -1559,7 +1563,7 @@ def test_bad_xml(parser):
     """
 
     with tm.ensure_clean(filename="bad.xml") as path:
-        with open(path, "w") as f:
+        with open(path, "w", encoding="utf-8") as f:
             f.write(bad_xml)
 
        with pytest.raises(
diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py
index 17d1e7e00653b..d62b9fa27e264 100644
--- a/pandas/tests/io/xml/test_xml_dtypes.py
+++ b/pandas/tests/io/xml/test_xml_dtypes.py
@@ -29,7 +29,7 @@ def iterparse(request):
 def read_xml_iterparse(data, **kwargs):
     with tm.ensure_clean() as path:
-        with open(path, "w") as f:
+        with open(path, "w", encoding="utf-8") as f:
             f.write(data)
         return read_xml(path, **kwargs)
diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py
index 990c3698a5036..070ab872a4e5b 100644
--- a/pandas/tests/series/methods/test_to_csv.py
+++ b/pandas/tests/series/methods/test_to_csv.py
@@ -52,7 +52,7 @@ def test_from_csv(self, datetime_series, string_series):
             series_h = self.read_csv(path, header=0)
             assert series_h.name == "series"
 
-            with open(path, "w") as outfile:
+            with open(path, "w", encoding="utf-8") as outfile:
                 outfile.write("1998-01-01|1.0\n1999-01-01|2.0")
 
             series = self.read_csv(path, sep="|", parse_dates=True)
@@ -69,7 +69,7 @@ def test_to_csv(self, datetime_series):
         with tm.ensure_clean() as path:
             datetime_series.to_csv(path, header=False)
 
-            with open(path, newline=None) as f:
+            with open(path, newline=None, encoding="utf-8") as f:
                 lines = f.readlines()
             assert lines[1] != "\n"
diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py
index 714588d179aef..72c9db23b2108 100644
--- a/pandas/tests/util/test_show_versions.py
+++ b/pandas/tests/util/test_show_versions.py
@@ -16,7 +16,7 @@ def test_show_versions(tmpdir):
 
     pd.show_versions(as_json=as_json)
 
-    with open(as_json) as fd:
+    with open(as_json, encoding="utf-8") as fd:
         # check if file output is valid JSON, will raise an exception if not
         result = json.load(fd)
 
@@ -75,7 +75,7 @@ def test_json_output_match(capsys, tmpdir):
     out_path = os.path.join(tmpdir, "test_json.json")
     pd.show_versions(as_json=out_path)
 
-    with open(out_path) as out_fd:
+    with open(out_path, encoding="utf-8") as out_fd:
         result_file = out_fd.read()
 
     assert result_console == result_file
diff --git a/scripts/generate_pxi.py b/scripts/generate_pxi.py
index 3462b97aefcbf..47648a3937b4c 100644
--- a/scripts/generate_pxi.py
+++ b/scripts/generate_pxi.py
@@ -5,11 +5,11 @@
 
 def process_tempita(pxifile, outfile):
-    with open(pxifile) as f:
+    with open(pxifile, encoding="utf-8") as f:
         tmpl = f.read()
 
     pyxcontent = Tempita.sub(tmpl)
 
-    with open(outfile, "w") as f:
+    with open(outfile, "w", encoding="utf-8") as f:
         f.write(pyxcontent)
diff --git a/scripts/generate_version.py b/scripts/generate_version.py
index fbc78ab12429a..8a93e4c1df55e 100644
--- a/scripts/generate_version.py
+++ b/scripts/generate_version.py
@@ -8,7 +8,7 @@ def write_version_info(path):
     if os.environ.get("MESON_DIST_ROOT"):
         # raise ValueError("dist root is", os.environ.get("MESON_DIST_ROOT"))
         path = os.path.join(os.environ.get("MESON_DIST_ROOT"), path)
-    with open(path, "w") as file:
+    with open(path, "w", encoding="utf-8") as file:
         file.write(f'__version__="{versioneer.get_version()}"\n')
         file.write(
             f'__git_version__="{versioneer.get_versions()["full-revisionid"]}"\n'
diff --git a/scripts/pandas_errors_documented.py b/scripts/pandas_errors_documented.py
index 52c1e2008b8a0..116a63b33eaf0 100644
--- a/scripts/pandas_errors_documented.py
+++ b/scripts/pandas_errors_documented.py
@@ -34,7 +34,7 @@ def main(argv: Sequence[str] | None = None) -> None:
     args = parser.parse_args(argv)
     with open(args.path, encoding="utf-8") as f:
         file_errors = get_defined_errors(f.read())
-    with open(API_PATH) as f:
+    with open(API_PATH, encoding="utf-8") as f:
         doc_errors = {
             line.split(".")[1].strip() for line in f.readlines() if "errors" in line
         }
diff --git a/scripts/sort_whatsnew_note.py b/scripts/sort_whatsnew_note.py
index ae1d3346a5827..531ea57244b23 100644
--- a/scripts/sort_whatsnew_note.py
+++ b/scripts/sort_whatsnew_note.py
@@ -63,12 +63,12 @@ def main(argv: Sequence[str] | None = None) -> int:
     args = parser.parse_args(argv)
     ret = 0
     for path in args.paths:
-        with open(path) as fd:
+        with open(path, encoding="utf-8") as fd:
             content = fd.read()
         new_content = sort_whatsnew_note(content)
         if content != new_content:
             ret |= 1
-            with open(path, "w") as fd:
+            with open(path, "w", encoding="utf-8") as fd:
                 fd.write(new_content)
     return ret
diff --git a/scripts/tests/test_validate_min_versions_in_sync.py b/scripts/tests/test_validate_min_versions_in_sync.py
index 13e8965bb7591..ac33f8dcbffaf 100644
--- a/scripts/tests/test_validate_min_versions_in_sync.py
+++ b/scripts/tests/test_validate_min_versions_in_sync.py
@@ -49,13 +49,13 @@ def test_pin_min_versions_to_yaml_file(src_toml, src_yaml, expected_yaml):
     with open(src_toml, "rb") as toml_f:
         toml_map = tomllib.load(toml_f)
-    with open(src_yaml) as yaml_f:
+    with open(src_yaml, encoding="utf-8") as yaml_f:
         yaml_file_data = yaml_f.read()
     yaml_file = yaml.safe_load(yaml_file_data)
     yaml_dependencies = yaml_file["dependencies"]
     yaml_map = get_yaml_map_from(yaml_dependencies)
     toml_map = get_toml_map_from(toml_map)
     result_yaml_file = pin_min_versions_to_yaml_file(yaml_map, toml_map, yaml_file_data)
-    with open(expected_yaml) as yaml_f:
+    with open(expected_yaml, encoding="utf-8") as yaml_f:
         dummy_yaml_expected_file_1 = yaml_f.read()
     assert result_yaml_file == dummy_yaml_expected_file_1
diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
index 4c133483f571f..c9eb476ab65fa 100755
--- a/scripts/validate_docstrings.py
+++ b/scripts/validate_docstrings.py
@@ -367,7 +367,7 @@ def get_all_api_items():
     base_path = pathlib.Path(__file__).parent.parent
     api_doc_fnames = pathlib.Path(base_path, "doc", "source", "reference")
     for api_doc_fname in api_doc_fnames.glob("*.rst"):
-        with open(api_doc_fname) as f:
+        with open(api_doc_fname, encoding="utf-8") as f:
encoding="utf-8") as f: yield from get_api_items(f) diff --git a/scripts/validate_exception_location.py b/scripts/validate_exception_location.py index 7af5e749b4b96..5f77e4c78db82 100644 --- a/scripts/validate_exception_location.py +++ b/scripts/validate_exception_location.py @@ -36,7 +36,7 @@ def get_warnings_and_exceptions_from_api_path() -> set[str]: - with open(API_PATH) as f: + with open(API_PATH, encoding="utf-8") as f: doc_errors = { line.split(".")[1].strip() for line in f.readlines() if "errors" in line } diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index 9a6d97a222000..cb03276d2dd93 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -63,7 +63,7 @@ def pin_min_versions_to_ci_deps() -> int: toml_dependencies = tomllib.load(toml_f) ret = 0 for curr_file in all_yaml_files: - with open(curr_file) as yaml_f: + with open(curr_file, encoding="utf-8") as yaml_f: yaml_start_data = yaml_f.read() yaml_file = yaml.safe_load(yaml_start_data) yaml_dependencies = yaml_file["dependencies"] @@ -73,7 +73,7 @@ def pin_min_versions_to_ci_deps() -> int: yaml_map, toml_map, yaml_start_data ) if yaml_result_data != yaml_start_data: - with open(curr_file, "w") as f: + with open(curr_file, "w", encoding="utf-8") as f: f.write(yaml_result_data) ret |= 1 return ret diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 4446ed62f6b8a..0f4c11eb30b07 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -226,7 +226,7 @@ def find_titles(rst_file: str) -> Iterable[tuple[str, int]]: The corresponding line number of the heading. """ - with open(rst_file) as fd: + with open(rst_file, encoding="utf-8") as fd: previous_line = "" for i, line in enumerate(fd): line_no_last_elem = line[:-1] diff --git a/setup.py b/setup.py index 49f6557e2e250..52739a97bec2a 100755 --- a/setup.py +++ b/setup.py @@ -88,11 +88,11 @@ def render_templates(cls, pxifiles): # if .pxi.in is not updated, no need to output .pxi continue - with open(pxifile) as f: + with open(pxifile, encoding="utf-8") as f: tmpl = f.read() pyxcontent = Tempita.sub(tmpl) - with open(outfile, "w") as f: + with open(outfile, "w", encoding="utf-8") as f: f.write(pyxcontent) def build_extensions(self): diff --git a/web/pandas_web.py b/web/pandas_web.py index 5e902f1b1919b..9191cde31c20f 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -110,7 +110,7 @@ def blog_add_posts(context): md = markdown.Markdown( extensions=context["main"]["markdown_extensions"] ) - with open(os.path.join(posts_path, fname)) as f: + with open(os.path.join(posts_path, fname), encoding="utf-8") as f: html = md.convert(f.read()) title = md.Meta["title"][0] summary = re.sub(tag_expr, "", html) @@ -197,7 +197,11 @@ def maintainers_add_info(context): # save the data fetched from github to use it in case we exceed # git github api quota in the future - with open(pathlib.Path(context["target_path"]) / "maintainers.json", "w") as f: + with open( + pathlib.Path(context["target_path"]) / "maintainers.json", + "w", + encoding="utf-8", + ) as f: json.dump(maintainers_info, f) return context @@ -220,7 +224,11 @@ def home_add_releases(context): resp.raise_for_status() releases = resp.json() - with open(pathlib.Path(context["target_path"]) / "releases.json", "w") as f: + with open( + pathlib.Path(context["target_path"]) / "releases.json", + "w", + encoding="utf-8", + ) as f: 
             json.dump(releases, f, default=datetime.datetime.isoformat)
 
     for release in releases:
@@ -304,7 +312,9 @@ def roadmap_pdeps(context):
         resp.raise_for_status()
         pdeps = resp.json()
 
-        with open(pathlib.Path(context["target_path"]) / "pdeps.json", "w") as f:
+        with open(
+            pathlib.Path(context["target_path"]) / "pdeps.json", "w", encoding="utf-8"
+        ) as f:
             json.dump(pdeps, f)
 
         for pdep in sorted(pdeps["items"], key=operator.itemgetter("title")):
@@ -346,7 +356,7 @@ def get_context(config_fname: str, **kwargs):
     Load the config yaml as the base context, and enrich it with the
     information added by the context preprocessors defined in the file.
     """
-    with open(config_fname) as f:
+    with open(config_fname, encoding="utf-8") as f:
         context = yaml.safe_load(f)
 
     context["source_path"] = os.path.dirname(config_fname)
@@ -418,7 +428,7 @@ def main(
             extension = os.path.splitext(fname)[-1]
             if extension in (".html", ".md"):
-                with open(os.path.join(source_path, fname)) as f:
+                with open(os.path.join(source_path, fname), encoding="utf-8") as f:
                     content = f.read()
                 if extension == ".md":
                     body = markdown.markdown(
@@ -431,7 +441,9 @@ def main(
                 context["base_url"] = "".join(["../"] * os.path.normpath(fname).count("/"))
                 content = jinja_env.from_string(content).render(**context)
                 fname_html = os.path.splitext(fname)[0] + ".html"
-                with open(os.path.join(target_path, fname_html), "w") as f:
+                with open(
+                    os.path.join(target_path, fname_html), "w", encoding="utf-8"
+                ) as f:
                     f.write(content)
             else:
                 shutil.copy(
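
A note on the recurring pattern in this patch: for tests parametrized over text and binary modes, the encoding is passed conditionally (`encoding="utf-8" if mode == "r" else None`), because `open()` raises `ValueError` when an `encoding` argument is combined with a binary mode. A minimal sketch of that pattern, using a hypothetical helper name rather than anything defined in the diff:

```python
# Sketch only: open_any_mode is a hypothetical helper, not a pandas API.
# open() rejects an encoding argument for binary modes, so pass None there.
def open_any_mode(path: str, mode: str = "r"):
    return open(path, mode, encoding="utf-8" if "b" not in mode else None)


with open_any_mode("books.xml", "rb") as f:  # binary mode: encoding stays None
    raw = f.read()
```

The hook added in `.pre-commit-config.yaml` is declared with `stages: [manual]`, so it does not fire on commit; it can be run on demand by its alias, e.g. `pre-commit run unspecified-encoding --hook-stage manual --all-files`.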