From f909e0d5af8066da804b02a764e74019eae537da Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 8 Feb 2024 10:43:13 -0800 Subject: [PATCH 01/10] Ban literal json in read_json --- pandas/io/json/_json.py | 87 ++++++++--------------------- pandas/tests/io/json/test_pandas.py | 46 +++++---------- 2 files changed, 35 insertions(+), 98 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index cea34cdfb0b9d..a75e4e6aa9600 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -5,7 +5,6 @@ abstractmethod, ) from collections import abc -from io import StringIO from itertools import islice from typing import ( TYPE_CHECKING, @@ -30,7 +29,6 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError from pandas.util._decorators import doc -from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import ( @@ -58,9 +56,7 @@ extension_to_compression, file_exists, get_handle, - is_fsspec_url, is_potential_multi_index, - is_url, stringify_path, ) from pandas.io.json._normalize import convert_to_line_delimits @@ -530,7 +526,7 @@ def read_json( Parameters ---------- - path_or_buf : a valid JSON str, path object or file-like object + path_or_buf : a string path, path object or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: @@ -879,18 +875,6 @@ def __init__( self.nrows = validate_integer("nrows", self.nrows, 0) if not self.lines: raise ValueError("nrows can only be passed if lines=True") - if ( - isinstance(filepath_or_buffer, str) - and not self.lines - and "\n" in filepath_or_buffer - ): - warnings.warn( - "Passing literal json to 'read_json' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object.", - FutureWarning, - stacklevel=find_stack_level(), - ) if self.engine == "pyarrow": if not self.lines: raise ValueError( @@ -900,45 +884,22 @@ def __init__( self.data = filepath_or_buffer elif self.engine == "ujson": data = self._get_data_from_filepath(filepath_or_buffer) - self.data = self._preprocess_data(data) - - def _preprocess_data(self, data): - """ - At this point, the data either has a `read` attribute (e.g. a file - object or a StringIO) or is a string that is a JSON document. - - If self.chunksize, we prepare the data for the `__next__` method. - Otherwise, we read it into memory for the `read` method. - """ - if hasattr(data, "read") and not (self.chunksize or self.nrows): - with self: - data = data.read() - if not hasattr(data, "read") and (self.chunksize or self.nrows): - data = StringIO(data) - - return data + # If self.chunksize, we prepare the data for the `__next__` method. + # Otherwise, we read it into memory for the `read` method. + if not (self.chunksize or self.nrows): + with self: + self.data = data.read() + else: + self.data = data def _get_data_from_filepath(self, filepath_or_buffer): """ The function read_json accepts three input types: 1. filepath (string-like) 2. file-like object (e.g. open file object, StringIO) - 3. JSON string - - This method turns (1) into (2) to simplify the rest of the processing. - It returns input types (2) and (3) unchanged. - - It raises FileNotFoundError if the input is a string ending in - one of .json, .json.gz, .json.bz2, etc. but no such file exists. """ - # if it is a string but the file does not exist, it might be a JSON string filepath_or_buffer = stringify_path(filepath_or_buffer) - if ( - not isinstance(filepath_or_buffer, str) - or is_url(filepath_or_buffer) - or is_fsspec_url(filepath_or_buffer) - or file_exists(filepath_or_buffer) - ): + try: self.handles = get_handle( filepath_or_buffer, "r", @@ -947,23 +908,19 @@ def _get_data_from_filepath(self, filepath_or_buffer): storage_options=self.storage_options, errors=self.encoding_errors, ) - filepath_or_buffer = self.handles.handle - elif ( - isinstance(filepath_or_buffer, str) - and filepath_or_buffer.lower().endswith( - (".json",) + tuple(f".json{c}" for c in extension_to_compression) - ) - and not file_exists(filepath_or_buffer) - ): - raise FileNotFoundError(f"File {filepath_or_buffer} does not exist") - else: - warnings.warn( - "Passing literal json to 'read_json' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object.", - FutureWarning, - stacklevel=find_stack_level(), - ) + except OSError as err: + if ( + isinstance(filepath_or_buffer, str) + and filepath_or_buffer.lower().endswith( + (".json",) + tuple(f".json{c}" for c in extension_to_compression) + ) + and not file_exists(filepath_or_buffer) + ): + raise FileNotFoundError( + f"File {filepath_or_buffer} does not exist" + ) from err + raise + filepath_or_buffer = self.handles.handle return filepath_or_buffer def _combine_lines(self, lines) -> str: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9a263e8bc5f44..b5e197303d12c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -36,49 +36,29 @@ from pandas.io.json import ujson_dumps -def test_literal_json_deprecation(): +def test_literal_json_raises(): # PR 53409 - expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) - jsonl = """{"a": 1, "b": 2} {"a": 3, "b": 4} {"a": 5, "b": 6} {"a": 7, "b": 8}""" - msg = ( - "Passing literal json to 'read_json' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object." - ) + msg = r"\[Errno 2\] No such file or directory" - with tm.assert_produces_warning(FutureWarning, match=msg): - try: - read_json(jsonl, lines=False) - except ValueError: - pass + with pytest.raises(FileNotFoundError, match=msg): + read_json(jsonl, lines=False) - with tm.assert_produces_warning(FutureWarning, match=msg): - read_json(expected.to_json(), lines=False) + with pytest.raises(FileNotFoundError, match=msg): + read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) - tm.assert_frame_equal(result, expected) + with pytest.raises(FileNotFoundError, match=msg): + read_json( + '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n', + lines=False, + ) - with tm.assert_produces_warning(FutureWarning, match=msg): - try: - result = read_json( - '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n', - lines=False, - ) - except ValueError: - pass - - with tm.assert_produces_warning(FutureWarning, match=msg): - try: - result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=False) - except ValueError: - pass - tm.assert_frame_equal(result, expected) + with pytest.raises(FileNotFoundError, match=msg): + read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=False) def assert_json_roundtrip_equal(result, expected, orient): From ddcf0344b5d83f76c98ae26372ebf8c2ef141471 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 8 Feb 2024 13:34:04 -0800 Subject: [PATCH 02/10] address xml --- pandas/io/formats/xml.py | 7 +-- pandas/io/xml.py | 75 ++++++++---------------------- pandas/tests/io/xml/test_to_xml.py | 37 +++++++-------- pandas/tests/io/xml/test_xml.py | 72 ++++++++++------------------ 4 files changed, 62 insertions(+), 129 deletions(-) diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 775f1842692cb..e55561902d4d3 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -24,10 +24,7 @@ from pandas.core.shared_docs import _shared_docs from pandas.io.common import get_handle -from pandas.io.xml import ( - get_data_from_filepath, - preprocess_data, -) +from pandas.io.xml import get_data_from_filepath if TYPE_CHECKING: from pandas._typing import ( @@ -548,7 +545,7 @@ def _transform_doc(self) -> bytes: storage_options=self.storage_options, ) - with preprocess_data(handle_data) as xml_data: + with handle_data as xml_data: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 97bf520a77611..24ab111215218 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -11,7 +11,6 @@ Any, Callable, ) -import warnings from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -20,7 +19,6 @@ ParserError, ) from pandas.util._decorators import doc -from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -28,10 +26,8 @@ from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( - file_exists, get_handle, infer_compression, - is_file_like, is_fsspec_url, is_url, stringify_path, @@ -528,7 +524,7 @@ def _parse_doc( storage_options=self.storage_options, ) - with preprocess_data(handle_data) as xml_data: + with handle_data as xml_data: curr_parser = XMLParser(encoding=self.encoding) document = parse(xml_data, parser=curr_parser) @@ -635,7 +631,7 @@ def _parse_doc( storage_options=self.storage_options, ) - with preprocess_data(handle_data) as xml_data: + with handle_data as xml_data: curr_parser = XMLParser(encoding=self.encoding) if isinstance(xml_data, io.StringIO): @@ -677,40 +673,23 @@ def get_data_from_filepath( """ Extract raw XML data. - The method accepts three input types: + The method accepts two input types: 1. filepath (string-like) 2. file-like object (e.g. open file object, StringIO) - 3. XML string or bytes - - This method turns (1) into (2) to simplify the rest of the processing. - It returns input types (2) and (3) unchanged. """ - if not isinstance(filepath_or_buffer, bytes): - filepath_or_buffer = stringify_path(filepath_or_buffer) - - if ( - isinstance(filepath_or_buffer, str) - and not filepath_or_buffer.startswith((" io.StringIO | io.BytesIO: @@ -790,22 +769,6 @@ def _parse( p: _EtreeFrameParser | _LxmlFrameParser - if isinstance(path_or_buffer, str) and not any( - [ - is_file_like(path_or_buffer), - file_exists(path_or_buffer), - is_url(path_or_buffer), - is_fsspec_url(path_or_buffer), - ] - ): - warnings.warn( - "Passing literal xml to 'read_xml' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if parser == "lxml": lxml = import_optional_dependency("lxml.etree", errors="ignore") @@ -894,8 +857,8 @@ def read_xml( ---------- path_or_buffer : str, path object, or file-like object String, path object (implementing ``os.PathLike[str]``), or file-like - object implementing a ``read()`` function. The string can be any valid XML - string or a path. The string can further be a URL. Valid URL schemes + object implementing a ``read()`` function. The string can be a path. + The string can further be a URL. Valid URL schemes include http, ftp, s3, and file. .. deprecated:: 2.1.0 @@ -969,7 +932,7 @@ def read_xml( and ability to use XSLT stylesheet are supported. stylesheet : str, path object or file-like object - A URL, file-like object, or a raw string containing an XSLT script. + A URL, file-like object, or a string path containing an XSLT script. This stylesheet should flatten complex, deeply nested XML documents for easier parsing. To use this feature you must have ``lxml`` module installed and specify 'lxml' as ``parser``. The ``xpath`` must diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index a123f6dd52c08..62cc33376c630 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -1034,26 +1034,23 @@ def test_stylesheet_buffered_reader(xsl_row_field_output, mode, geom_df): with open( xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None ) as f: - xsl_obj = f.read() - - output = geom_df.to_xml(stylesheet=xsl_obj) + output = geom_df.to_xml(stylesheet=f) assert output == xsl_expected def test_stylesheet_wrong_path(geom_df): - lxml_etree = pytest.importorskip("lxml.etree") + pytest.importorskip("lxml.etree") - xsl = os.path.join("data", "xml", "row_field_output.xslt") + xsl = os.path.join("does", "not", "exist", "row_field_output.xslt") with pytest.raises( - lxml_etree.XMLSyntaxError, - match=("Start tag expected, '<' not found"), + FileNotFoundError, match=r"\[Errno 2\] No such file or director" ): geom_df.to_xml(stylesheet=xsl) -@pytest.mark.parametrize("val", ["", b""]) +@pytest.mark.parametrize("val", [StringIO(""), BytesIO(b"")]) def test_empty_string_stylesheet(val, geom_df): lxml_etree = pytest.importorskip("lxml.etree") @@ -1095,9 +1092,9 @@ def test_incorrect_xsl_syntax(geom_df): """ with pytest.raises( - lxml_etree.XMLSyntaxError, match=("Opening and ending tag mismatch") + lxml_etree.XMLSyntaxError, match="Opening and ending tag mismatch" ): - geom_df.to_xml(stylesheet=xsl) + geom_df.to_xml(stylesheet=StringIO(xsl)) def test_incorrect_xsl_eval(geom_df): @@ -1124,8 +1121,8 @@ def test_incorrect_xsl_eval(geom_df): """ - with pytest.raises(lxml_etree.XSLTParseError, match=("failed to compile")): - geom_df.to_xml(stylesheet=xsl) + with pytest.raises(lxml_etree.XSLTParseError, match="failed to compile"): + geom_df.to_xml(stylesheet=StringIO(xsl)) def test_incorrect_xsl_apply(geom_df): @@ -1143,9 +1140,9 @@ def test_incorrect_xsl_apply(geom_df): """ - with pytest.raises(lxml_etree.XSLTApplyError, match=("Cannot resolve URI")): + with pytest.raises(lxml_etree.XSLTApplyError, match="Cannot resolve URI"): with tm.ensure_clean("test.xml") as path: - geom_df.to_xml(path, stylesheet=xsl) + geom_df.to_xml(path, stylesheet=StringIO(xsl)) def test_stylesheet_with_etree(geom_df): @@ -1160,10 +1157,8 @@ def test_stylesheet_with_etree(geom_df): """ - with pytest.raises( - ValueError, match=("To use stylesheet, you need lxml installed") - ): - geom_df.to_xml(parser="etree", stylesheet=xsl) + with pytest.raises(ValueError, match="To use stylesheet, you need lxml installed"): + geom_df.to_xml(parser="etree", stylesheet=StringIO(xsl)) def test_style_to_csv(geom_df): @@ -1190,7 +1185,7 @@ def test_style_to_csv(geom_df): if out_csv is not None: out_csv = out_csv.strip() - out_xml = geom_df.to_xml(stylesheet=xsl) + out_xml = geom_df.to_xml(stylesheet=StringIO(xsl)) assert out_csv == out_xml @@ -1224,7 +1219,7 @@ def test_style_to_string(geom_df): """ out_str = geom_df.to_string() - out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=xsl) + out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=StringIO(xsl)) assert out_xml == out_str @@ -1269,7 +1264,7 @@ def test_style_to_json(geom_df): """ out_json = geom_df.to_json() - out_xml = geom_df.to_xml(stylesheet=xsl) + out_xml = geom_df.to_xml(stylesheet=StringIO(xsl)) assert out_json == out_xml diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 0ee3ec85ab6c6..2f5eca5e1e353 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -247,16 +247,12 @@ ) -def test_literal_xml_deprecation(): +def test_literal_xml_raises(): # GH 53809 pytest.importorskip("lxml") - msg = ( - "Passing literal xml to 'read_xml' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object." - ) + msg = r"\[Errno 2\] No such file or directory" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(FileNotFoundError, match=msg): read_xml(xml_default_nmsp) @@ -490,16 +486,10 @@ def test_empty_string_etree(val): def test_wrong_file_path(parser): - msg = ( - "Passing literal xml to 'read_xml' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object." - ) - filename = os.path.join("data", "html", "books.xml") + filename = os.path.join("does", "not", "exist", "books.xml") with pytest.raises( - FutureWarning, - match=msg, + FileNotFoundError, match=r"\[Errno 2\] No such file or directory" ): read_xml(filename, parser=parser) @@ -1197,14 +1187,12 @@ def test_stylesheet_io(kml_cta_rail_lines, xsl_flatten_doc, mode): def test_stylesheet_buffered_reader(kml_cta_rail_lines, xsl_flatten_doc, mode): pytest.importorskip("lxml") with open(xsl_flatten_doc, mode, encoding="utf-8" if mode == "r" else None) as f: - xsl_obj = f.read() - - df_style = read_xml( - kml_cta_rail_lines, - xpath=".//k:Placemark", - namespaces={"k": "http://www.opengis.net/kml/2.2"}, - stylesheet=xsl_obj, - ) + df_style = read_xml( + kml_cta_rail_lines, + xpath=".//k:Placemark", + namespaces={"k": "http://www.opengis.net/kml/2.2"}, + stylesheet=f, + ) tm.assert_frame_equal(df_kml, df_style) @@ -1233,7 +1221,7 @@ def test_style_charset(): """ df_orig = read_xml(StringIO(xml)) - df_style = read_xml(StringIO(xml), stylesheet=xsl) + df_style = read_xml(StringIO(xml), stylesheet=StringIO(xsl)) tm.assert_frame_equal(df_orig, df_style) @@ -1271,9 +1259,9 @@ def test_incorrect_xsl_syntax(kml_cta_rail_lines): """ with pytest.raises( - lxml_etree.XMLSyntaxError, match=("Extra content at the end of the document") + lxml_etree.XMLSyntaxError, match="Extra content at the end of the document" ): - read_xml(kml_cta_rail_lines, stylesheet=xsl) + read_xml(kml_cta_rail_lines, stylesheet=StringIO(xsl)) def test_incorrect_xsl_eval(kml_cta_rail_lines): @@ -1299,8 +1287,8 @@ def test_incorrect_xsl_eval(kml_cta_rail_lines): """ - with pytest.raises(lxml_etree.XSLTParseError, match=("failed to compile")): - read_xml(kml_cta_rail_lines, stylesheet=xsl) + with pytest.raises(lxml_etree.XSLTParseError, match="failed to compile"): + read_xml(kml_cta_rail_lines, stylesheet=StringIO(xsl)) def test_incorrect_xsl_apply(kml_cta_rail_lines): @@ -1318,18 +1306,17 @@ def test_incorrect_xsl_apply(kml_cta_rail_lines): """ - with pytest.raises(lxml_etree.XSLTApplyError, match=("Cannot resolve URI")): - read_xml(kml_cta_rail_lines, stylesheet=xsl) + with pytest.raises(lxml_etree.XSLTApplyError, match="Cannot resolve URI"): + read_xml(kml_cta_rail_lines, stylesheet=StringIO(xsl)) def test_wrong_stylesheet(kml_cta_rail_lines, xml_data_path): - xml_etree = pytest.importorskip("lxml.etree") + pytest.importorskip("lxml.etree") - xsl = xml_data_path / "flatten.xsl" + xsl = xml_data_path / "flatten_doesnt_exist.xsl" with pytest.raises( - xml_etree.XMLSyntaxError, - match=("Start tag expected, '<' not found"), + FileNotFoundError, match=r"\[Errno 2\] No such file or directory" ): read_xml(kml_cta_rail_lines, stylesheet=xsl) @@ -1359,20 +1346,11 @@ def test_stylesheet_with_etree(kml_cta_rail_lines, xsl_flatten_doc): read_xml(kml_cta_rail_lines, parser="etree", stylesheet=xsl_flatten_doc) -@pytest.mark.parametrize("val", ["", b""]) -def test_empty_stylesheet(val): +@pytest.mark.parametrize("val", [StringIO(""), BytesIO(b"")]) +def test_empty_stylesheet(val, kml_cta_rail_lines): lxml_etree = pytest.importorskip("lxml.etree") - - msg = ( - "Passing literal xml to 'read_xml' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object." - ) - kml = os.path.join("data", "xml", "cta_rail_lines.kml") - with pytest.raises(lxml_etree.XMLSyntaxError): - with tm.assert_produces_warning(FutureWarning, match=msg): - read_xml(kml, stylesheet=val) + read_xml(kml_cta_rail_lines, stylesheet=val) # ITERPARSE @@ -1910,7 +1888,7 @@ def test_online_stylesheet(): StringIO(xml), xpath=".//tr[td and position() <= 6]", names=["title", "artist"], - stylesheet=xsl, + stylesheet=StringIO(xsl), ) df_expected = DataFrame( From 1bf5e9c72b32959a6c93f426fba67b37b3cffab3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 8 Feb 2024 13:52:41 -0800 Subject: [PATCH 03/10] Add html --- pandas/io/html.py | 83 +++++++++++------------------------- pandas/tests/io/test_html.py | 10 ++--- 2 files changed, 27 insertions(+), 66 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index adcb78d3fb7d1..3197880f4eaaf 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -7,7 +7,9 @@ from __future__ import annotations from collections import abc +import errno import numbers +import os import re from re import Pattern from typing import ( @@ -15,7 +17,6 @@ Literal, cast, ) -import warnings from pandas._libs import lib from pandas.compat._optional import import_optional_dependency @@ -24,7 +25,6 @@ EmptyDataError, ) from pandas.util._decorators import doc -from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.common import is_list_like @@ -36,10 +36,7 @@ from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( - file_exists, get_handle, - is_file_like, - is_fsspec_url, is_url, stringify_path, validate_header_arg, @@ -134,21 +131,15 @@ def _read( ------- raw_text : str """ - text: str | bytes - if ( - is_url(obj) - or hasattr(obj, "read") - or (isinstance(obj, str) and file_exists(obj)) - ): + try: with get_handle( obj, "r", encoding=encoding, storage_options=storage_options ) as handles: - text = handles.handle.read() - elif isinstance(obj, (str, bytes)): - text = obj - else: - raise TypeError(f"Cannot read object of type '{type(obj).__name__}'") - return text + return handles.handle.read() + except OSError as err: + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}" + ) from err class _HtmlFrameParser: @@ -158,7 +149,7 @@ class _HtmlFrameParser: Parameters ---------- io : str or file-like - This can be either a string of raw HTML, a valid URL using the HTTP, + This can be either a string path, a valid URL using the HTTP, FTP, or FILE protocols or a file-like object. match : str or regex @@ -780,36 +771,26 @@ def _build_doc(self): from lxml.etree import XMLSyntaxError from lxml.html import ( HTMLParser, - fromstring, parse, ) parser = HTMLParser(recover=True, encoding=self.encoding) - try: - if is_url(self.io): - with get_handle( - self.io, "r", storage_options=self.storage_options - ) as f: - r = parse(f.handle, parser=parser) - else: - # try to parse the input in the simplest way - r = parse(self.io, parser=parser) + if is_url(self.io): + with get_handle(self.io, "r", storage_options=self.storage_options) as f: + r = parse(f.handle, parser=parser) + else: + # try to parse the input in the simplest way try: - r = r.getroot() - except AttributeError: - pass - except (UnicodeDecodeError, OSError) as e: - # if the input is a blob of html goop - if not is_url(self.io): - r = fromstring(self.io, parser=parser) - - try: - r = r.getroot() - except AttributeError: - pass - else: - raise e + r = parse(self.io, parser=parser) + except OSError as err: + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {self.io}" + ) from err + try: + r = r.getroot() + except AttributeError: + pass else: if not hasattr(r, "text_content"): raise XMLSyntaxError("no text parsed from document", 0, 0, 0) @@ -1059,7 +1040,7 @@ def read_html( io : str, path object, or file-like object String, path object (implementing ``os.PathLike[str]``), or file-like object implementing a string ``read()`` function. - The string can represent a URL or the HTML itself. Note that + The string can represent a URL. Note that lxml only accepts the http, ftp and file url protocols. If you have a URL that starts with ``'https'`` you might try removing the ``'s'``. @@ -1227,22 +1208,6 @@ def read_html( io = stringify_path(io) - if isinstance(io, str) and not any( - [ - is_file_like(io), - file_exists(io), - is_url(io), - is_fsspec_url(io), - ] - ): - warnings.warn( - "Passing literal html to 'read_html' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return _parse( flavor=flavor, io=io, diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 73044b8c24a53..2251fa20f0b63 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -112,13 +112,9 @@ def flavor_read_html(request): class TestReadHtml: def test_literal_html_deprecation(self, flavor_read_html): # GH 53785 - msg = ( - "Passing literal html to 'read_html' is deprecated and " - "will be removed in a future version. To read from a " - "literal string, wrap it in a 'StringIO' object." - ) + msg = r"\[Errno 2\] No such file or director" - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(FileNotFoundError, match=msg): flavor_read_html( """ @@ -1405,7 +1401,7 @@ def test_encode(self, html_encoding_file, flavor_read_html): try: with open(html_encoding_file, "rb") as fobj: from_string = flavor_read_html( - fobj.read(), encoding=encoding, index_col=0 + BytesIO(fobj.read()), encoding=encoding, index_col=0 ).pop() with open(html_encoding_file, "rb") as fobj: From 60b5d3f43c537d524c4029b8bf57d7666faae479 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 8 Feb 2024 14:04:54 -0800 Subject: [PATCH 04/10] Fix excel --- pandas/io/excel/_base.py | 23 +---------------------- pandas/tests/io/excel/test_readers.py | 13 +++---------- 2 files changed, 4 insertions(+), 32 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 1f272d0e09db8..cf9c3be97ee5c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -8,7 +8,6 @@ ) import datetime from functools import partial -from io import BytesIO import os from textwrap import fill from typing import ( @@ -94,7 +93,7 @@ Parameters ---------- -io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object +io : str, ExcelFile, xlrd.Book, path object, or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.xlsx``. @@ -552,10 +551,6 @@ def __init__( if engine_kwargs is None: engine_kwargs = {} - # First argument can also be bytes, so create a buffer - if isinstance(filepath_or_buffer, bytes): - filepath_or_buffer = BytesIO(filepath_or_buffer) - self.handles = IOHandles( handle=filepath_or_buffer, compression={"method": None} ) @@ -1405,9 +1400,6 @@ def inspect_excel_format( BadZipFile If resulting stream does not have an XLS signature and is not a valid zipfile. """ - if isinstance(content_or_path, bytes): - content_or_path = BytesIO(content_or_path) - with get_handle( content_or_path, "rb", storage_options=storage_options, is_text=False ) as handle: @@ -1526,19 +1518,6 @@ def __init__( if engine is not None and engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") - # First argument can also be bytes, so create a buffer - if isinstance(path_or_buffer, bytes): - path_or_buffer = BytesIO(path_or_buffer) - warnings.warn( - "Passing bytes to 'read_excel' is deprecated and " - "will be removed in a future version. To read from a " - "byte string, wrap it in a `BytesIO` object.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - # Could be a str, ExcelFile, Book, etc. - self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 708f01839a23c..e4a8791396ee2 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1448,17 +1448,10 @@ def test_euro_decimal_format(self, read_ext): class TestExcelFileRead: - def test_deprecate_bytes_input(self, engine, read_ext): + def test_raises_bytes_input(self, engine, read_ext): # GH 53830 - msg = ( - "Passing bytes to 'read_excel' is deprecated and " - "will be removed in a future version. To read from a " - "byte string, wrap it in a `BytesIO` object." - ) - - with tm.assert_produces_warning( - FutureWarning, match=msg, raise_on_extra_warnings=False - ): + msg = "Expected file path name or file-like object" + with pytest.raises(TypeError, match=msg): with open("test1" + read_ext, "rb") as f: pd.read_excel(f.read(), engine=engine) From 3e0ccb90b50e4642346fc7ba7cf3abf6366d393f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 8 Feb 2024 14:08:19 -0800 Subject: [PATCH 05/10] Add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2c39318fa28b3..f43cd7547e4b2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -103,6 +103,7 @@ Deprecations Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`) - All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`) - Removed :meth:`DataFrame.first` and :meth:`DataFrame.last` (:issue:`53710`) - Removed :meth:`DataFrameGroupby.fillna` and :meth:`SeriesGroupBy.fillna` (:issue:`55719`) From f9a986438cdc85bb294915a8df0915d1db6d79a5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 8 Feb 2024 14:59:51 -0800 Subject: [PATCH 06/10] Fix docs --- doc/source/user_guide/io.rst | 4 ++-- doc/source/whatsnew/v0.12.0.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 80572de91e0c7..a08315818366f 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3247,7 +3247,7 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``: """ - df = pd.read_xml(StringIO(xml), stylesheet=xsl) + df = pd.read_xml(StringIO(xml), stylesheet=StringIO(xsl)) df For very large XML files that can range in hundreds of megabytes to gigabytes, :func:`pandas.read_xml` @@ -3418,7 +3418,7 @@ Write an XML and transform with stylesheet: """ - print(geom_df.to_xml(stylesheet=xsl)) + print(geom_df.to_xml(stylesheet=StringIO(xsl))) XML Final Notes diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 59d104cb3e96c..c805758f85b35 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -201,12 +201,12 @@ IO enhancements You can use ``pd.read_html()`` to read the output from ``DataFrame.to_html()`` like so .. ipython:: python - :okwarning: + import io df = pd.DataFrame({"a": range(3), "b": list("abc")}) print(df) html = df.to_html() - alist = pd.read_html(html, index_col=0) + alist = pd.read_html(io.StringIO(html), index_col=0) print(df == alist[0]) Note that ``alist`` here is a Python ``list`` so ``pd.read_html()`` and From 1a71d9fc5e2dd722725c1095e67cf3f049d4c7d5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 8 Feb 2024 16:36:18 -0800 Subject: [PATCH 07/10] Adjust some tests --- pandas/io/html.py | 8 +++++--- pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/io/xml/test_xml.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 3197880f4eaaf..b4f6a5508726b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -137,9 +137,11 @@ def _read( ) as handles: return handles.handle.read() except OSError as err: - raise FileNotFoundError( - f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}" - ) from err + if not is_url(obj): + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}" + ) from err + raise class _HtmlFrameParser: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index b5e197303d12c..9f4c7bdc46067 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -43,7 +43,7 @@ def test_literal_json_raises(): {"a": 5, "b": 6} {"a": 7, "b": 8}""" - msg = r"\[Errno 2\] No such file or directory" + msg = r".*No such file or directory" with pytest.raises(FileNotFoundError, match=msg): read_json(jsonl, lines=False) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 2f5eca5e1e353..c0800da37dca5 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -250,7 +250,7 @@ def test_literal_xml_raises(): # GH 53809 pytest.importorskip("lxml") - msg = r"\[Errno 2\] No such file or directory" + msg = r".*No such file or directory" with pytest.raises(FileNotFoundError, match=msg): read_xml(xml_default_nmsp) From f84ba8661582acbc27be64cb103558231fd8969c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 8 Feb 2024 17:44:06 -0800 Subject: [PATCH 08/10] Ignore typing for now --- pandas/io/xml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 24ab111215218..2f5a1e9f5a4bb 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -669,7 +669,7 @@ def get_data_from_filepath( encoding: str | None, compression: CompressionOptions, storage_options: StorageOptions, -) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]: +): """ Extract raw XML data. @@ -677,7 +677,7 @@ def get_data_from_filepath( 1. filepath (string-like) 2. file-like object (e.g. open file object, StringIO) """ - filepath_or_buffer = stringify_path(filepath_or_buffer) + filepath_or_buffer = stringify_path(filepath_or_buffer) # type: ignore[arg-type] with get_handle( filepath_or_buffer, "r", From d225f4da692fba2984287014df30af59e9017695 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 9 Feb 2024 10:25:31 -0800 Subject: [PATCH 09/10] Windows compat --- pandas/io/json/_json.py | 16 +++------------- pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/io/xml/test_xml.py | 4 ++-- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index a75e4e6aa9600..91d0732c52cce 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -53,8 +53,6 @@ from pandas.io.common import ( IOHandles, dedup_names, - extension_to_compression, - file_exists, get_handle, is_potential_multi_index, stringify_path, @@ -909,17 +907,9 @@ def _get_data_from_filepath(self, filepath_or_buffer): errors=self.encoding_errors, ) except OSError as err: - if ( - isinstance(filepath_or_buffer, str) - and filepath_or_buffer.lower().endswith( - (".json",) + tuple(f".json{c}" for c in extension_to_compression) - ) - and not file_exists(filepath_or_buffer) - ): - raise FileNotFoundError( - f"File {filepath_or_buffer} does not exist" - ) from err - raise + raise FileNotFoundError( + f"File {filepath_or_buffer} does not exist" + ) from err filepath_or_buffer = self.handles.handle return filepath_or_buffer diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9f4c7bdc46067..8eadbb9aac3c3 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -43,7 +43,7 @@ def test_literal_json_raises(): {"a": 5, "b": 6} {"a": 7, "b": 8}""" - msg = r".*No such file or directory" + msg = r".* does not exist" with pytest.raises(FileNotFoundError, match=msg): read_json(jsonl, lines=False) diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index c0800da37dca5..97599722cb93f 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -250,9 +250,9 @@ def test_literal_xml_raises(): # GH 53809 pytest.importorskip("lxml") - msg = r".*No such file or directory" + msg = "|".join([r".*No such file or directory", r".*Invalid argument"]) - with pytest.raises(FileNotFoundError, match=msg): + with pytest.raises((FileNotFoundError, OSError), match=msg): read_xml(xml_default_nmsp) From 3b1778fbaccab4aab8fa8aeace6b05f22801a755 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 9 Feb 2024 12:14:24 -0800 Subject: [PATCH 10/10] Add generaltypeissues --- pandas/io/json/_json.py | 2 +- pandas/io/xml.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 91d0732c52cce..de246a2757409 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -524,7 +524,7 @@ def read_json( Parameters ---------- - path_or_buf : a string path, path object or file-like object + path_or_buf : a str path, path object or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 2f5a1e9f5a4bb..2038733bee808 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -678,8 +678,8 @@ def get_data_from_filepath( 2. file-like object (e.g. open file object, StringIO) """ filepath_or_buffer = stringify_path(filepath_or_buffer) # type: ignore[arg-type] - with get_handle( - filepath_or_buffer, + with get_handle( # pyright: ignore[reportGeneralTypeIssues] + filepath_or_buffer, # pyright: ignore[reportGeneralTypeIssues] "r", encoding=encoding, compression=compression,