From f909e0d5af8066da804b02a764e74019eae537da Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 8 Feb 2024 10:43:13 -0800
Subject: [PATCH 01/10] Ban literal json in read_json
---
pandas/io/json/_json.py | 87 ++++++++---------------------
pandas/tests/io/json/test_pandas.py | 46 +++++----------
2 files changed, 35 insertions(+), 98 deletions(-)
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index cea34cdfb0b9d..a75e4e6aa9600 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -5,7 +5,6 @@
abstractmethod,
)
from collections import abc
-from io import StringIO
from itertools import islice
from typing import (
TYPE_CHECKING,
@@ -30,7 +29,6 @@
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
from pandas.util._validators import check_dtype_backend
from pandas.core.dtypes.common import (
@@ -58,9 +56,7 @@
extension_to_compression,
file_exists,
get_handle,
- is_fsspec_url,
is_potential_multi_index,
- is_url,
stringify_path,
)
from pandas.io.json._normalize import convert_to_line_delimits
@@ -530,7 +526,7 @@ def read_json(
Parameters
----------
- path_or_buf : a valid JSON str, path object or file-like object
+ path_or_buf : a string path, path object or file-like object
Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
@@ -879,18 +875,6 @@ def __init__(
self.nrows = validate_integer("nrows", self.nrows, 0)
if not self.lines:
raise ValueError("nrows can only be passed if lines=True")
- if (
- isinstance(filepath_or_buffer, str)
- and not self.lines
- and "\n" in filepath_or_buffer
- ):
- warnings.warn(
- "Passing literal json to 'read_json' is deprecated and "
- "will be removed in a future version. To read from a "
- "literal string, wrap it in a 'StringIO' object.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
if self.engine == "pyarrow":
if not self.lines:
raise ValueError(
@@ -900,45 +884,22 @@ def __init__(
self.data = filepath_or_buffer
elif self.engine == "ujson":
data = self._get_data_from_filepath(filepath_or_buffer)
- self.data = self._preprocess_data(data)
-
- def _preprocess_data(self, data):
- """
- At this point, the data either has a `read` attribute (e.g. a file
- object or a StringIO) or is a string that is a JSON document.
-
- If self.chunksize, we prepare the data for the `__next__` method.
- Otherwise, we read it into memory for the `read` method.
- """
- if hasattr(data, "read") and not (self.chunksize or self.nrows):
- with self:
- data = data.read()
- if not hasattr(data, "read") and (self.chunksize or self.nrows):
- data = StringIO(data)
-
- return data
+ # If self.chunksize, we prepare the data for the `__next__` method.
+ # Otherwise, we read it into memory for the `read` method.
+ if not (self.chunksize or self.nrows):
+ with self:
+ self.data = data.read()
+ else:
+ self.data = data
def _get_data_from_filepath(self, filepath_or_buffer):
"""
- The function read_json accepts three input types:
+ The function read_json accepts two input types:
1. filepath (string-like)
2. file-like object (e.g. open file object, StringIO)
- 3. JSON string
-
- This method turns (1) into (2) to simplify the rest of the processing.
- It returns input types (2) and (3) unchanged.
-
- It raises FileNotFoundError if the input is a string ending in
- one of .json, .json.gz, .json.bz2, etc. but no such file exists.
"""
- # if it is a string but the file does not exist, it might be a JSON string
filepath_or_buffer = stringify_path(filepath_or_buffer)
- if (
- not isinstance(filepath_or_buffer, str)
- or is_url(filepath_or_buffer)
- or is_fsspec_url(filepath_or_buffer)
- or file_exists(filepath_or_buffer)
- ):
+ try:
self.handles = get_handle(
filepath_or_buffer,
"r",
@@ -947,23 +908,19 @@ def _get_data_from_filepath(self, filepath_or_buffer):
storage_options=self.storage_options,
errors=self.encoding_errors,
)
- filepath_or_buffer = self.handles.handle
- elif (
- isinstance(filepath_or_buffer, str)
- and filepath_or_buffer.lower().endswith(
- (".json",) + tuple(f".json{c}" for c in extension_to_compression)
- )
- and not file_exists(filepath_or_buffer)
- ):
- raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")
- else:
- warnings.warn(
- "Passing literal json to 'read_json' is deprecated and "
- "will be removed in a future version. To read from a "
- "literal string, wrap it in a 'StringIO' object.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
+ except OSError as err:
+ if (
+ isinstance(filepath_or_buffer, str)
+ and filepath_or_buffer.lower().endswith(
+ (".json",) + tuple(f".json{c}" for c in extension_to_compression)
+ )
+ and not file_exists(filepath_or_buffer)
+ ):
+ raise FileNotFoundError(
+ f"File {filepath_or_buffer} does not exist"
+ ) from err
+ raise
+ filepath_or_buffer = self.handles.handle
return filepath_or_buffer
def _combine_lines(self, lines) -> str:
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 9a263e8bc5f44..b5e197303d12c 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -36,49 +36,29 @@
from pandas.io.json import ujson_dumps
-def test_literal_json_deprecation():
+def test_literal_json_raises():
# PR 53409
- expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
-
jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
- msg = (
- "Passing literal json to 'read_json' is deprecated and "
- "will be removed in a future version. To read from a "
- "literal string, wrap it in a 'StringIO' object."
- )
+ msg = r"\[Errno 2\] No such file or directory"
- with tm.assert_produces_warning(FutureWarning, match=msg):
- try:
- read_json(jsonl, lines=False)
- except ValueError:
- pass
+ with pytest.raises(FileNotFoundError, match=msg):
+ read_json(jsonl, lines=False)
- with tm.assert_produces_warning(FutureWarning, match=msg):
- read_json(expected.to_json(), lines=False)
+ with pytest.raises(FileNotFoundError, match=msg):
+ read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
- with tm.assert_produces_warning(FutureWarning, match=msg):
- result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
- tm.assert_frame_equal(result, expected)
+ with pytest.raises(FileNotFoundError, match=msg):
+ read_json(
+ '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n',
+ lines=False,
+ )
- with tm.assert_produces_warning(FutureWarning, match=msg):
- try:
- result = read_json(
- '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n',
- lines=False,
- )
- except ValueError:
- pass
-
- with tm.assert_produces_warning(FutureWarning, match=msg):
- try:
- result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=False)
- except ValueError:
- pass
- tm.assert_frame_equal(result, expected)
+ with pytest.raises(FileNotFoundError, match=msg):
+ read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=False)
def assert_json_roundtrip_equal(result, expected, orient):
From ddcf0344b5d83f76c98ae26372ebf8c2ef141471 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 8 Feb 2024 13:34:04 -0800
Subject: [PATCH 02/10] address xml
---
pandas/io/formats/xml.py | 7 +--
pandas/io/xml.py | 75 ++++++++----------------------
pandas/tests/io/xml/test_to_xml.py | 37 +++++++--------
pandas/tests/io/xml/test_xml.py | 72 ++++++++++------------------
4 files changed, 62 insertions(+), 129 deletions(-)
diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py
index 775f1842692cb..e55561902d4d3 100644
--- a/pandas/io/formats/xml.py
+++ b/pandas/io/formats/xml.py
@@ -24,10 +24,7 @@
from pandas.core.shared_docs import _shared_docs
from pandas.io.common import get_handle
-from pandas.io.xml import (
- get_data_from_filepath,
- preprocess_data,
-)
+from pandas.io.xml import get_data_from_filepath
if TYPE_CHECKING:
from pandas._typing import (
@@ -548,7 +545,7 @@ def _transform_doc(self) -> bytes:
storage_options=self.storage_options,
)
- with preprocess_data(handle_data) as xml_data:
+ with handle_data as xml_data:
curr_parser = XMLParser(encoding=self.encoding)
if isinstance(xml_data, io.StringIO):
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
index 97bf520a77611..24ab111215218 100644
--- a/pandas/io/xml.py
+++ b/pandas/io/xml.py
@@ -11,7 +11,6 @@
Any,
Callable,
)
-import warnings
from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
@@ -20,7 +19,6 @@
ParserError,
)
from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
from pandas.util._validators import check_dtype_backend
from pandas.core.dtypes.common import is_list_like
@@ -28,10 +26,8 @@
from pandas.core.shared_docs import _shared_docs
from pandas.io.common import (
- file_exists,
get_handle,
infer_compression,
- is_file_like,
is_fsspec_url,
is_url,
stringify_path,
@@ -528,7 +524,7 @@ def _parse_doc(
storage_options=self.storage_options,
)
- with preprocess_data(handle_data) as xml_data:
+ with handle_data as xml_data:
curr_parser = XMLParser(encoding=self.encoding)
document = parse(xml_data, parser=curr_parser)
@@ -635,7 +631,7 @@ def _parse_doc(
storage_options=self.storage_options,
)
- with preprocess_data(handle_data) as xml_data:
+ with handle_data as xml_data:
curr_parser = XMLParser(encoding=self.encoding)
if isinstance(xml_data, io.StringIO):
@@ -677,40 +673,23 @@ def get_data_from_filepath(
"""
Extract raw XML data.
- The method accepts three input types:
+ The method accepts two input types:
1. filepath (string-like)
2. file-like object (e.g. open file object, StringIO)
- 3. XML string or bytes
-
- This method turns (1) into (2) to simplify the rest of the processing.
- It returns input types (2) and (3) unchanged.
"""
- if not isinstance(filepath_or_buffer, bytes):
- filepath_or_buffer = stringify_path(filepath_or_buffer)
-
- if (
- isinstance(filepath_or_buffer, str)
- and not filepath_or_buffer.startswith((" io.StringIO | io.BytesIO:
@@ -790,22 +769,6 @@ def _parse(
p: _EtreeFrameParser | _LxmlFrameParser
- if isinstance(path_or_buffer, str) and not any(
- [
- is_file_like(path_or_buffer),
- file_exists(path_or_buffer),
- is_url(path_or_buffer),
- is_fsspec_url(path_or_buffer),
- ]
- ):
- warnings.warn(
- "Passing literal xml to 'read_xml' is deprecated and "
- "will be removed in a future version. To read from a "
- "literal string, wrap it in a 'StringIO' object.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
-
if parser == "lxml":
lxml = import_optional_dependency("lxml.etree", errors="ignore")
@@ -894,8 +857,8 @@ def read_xml(
----------
path_or_buffer : str, path object, or file-like object
String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a ``read()`` function. The string can be any valid XML
- string or a path. The string can further be a URL. Valid URL schemes
+ object implementing a ``read()`` function. The string can be a path.
+ The string can further be a URL. Valid URL schemes
include http, ftp, s3, and file.
.. deprecated:: 2.1.0
@@ -969,7 +932,7 @@ def read_xml(
and ability to use XSLT stylesheet are supported.
stylesheet : str, path object or file-like object
- A URL, file-like object, or a raw string containing an XSLT script.
+ A URL, file-like object, or a string path to a file containing an XSLT script.
This stylesheet should flatten complex, deeply nested XML documents
for easier parsing. To use this feature you must have ``lxml`` module
installed and specify 'lxml' as ``parser``. The ``xpath`` must
diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py
index a123f6dd52c08..62cc33376c630 100644
--- a/pandas/tests/io/xml/test_to_xml.py
+++ b/pandas/tests/io/xml/test_to_xml.py
@@ -1034,26 +1034,23 @@ def test_stylesheet_buffered_reader(xsl_row_field_output, mode, geom_df):
with open(
xsl_row_field_output, mode, encoding="utf-8" if mode == "r" else None
) as f:
- xsl_obj = f.read()
-
- output = geom_df.to_xml(stylesheet=xsl_obj)
+ output = geom_df.to_xml(stylesheet=f)
assert output == xsl_expected
def test_stylesheet_wrong_path(geom_df):
- lxml_etree = pytest.importorskip("lxml.etree")
+ pytest.importorskip("lxml.etree")
- xsl = os.path.join("data", "xml", "row_field_output.xslt")
+ xsl = os.path.join("does", "not", "exist", "row_field_output.xslt")
with pytest.raises(
- lxml_etree.XMLSyntaxError,
- match=("Start tag expected, '<' not found"),
+ FileNotFoundError, match=r"\[Errno 2\] No such file or director"
):
geom_df.to_xml(stylesheet=xsl)
-@pytest.mark.parametrize("val", ["", b""])
+@pytest.mark.parametrize("val", [StringIO(""), BytesIO(b"")])
def test_empty_string_stylesheet(val, geom_df):
lxml_etree = pytest.importorskip("lxml.etree")
@@ -1095,9 +1092,9 @@ def test_incorrect_xsl_syntax(geom_df):
"""
with pytest.raises(
- lxml_etree.XMLSyntaxError, match=("Opening and ending tag mismatch")
+ lxml_etree.XMLSyntaxError, match="Opening and ending tag mismatch"
):
- geom_df.to_xml(stylesheet=xsl)
+ geom_df.to_xml(stylesheet=StringIO(xsl))
def test_incorrect_xsl_eval(geom_df):
@@ -1124,8 +1121,8 @@ def test_incorrect_xsl_eval(geom_df):
"""
- with pytest.raises(lxml_etree.XSLTParseError, match=("failed to compile")):
- geom_df.to_xml(stylesheet=xsl)
+ with pytest.raises(lxml_etree.XSLTParseError, match="failed to compile"):
+ geom_df.to_xml(stylesheet=StringIO(xsl))
def test_incorrect_xsl_apply(geom_df):
@@ -1143,9 +1140,9 @@ def test_incorrect_xsl_apply(geom_df):
"""
- with pytest.raises(lxml_etree.XSLTApplyError, match=("Cannot resolve URI")):
+ with pytest.raises(lxml_etree.XSLTApplyError, match="Cannot resolve URI"):
with tm.ensure_clean("test.xml") as path:
- geom_df.to_xml(path, stylesheet=xsl)
+ geom_df.to_xml(path, stylesheet=StringIO(xsl))
def test_stylesheet_with_etree(geom_df):
@@ -1160,10 +1157,8 @@ def test_stylesheet_with_etree(geom_df):
"""
- with pytest.raises(
- ValueError, match=("To use stylesheet, you need lxml installed")
- ):
- geom_df.to_xml(parser="etree", stylesheet=xsl)
+ with pytest.raises(ValueError, match="To use stylesheet, you need lxml installed"):
+ geom_df.to_xml(parser="etree", stylesheet=StringIO(xsl))
def test_style_to_csv(geom_df):
@@ -1190,7 +1185,7 @@ def test_style_to_csv(geom_df):
if out_csv is not None:
out_csv = out_csv.strip()
- out_xml = geom_df.to_xml(stylesheet=xsl)
+ out_xml = geom_df.to_xml(stylesheet=StringIO(xsl))
assert out_csv == out_xml
@@ -1224,7 +1219,7 @@ def test_style_to_string(geom_df):
"""
out_str = geom_df.to_string()
- out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=xsl)
+ out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=StringIO(xsl))
assert out_xml == out_str
@@ -1269,7 +1264,7 @@ def test_style_to_json(geom_df):
"""
out_json = geom_df.to_json()
- out_xml = geom_df.to_xml(stylesheet=xsl)
+ out_xml = geom_df.to_xml(stylesheet=StringIO(xsl))
assert out_json == out_xml
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index 0ee3ec85ab6c6..2f5eca5e1e353 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -247,16 +247,12 @@
)
-def test_literal_xml_deprecation():
+def test_literal_xml_raises():
# GH 53809
pytest.importorskip("lxml")
- msg = (
- "Passing literal xml to 'read_xml' is deprecated and "
- "will be removed in a future version. To read from a "
- "literal string, wrap it in a 'StringIO' object."
- )
+ msg = r"\[Errno 2\] No such file or directory"
- with tm.assert_produces_warning(FutureWarning, match=msg):
+ with pytest.raises(FileNotFoundError, match=msg):
read_xml(xml_default_nmsp)
@@ -490,16 +486,10 @@ def test_empty_string_etree(val):
def test_wrong_file_path(parser):
- msg = (
- "Passing literal xml to 'read_xml' is deprecated and "
- "will be removed in a future version. To read from a "
- "literal string, wrap it in a 'StringIO' object."
- )
- filename = os.path.join("data", "html", "books.xml")
+ filename = os.path.join("does", "not", "exist", "books.xml")
with pytest.raises(
- FutureWarning,
- match=msg,
+ FileNotFoundError, match=r"\[Errno 2\] No such file or directory"
):
read_xml(filename, parser=parser)
@@ -1197,14 +1187,12 @@ def test_stylesheet_io(kml_cta_rail_lines, xsl_flatten_doc, mode):
def test_stylesheet_buffered_reader(kml_cta_rail_lines, xsl_flatten_doc, mode):
pytest.importorskip("lxml")
with open(xsl_flatten_doc, mode, encoding="utf-8" if mode == "r" else None) as f:
- xsl_obj = f.read()
-
- df_style = read_xml(
- kml_cta_rail_lines,
- xpath=".//k:Placemark",
- namespaces={"k": "http://www.opengis.net/kml/2.2"},
- stylesheet=xsl_obj,
- )
+ df_style = read_xml(
+ kml_cta_rail_lines,
+ xpath=".//k:Placemark",
+ namespaces={"k": "http://www.opengis.net/kml/2.2"},
+ stylesheet=f,
+ )
tm.assert_frame_equal(df_kml, df_style)
@@ -1233,7 +1221,7 @@ def test_style_charset():
"""
df_orig = read_xml(StringIO(xml))
- df_style = read_xml(StringIO(xml), stylesheet=xsl)
+ df_style = read_xml(StringIO(xml), stylesheet=StringIO(xsl))
tm.assert_frame_equal(df_orig, df_style)
@@ -1271,9 +1259,9 @@ def test_incorrect_xsl_syntax(kml_cta_rail_lines):
"""
with pytest.raises(
- lxml_etree.XMLSyntaxError, match=("Extra content at the end of the document")
+ lxml_etree.XMLSyntaxError, match="Extra content at the end of the document"
):
- read_xml(kml_cta_rail_lines, stylesheet=xsl)
+ read_xml(kml_cta_rail_lines, stylesheet=StringIO(xsl))
def test_incorrect_xsl_eval(kml_cta_rail_lines):
@@ -1299,8 +1287,8 @@ def test_incorrect_xsl_eval(kml_cta_rail_lines):