Skip to content

Commit 862d431

Browse files
authored
CLN: Remove literal string/bytes support in IO readers (#57307)
1 parent 0764e9c commit 862d431

File tree

13 files changed

+128
-332
lines changed

13 files changed

+128
-332
lines changed

doc/source/user_guide/io.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -3247,7 +3247,7 @@ output (as shown below for demonstration) for easier parse into ``DataFrame``:
32473247
</row>
32483248
</response>"""
32493249
3250-
df = pd.read_xml(StringIO(xml), stylesheet=xsl)
3250+
df = pd.read_xml(StringIO(xml), stylesheet=StringIO(xsl))
32513251
df
32523252
32533253
For very large XML files that can range from hundreds of megabytes to gigabytes, :func:`pandas.read_xml`
@@ -3418,7 +3418,7 @@ Write an XML and transform with stylesheet:
34183418
</xsl:template>
34193419
</xsl:stylesheet>"""
34203420
3421-
print(geom_df.to_xml(stylesheet=xsl))
3421+
print(geom_df.to_xml(stylesheet=StringIO(xsl)))
34223422
34233423
34243424
XML Final Notes

doc/source/whatsnew/v0.12.0.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -201,12 +201,12 @@ IO enhancements
201201
You can use ``pd.read_html()`` to read the output from ``DataFrame.to_html()`` like so
202202

203203
.. ipython:: python
204-
:okwarning:
205204
205+
import io
206206
df = pd.DataFrame({"a": range(3), "b": list("abc")})
207207
print(df)
208208
html = df.to_html()
209-
alist = pd.read_html(html, index_col=0)
209+
alist = pd.read_html(io.StringIO(html), index_col=0)
210210
print(df == alist[0])
211211
212212
Note that ``alist`` here is a Python ``list`` so ``pd.read_html()`` and

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ Deprecations
103103

104104
Removal of prior version deprecations/changes
105105
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
106+
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept a raw string or bytes representation of the data. Such data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
106107
- All arguments except the first ``path``-like argument in IO writers are now keyword only (:issue:`54229`)
107108
- Changed the default value of ``observed`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby` to ``True`` (:issue:`51811`)
108109
- Removed ``DataFrame.bool`` and ``Series.bool`` (:issue:`51756`)

pandas/io/excel/_base.py

+1-22
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
)
99
import datetime
1010
from functools import partial
11-
from io import BytesIO
1211
import os
1312
from textwrap import fill
1413
from typing import (
@@ -94,7 +93,7 @@
9493
9594
Parameters
9695
----------
97-
io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
96+
io : str, ExcelFile, xlrd.Book, path object, or file-like object
9897
Any valid string path is acceptable. The string could be a URL. Valid
9998
URL schemes include http, ftp, s3, and file. For file URLs, a host is
10099
expected. A local file could be: ``file://localhost/path/to/table.xlsx``.
@@ -552,10 +551,6 @@ def __init__(
552551
if engine_kwargs is None:
553552
engine_kwargs = {}
554553

555-
# First argument can also be bytes, so create a buffer
556-
if isinstance(filepath_or_buffer, bytes):
557-
filepath_or_buffer = BytesIO(filepath_or_buffer)
558-
559554
self.handles = IOHandles(
560555
handle=filepath_or_buffer, compression={"method": None}
561556
)
@@ -1405,9 +1400,6 @@ def inspect_excel_format(
14051400
BadZipFile
14061401
If resulting stream does not have an XLS signature and is not a valid zipfile.
14071402
"""
1408-
if isinstance(content_or_path, bytes):
1409-
content_or_path = BytesIO(content_or_path)
1410-
14111403
with get_handle(
14121404
content_or_path, "rb", storage_options=storage_options, is_text=False
14131405
) as handle:
@@ -1526,19 +1518,6 @@ def __init__(
15261518
if engine is not None and engine not in self._engines:
15271519
raise ValueError(f"Unknown engine: {engine}")
15281520

1529-
# First argument can also be bytes, so create a buffer
1530-
if isinstance(path_or_buffer, bytes):
1531-
path_or_buffer = BytesIO(path_or_buffer)
1532-
warnings.warn(
1533-
"Passing bytes to 'read_excel' is deprecated and "
1534-
"will be removed in a future version. To read from a "
1535-
"byte string, wrap it in a `BytesIO` object.",
1536-
FutureWarning,
1537-
stacklevel=find_stack_level(),
1538-
)
1539-
1540-
# Could be a str, ExcelFile, Book, etc.
1541-
self.io = path_or_buffer
15421521
# Always a string
15431522
self._io = stringify_path(path_or_buffer)
15441523

pandas/io/formats/xml.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@
2424
from pandas.core.shared_docs import _shared_docs
2525

2626
from pandas.io.common import get_handle
27-
from pandas.io.xml import (
28-
get_data_from_filepath,
29-
preprocess_data,
30-
)
27+
from pandas.io.xml import get_data_from_filepath
3128

3229
if TYPE_CHECKING:
3330
from pandas._typing import (
@@ -548,7 +545,7 @@ def _transform_doc(self) -> bytes:
548545
storage_options=self.storage_options,
549546
)
550547

551-
with preprocess_data(handle_data) as xml_data:
548+
with handle_data as xml_data:
552549
curr_parser = XMLParser(encoding=self.encoding)
553550

554551
if isinstance(xml_data, io.StringIO):

pandas/io/html.py

+26-59
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,16 @@
77
from __future__ import annotations
88

99
from collections import abc
10+
import errno
1011
import numbers
12+
import os
1113
import re
1214
from re import Pattern
1315
from typing import (
1416
TYPE_CHECKING,
1517
Literal,
1618
cast,
1719
)
18-
import warnings
1920

2021
from pandas._libs import lib
2122
from pandas.compat._optional import import_optional_dependency
@@ -24,7 +25,6 @@
2425
EmptyDataError,
2526
)
2627
from pandas.util._decorators import doc
27-
from pandas.util._exceptions import find_stack_level
2828
from pandas.util._validators import check_dtype_backend
2929

3030
from pandas.core.dtypes.common import is_list_like
@@ -36,10 +36,7 @@
3636
from pandas.core.shared_docs import _shared_docs
3737

3838
from pandas.io.common import (
39-
file_exists,
4039
get_handle,
41-
is_file_like,
42-
is_fsspec_url,
4340
is_url,
4441
stringify_path,
4542
validate_header_arg,
@@ -134,21 +131,17 @@ def _read(
134131
-------
135132
raw_text : str
136133
"""
137-
text: str | bytes
138-
if (
139-
is_url(obj)
140-
or hasattr(obj, "read")
141-
or (isinstance(obj, str) and file_exists(obj))
142-
):
134+
try:
143135
with get_handle(
144136
obj, "r", encoding=encoding, storage_options=storage_options
145137
) as handles:
146-
text = handles.handle.read()
147-
elif isinstance(obj, (str, bytes)):
148-
text = obj
149-
else:
150-
raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
151-
return text
138+
return handles.handle.read()
139+
except OSError as err:
140+
if not is_url(obj):
141+
raise FileNotFoundError(
142+
f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}"
143+
) from err
144+
raise
152145

153146

154147
class _HtmlFrameParser:
@@ -158,7 +151,7 @@ class _HtmlFrameParser:
158151
Parameters
159152
----------
160153
io : str or file-like
161-
This can be either a string of raw HTML, a valid URL using the HTTP,
154+
This can be either a string path, a valid URL using the HTTP,
162155
FTP, or FILE protocols or a file-like object.
163156
164157
match : str or regex
@@ -780,36 +773,26 @@ def _build_doc(self):
780773
from lxml.etree import XMLSyntaxError
781774
from lxml.html import (
782775
HTMLParser,
783-
fromstring,
784776
parse,
785777
)
786778

787779
parser = HTMLParser(recover=True, encoding=self.encoding)
788780

789-
try:
790-
if is_url(self.io):
791-
with get_handle(
792-
self.io, "r", storage_options=self.storage_options
793-
) as f:
794-
r = parse(f.handle, parser=parser)
795-
else:
796-
# try to parse the input in the simplest way
797-
r = parse(self.io, parser=parser)
781+
if is_url(self.io):
782+
with get_handle(self.io, "r", storage_options=self.storage_options) as f:
783+
r = parse(f.handle, parser=parser)
784+
else:
785+
# try to parse the input in the simplest way
798786
try:
799-
r = r.getroot()
800-
except AttributeError:
801-
pass
802-
except (UnicodeDecodeError, OSError) as e:
803-
# if the input is a blob of html goop
804-
if not is_url(self.io):
805-
r = fromstring(self.io, parser=parser)
806-
807-
try:
808-
r = r.getroot()
809-
except AttributeError:
810-
pass
811-
else:
812-
raise e
787+
r = parse(self.io, parser=parser)
788+
except OSError as err:
789+
raise FileNotFoundError(
790+
f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {self.io}"
791+
) from err
792+
try:
793+
r = r.getroot()
794+
except AttributeError:
795+
pass
813796
else:
814797
if not hasattr(r, "text_content"):
815798
raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
@@ -1059,7 +1042,7 @@ def read_html(
10591042
io : str, path object, or file-like object
10601043
String, path object (implementing ``os.PathLike[str]``), or file-like
10611044
object implementing a string ``read()`` function.
1062-
The string can represent a URL or the HTML itself. Note that
1045+
The string can represent a URL. Note that
10631046
lxml only accepts the http, ftp and file url protocols. If you have a
10641047
URL that starts with ``'https'`` you might try removing the ``'s'``.
10651048
@@ -1227,22 +1210,6 @@ def read_html(
12271210

12281211
io = stringify_path(io)
12291212

1230-
if isinstance(io, str) and not any(
1231-
[
1232-
is_file_like(io),
1233-
file_exists(io),
1234-
is_url(io),
1235-
is_fsspec_url(io),
1236-
]
1237-
):
1238-
warnings.warn(
1239-
"Passing literal html to 'read_html' is deprecated and "
1240-
"will be removed in a future version. To read from a "
1241-
"literal string, wrap it in a 'StringIO' object.",
1242-
FutureWarning,
1243-
stacklevel=find_stack_level(),
1244-
)
1245-
12461213
return _parse(
12471214
flavor=flavor,
12481215
io=io,

0 commit comments

Comments
 (0)